I am a beginner with Python and I am working with Scrapy. I have used XmlItemExporter to export my scraped data to an XML file, but the only thing I get in the XML file is an empty </item>.
My items.py is as follows:
from scrapy.item import Item, Field

class WorkwithitemsItem(Item):
    title = Field()
    link = Field()
    publish = Field()
    description = Field()
And the spider looks like this:
from scrapy import log
from scrapy.spider import BaseSpider
from scrapy.selector import Selector

from workwithitems.items import WorkwithitemsItem

class MySpider(BaseSpider):
    name = 'spidey'
    allowed_domains = ['ekantipur.com']
    start_urls = [
        'http://www.ekantipur.com/en/rss',
    ]

    def parse(self, response):
        self.log('A response from %s just arrived!' % response.url)
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        link = sel.xpath('//link/text()').extract()
        publish = sel.xpath('//pubDate/text()').extract()
        description = sel.xpath('//description/text()').extract()
        WorkwithitemsItem(title=title[2:], link=link[2:],
                          publish=publish, description=description[1:])
And the pipelines.py is:
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter

class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
The settings.py is:
BOT_NAME = 'workwithitems'
SPIDER_MODULES = ['workwithitems.spiders']
NEWSPIDER_MODULE = 'workwithitems.spiders'
FEED_EXPORTERS_BASE = {
    'xml': 'scrapy.contrib.exporter.XmlItemExporter',
}
ITEM_PIPELINES = {
    'workwithitems.pipelines.XmlExportPipeline': 800,
}
I can't figure out where my problem is.
OK! I found the problem. What I did was simply add a 'return' on the last line of the spider:
return WorkwithitemsItem(title=title[2:], link=link[2:],
                         publish=publish, description=description[1:]
                         )
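For reference, yielding the item works just as well here and also lets parse emit more than one item; a minimal sketch of the equivalent last line (same fields as above, not the code I actually ran):

    yield WorkwithitemsItem(title=title[2:], link=link[2:],
                            publish=publish, description=description[1:])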
I'm trying to put all the data that I'm scraping into MongoDB to monitor property prices.
I've already run a lot of tests, but it's not working.
I'll put the code here in case anyone can help. Please.
__init__.py
import scrapy
from realstatedata.items import RealstatedataItem

class RsdataSpider(scrapy.Spider):
    name = 'realstatedata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        nextpageurl = response.xpath('//a[@title="Próxima página"]/@href')
        yield from self.scrape(response)

        if nextpageurl:
            path = nextpageurl.extract_first()
            #print(path)
            # Got #pagina=2 => Replace with ?pagina=2
            path = '?' + path[1:]
            #print(path)
            nextpage = response.urljoin(path)
            print("Found url: {}".format(nextpage))
            yield scrapy.Request(nextpage)

    def scrape(self, response):
        for resource in response.xpath('//article[@class="property-card__container js-property-card"]/..'):
            item = RealstatedataItem()
            item['description'] = resource.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()').extract_first()
            item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            item['price_cond'] = resource.xpath('.//strong[@class="js-condo-price"]/text()').extract_first()
            item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()

            yield item
settings.py
BOT_NAME = 'realstatedata'
SPIDER_MODULES = ['realstatedata.spiders']
NEWSPIDER_MODULE = 'realstatedata.spiders'
ITEM_PIPELINES = ['realstatedata.pipelines.MongoPipeline', ]
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = "vivareal_db"
items.py
import scrapy

class RealstatedataItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    description = scrapy.Field()
    address = scrapy.Field()
    prop_area = scrapy.Field()
    prop_rooms = scrapy.Field()
    prop_bath = scrapy.Field()
    prop_parking = scrapy.Field()
    price_rent = scrapy.Field()
    price_cond = scrapy.Field()
    realstate_name = scrapy.Field()
pipelines.py
In this part of the code I've tried two different approaches, but neither works.
import pymongo
import logging

class MongoPipeline(object):

    collection_name = 'rent_properties'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    def process_item(self, item, spider):
        ## how to handle each post
        self.db[self.collection_name].insert(dict(item))
        logging.debug("Properties added to MongoDB")
        return item
Your setting for enabling the pipeline is wrong: ITEM_PIPELINES should be defined as a dict, not a list. As written, the pipeline is not loaded at all.
ITEM_PIPELINES = {
    "realstatedata.pipelines.MongoPipeline": 100,
}
The value in the dict represents the pipeline's priority (its order) when more than one pipeline is enabled.
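For example, with two pipelines enabled, items pass through the one with the lower number first. A small sketch of that ordering (CleanupPipeline here is a hypothetical second pipeline, used only to illustrate the priorities):

ITEM_PIPELINES = {
    "realstatedata.pipelines.MongoPipeline": 100,     # lower number: runs first
    "realstatedata.pipelines.CleanupPipeline": 300,   # hypothetical second pipeline: runs after
}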
I am working on crawling Google search results using Scrapy. This is the code, and it works well for getting search results.
GoogleBot.py:
import scrapy

class GoogleBotsSpider(scrapy.Spider):
    name = 'GoogleScrapyBot'
    allowed_domains = ['google.com']
    start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']

    def parse(self, response):
        item = {}
        all_page = response.xpath('//*[@id="main"]')
        for page in all_page:
            title = page.xpath('//*[@id="main"]/div/div/div/a/h3/div/text()').extract()
            link = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
            for title, link in zip(title, link):
                print(title)
                print(link.lstrip("/url?q="))
My next step is to use a pipeline in Scrapy to save the results to a CSV file.
Here is the code that I have written so far.
settings.py:
ITEM_PIPELINES = {'GoogleScrapy.pipelines.GooglePipeline': 300,}
pipelines.py:
from scrapy.exporters import CsvItemExporter

class GooglePipeline(object):
    def __init__(self):
        self.file = open("GoogleSearchResult.csv", 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
This is my modified spider code.
GoogleBot.py:
def parse(self, response):
    item = {}
    all_page = response.xpath('//*[@id="main"]')
    for page in all_page:
        item['title'] = page.xpath('//*[@id="main"]/div/div/div/a/h3/div/text()').extract()
        item['link'] = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
        for title, link in zip(title, link):
            print(title)
            print(link.lstrip("/url?q="))
            yield item
The error occurs here:
for title, link in zip(title, link):
    print(title)
    print(link.lstrip("/url?q="))
I get this error:
for title, link in zip(title, link):
UnboundLocalError: local variable 'title' referenced before assignment
Here is a working version, based on your comment.
import scrapy

class GoogleBotsSpider(scrapy.Spider):
    name = 'GoogleScrapyBot'
    allowed_domains = ['google.com']
    start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']

    def parse(self, response):
        all_page = response.xpath('//*[@id="main"]')
        for page in all_page:
            titles = page.xpath('//*[@id="main"]/div/div/div/a/h3/div//text()').extract()
            for title in titles:
                links = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
                for link in links:
                    item = {
                        'Title': title,
                        'Link': link
                    }
                    yield item
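Note that the nested loops above yield one item for every title/link combination. If you instead want one item per search result, pairing the nth title with the nth link, a zip-based variant of parse might look like this (a sketch, assuming the two XPath queries return lists of equal length):

    def parse(self, response):
        for page in response.xpath('//*[@id="main"]'):
            titles = page.xpath('//*[@id="main"]/div/div/div/a/h3/div//text()').extract()
            links = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
            # zip pairs the nth title with the nth link and stops at the shorter list
            for title, link in zip(titles, links):
                yield {'Title': title, 'Link': link}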
Hello, I'm new to the Python/Scrapy world. I need to export my list of products to CSV like this example:
what I want
but I get this one:
what I got
/////
spider:
/////
import scrapy
import csv
from escrap.items import EscrapItem

class EscrapSpider(scrapy.Spider):
    name = "tunisianet"
    allowed_domains = ["tunisianet.com.tn"]
    start_urls = [
        "http://www.tunisianet.com.tn/385-logiciels-informatique-tunisie/"
    ]

    def parse(self, response):
        for sel in response.xpath('//*[contains(@class, "ajax_block_product")]'):
            item = EscrapItem()
            item['revendeur'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet'))
            item['produit'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/text()').extract())
            item['lien'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').extract())
            item['description'] = '\n'.join(sel.xpath('//*[contains(@class, "product_desc")]/a/text()').extract())
            item['prix'] = '\n'.join(sel.xpath('//*[contains(@class, "price")]/text()').extract())
            data = [item['revendeur'], item['produit'], item['lien'], item['description'], item['prix']]
            out = open('out.csv', 'w')
            for row in data:
                for column in row:
                    out.write(column.encode('utf-8'))
        return data
/////
items:
/////
import scrapy

class EscrapItem(scrapy.Item):
    revendeur = scrapy.Field()
    produit = scrapy.Field()
    lien = scrapy.Field()
    description = scrapy.Field()
    prix = scrapy.Field()
/////
pipelines:
/////
from scrapy.exceptions import DropItem  # needed for the DropItem below

class EscrapPipeline(object):
    # put all words in lowercase
    words_to_filter = ['politics', 'religion']

    def process_item(self, item, spider):
        for word in self.words_to_filter:
            if word in unicode([item['revendeur'], item['produit'], item['lien'], item['description'], item['prix']]).lower():
                raise DropItem("Contains forbidden word: %s" % word)
            else:
                return item
/////
my setting:
/////
BOT_NAME = 'escrap'
SPIDER_MODULES = ['escrap.spiders']
NEWSPIDER_MODULE = 'escrap.spiders'
ITEM_PIPELINES = {'escrap.pipelines.EscrapPipeline': 1}
FEED_EXPORTERS = {
    'csv': 'escrap.escrap_csv_item_exporter.EscrapCsvItemExporter',
}
FIELDS_TO_EXPORT = [
    'revendeur',
    'produit',
    'lien',
    'description',
    'prix'
]
You don't need to create the CSV file yourself when parsing items; Scrapy can export to a CSV file by default.
So change your parse method to:
def parse(self, response):
    for sel in response.xpath('//*[contains(@class, "ajax_block_product")]'):
        item = EscrapItem()
        item['revendeur'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet'))
        item['produit'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/text()').extract())
        item['lien'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').extract())
        item['description'] = '\n'.join(sel.xpath('//*[contains(@class, "product_desc")]/a/text()').extract())
        item['prix'] = '\n'.join(sel.xpath('//*[contains(@class, "price")]/text()').extract())
        yield item
Later, when running the spider, you can call it with:
scrapy crawl myspider -o output.csv
Now you have all your items exported to a csv file.
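If you prefer not to pass -o on the command line every time, you can also configure the export in settings.py. A minimal sketch (the setting names depend on your Scrapy version; FEED_FORMAT/FEED_URI are the older form, FEEDS the newer one):

# Older Scrapy versions (matching the scrapy.contrib-era imports used above):
FEED_FORMAT = 'csv'
FEED_URI = 'output.csv'

# Newer Scrapy versions use the FEEDS dict instead:
# FEEDS = {'output.csv': {'format': 'csv'}}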
If you still want to control it from your own pipeline, check the documentation on creating your own exporter. It would look like this:
from scrapy import signals
from scrapy.exporters import CsvItemExporter

class CSVExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
To create your own pipeline, make sure to read the item pipeline documentation in full.
You should probably set the cell where you want to write your data. Something like:
worksheet.write('A1', 'thing you want to write')
Otherwise it may default to writing content in column 'A'.
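Assuming this refers to the xlsxwriter library (an assumption on my part, since the question itself is about CSV), a minimal sketch would be:

import xlsxwriter

workbook = xlsxwriter.Workbook('products.xlsx')   # hypothetical output file name
worksheet = workbook.add_worksheet()

worksheet.write('A1', 'produit')         # write into a specific cell by address
worksheet.write(1, 0, 'some product')    # or address cells by (row, col), zero-based
workbook.close()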
It exports, but not in the form I want. I want the form like this one:
http://i.imgur.com/r8LaVem.png, but I got this one: http://i.imgur.com/8IVnlui.png.
Here is my final class:
def parse(self, response):
    item = TfawItem()
    data = []
    items = []
    out = open('out.csv', 'a')
    x = response.xpath('//*[contains(@class, "ajax_block_product")]')
    for i in range(0, len(x)):
        item['revendeur'] = response.xpath('//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet')[i]
        item['produit'] = response.xpath('//*[contains(@class, "center_block")]/h2/a/text()').extract()[i]
        item['url'] = response.xpath('//*[contains(@class, "center_block")]/h2/a/@href').extract()[i]
        item['description'] = response.xpath('//*[contains(@class, "product_desc")]/a/text()').extract()[i]
        item['prix'] = response.xpath('//*[contains(@class, "price")]/text()').extract()[i]
        data = item['revendeur'], item['produit'], item['url'], item['description'], item['prix']
        out.write(str(data))
        out.write('\n')
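For what it's worth, the wrong shape mostly comes from writing str(data) by hand; the standard csv module separates and quotes fields for you. A minimal sketch of the same loop using csv.writer (Python 2 style, to match the code above; the item fields are filled exactly as in the loop):

import csv

out = open('out.csv', 'ab')   # 'b' because Python 2's csv module expects a binary file
writer = csv.writer(out)
for i in range(0, len(x)):
    # ... fill item['revendeur'], item['produit'], etc. as above ...
    data = (item['revendeur'], item['produit'], item['url'],
            item['description'], item['prix'])
    # one properly quoted CSV row per product instead of a raw tuple repr
    writer.writerow([field.encode('utf-8') for field in data])
out.close()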
I have a problem where the scraped values print correctly in the terminal, but the exported XML file is not populated with any content.
The output in terminal is this:
[u'Tove'] [u'Jani'] [u'Reminder'] [u"Don't forget me this weekend!"]
However, the resulting site_products.xml looks like this (which is wrong; no data):
<?xml version="1.0" encoding="utf-8"?>
<items></items>
spider.py
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import CrawlerItem

class SiteSpider(XMLFeedSpider):
    name = 'site'
    allowed_domains = ['www.w3schools.com']
    start_urls = ['http://www.w3schools.com/xml/note.xml']
    itertag = 'note'

    def parse_node(self, response, selector):
        to = selector.xpath('//to/text()').extract()
        who = selector.xpath('//from/text()').extract()
        heading = selector.xpath('//heading/text()').extract()
        body = selector.xpath('//body/text()').extract()
        return item
pipelines.py
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter

class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
items.py
import scrapy

class CrawlerItem(scrapy.Item):
    to = scrapy.Field()
    who = scrapy.Field()
    heading = scrapy.Field()
    body = scrapy.Field()
    pass
settings.py
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
ITEM_PIPELINES = {'crawler.pipelines.XmlExportPipeline': 300,}
Any help with this would be really appreciated.
You need to instantiate a CrawlerItem instance in your parse_node() method:
def parse_node(self, response, selector):
    item = CrawlerItem()
    item['to'] = selector.xpath('//to/text()').extract()
    item['who'] = selector.xpath('//from/text()').extract()
    item['heading'] = selector.xpath('//heading/text()').extract()
    item['body'] = selector.xpath('//body/text()').extract()
    return item
I am trying to build a spider for a school project where I am scraping recipes from allrecipes.com. Everything is working really well; however, I seem to be unable to remove duplicate recipes where one URL contains the actual recipe and the other contains the same URL with "video=true" appended.
Here is my attempt to dealing with this in pipelines.py:
from scrapy.exceptions import DropItem
from scrapy import log

class DuplicatesPipeline(object):
    # minCal = 50

    def __init__(self):
        self.urls_seen = set()

    def process_vids(self, item, spider):
        video = "video=true"
        url = str(item.get('url'))
        if video in url:
            raise DropItem("Contains video")
        else:
            return item

    def process_item(self, item, spider):
        unique_id = item.get('url')
        if unique_id in self.urls_seen:
            raise DropItem("Duplicate Item found (%s)" % unique_id)
        else:
            self.urls_seen.add('url')
            return item
settings.py:
# Scrapy settings for dirbot project
BOT_NAME = 'dirbot'
SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'
ITEM_PIPELINES = {'dirbot.pipelines.DuplicatesPipeline': 300,}
items.py:
from scrapy.item import Item, Field

class Website(Item):
    name = Field()
    url = Field()
    description = Field()
    kcal = Field()
    carbs = Field()
    fat = Field()
    protein = Field()
    main = Field()
    sugar = Field()
    fibre = Field()
    author = Field()
    rating = Field()
    img = Field()
dnot.py:
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.request import Request
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider, Rule
import urlparse
import scrapy

page = "http://allrecipes.com/recipes/main.aspx?Page=%d#recipes"

class DmozSpider(Spider):
    name = "dnot"
    allowed_domains = ["allrecipes.com"]
    start_urls = [page % 1]
    rules = [Rule(SgmlLinkExtractor(allow=('allrecipes.com'), restrict_xpaths='//a[contains(.,"NEXT")]'),
                  callback="parse", follow=True),
             ]

    def __init__(self):
        self.page_number = 1

    def parse(self, response):
        print "-------------------------------------------------"
        print self.page_number
        print "-------------------------------------------------"
        sel = Selector(response)
        sites = response.xpath('//div[@id="divGridItemWrapper"]')
        items = []
        for site in sites:
            item = Website()
            recipe = response.xpath('//a[contains(@href, "/Recipe/")]/@href').extract()
            url = "http://www.allrecipes.com"
            for nth in recipe:
                go = urlparse.urljoin(url, str(nth))
                items.append(item)
            for link in go:
                yield Request(go, self.recipes)

        if self.page_number <= 3:
            self.page_number += 1
            yield Request(page % self.page_number)
        else:
            pass

    def recipes(self, response):
        item = Website()
        sel = Selector(response)
        recipe = response.xpath('//div[@id="content-wrapper"]')
        items = []
        print "second page - %s" % response.url
        for i in recipe:
            item['url'] = response.url
            item['description'] = i.xpath('//span[@itemprop="description"]/text()').extract()
            item['name'] = i.xpath('//h1[@itemprop="name"]/text()').extract()
            item['kcal'] = i.xpath('//ul/li[contains(.,"kcal")]/span/text()').extract()
            item['carbs'] = i.xpath('//ul/li[contains(.,"Carbohydrates")]/following-sibling::li[1]//span[@id="lblNutrientValue"]/text()').extract()
            item['fat'] = i.xpath('//ul/li[contains(.,"Fat")]/following-sibling::li[1]//span[@id="lblNutrientValue"]/text()').extract()
            item['protein'] = i.xpath('//ul/li[contains(.,"Protein")]/following-sibling::li[1]//span[@id="lblNutrientValue"]/text()').extract()
            item['main'] = "allrecipes.com"
            item['sugar'] = i.xpath('//li/span[@itemprop="sugarContent"]/text()').extract()
            item['fibre'] = i.xpath('//li/span[@itemprop="proteinContent"]/text()').extract()
            item['author'] = i.xpath('//span[@id="lblUser0"]/text()').extract()
            item['rating'] = i.xpath('//div[@class="rating-stars-img"][1]/meta[1][@itemprop="ratingValue"]/@content').extract()
            item['img'] = i.xpath('//img[@id="imgPhoto"]/@src').extract()
            items.append(item)
            yield item
I am a little new to Python, and I'm not sure whether I need to convert item['url'] into a string or not; I have tried with str and without. I have also tried a few other methods that others have used for something similar, but nothing has worked for me so far.
Hoping someone can point me in the right direction. Thanks in advance!
Example:
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1&video=true
You need to create a class that implements the process_item method in the pipelines.py file, something like:
from urllib import urlencode
from urlparse import urlparse, urlunparse, parse_qs
from scrapy.exceptions import DropItem  # needed for DropItem below

class DuplicatesPipeline(object):

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        url = item['url']
        u = urlparse(url)
        query = parse_qs(u.query)
        query.pop('video', None)
        u = u._replace(query=urlencode(query, True))
        unique_id = urlunparse(u)
        if unique_id and unique_id in self.ids_seen:
            raise DropItem("Duplicate Item found (%s)" % unique_id)
        else:
            self.ids_seen.add(unique_id)
            return item
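As a quick sanity check (a sketch, not part of the pipeline itself), both of your example URLs normalize to the same key once the video parameter is dropped:

from urllib import urlencode
from urlparse import urlparse, urlunparse, parse_qs

def normalize(url):
    # same normalization as in process_item above
    u = urlparse(url)
    query = parse_qs(u.query)
    query.pop('video', None)
    return urlunparse(u._replace(query=urlencode(query, True)))

a = normalize("http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1")
b = normalize("http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1&video=true")
assert a == b   # the second URL no longer counts as a distinct recipe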
Then you need to add that class to settings.py:
ITEM_PIPELINES = {
    'yourproject.pipelines.DuplicatesPipeline': 300,
}
Also, your process_vids method isn't being used.
Let me know if it helps.