Scrapy prints fields but doesn't populate XML file - python

I have a problem where Scrapy prints the fields correctly, but it doesn't populate the XML file with any content.
The output in terminal is this:
[u'Tove'] [u'Jani'] [u'Reminder'] [u"Don't forget me this weekend!"]
However, the output file site_products.xml ends up as this (which is wrong - no data):
<?xml version="1.0" encoding="utf-8"?>
<items></items>
spider.py
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import CrawlerItem

class SiteSpider(XMLFeedSpider):
    name = 'site'
    allowed_domains = ['www.w3schools.com']
    start_urls = ['http://www.w3schools.com/xml/note.xml']
    itertag = 'note'

    def parse_node(self, response, selector):
        to = selector.xpath('//to/text()').extract()
        who = selector.xpath('//from/text()').extract()
        heading = selector.xpath('//heading/text()').extract()
        body = selector.xpath('//body/text()').extract()
        return item
pipelines.py
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter

class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
items.py
import scrapy

class CrawlerItem(scrapy.Item):
    to = scrapy.Field()
    who = scrapy.Field()
    heading = scrapy.Field()
    body = scrapy.Field()
    pass
settings.py
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
ITEM_PIPELINES = {'crawler.pipelines.XmlExportPipeline': 300,}
Any help with this would be really appreciated.

You need to instantiate a CrawlerItem instance in your parse_node() method:
def parse_node(self, response, selector):
    item = CrawlerItem()
    item['to'] = selector.xpath('//to/text()').extract()
    item['who'] = selector.xpath('//from/text()').extract()
    item['heading'] = selector.xpath('//heading/text()').extract()
    item['body'] = selector.xpath('//body/text()').extract()
    return item
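Note that extract() returns a list, which is why the terminal output shows values such as [u'Tove']. If you would rather store plain strings in the feed, a minimal variant (assuming a Scrapy version that provides extract_first() on selector lists) looks like this:
def parse_node(self, response, selector):
    item = CrawlerItem()
    # extract_first() returns the first matching string (or None) instead of a list
    item['to'] = selector.xpath('//to/text()').extract_first()
    item['who'] = selector.xpath('//from/text()').extract_first()
    item['heading'] = selector.xpath('//heading/text()').extract_first()
    item['body'] = selector.xpath('//body/text()').extract_first()
    return item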

Related

Scrapy use case - new links only

At a high level - I think I'm trying to use the Scrapy framework like a scraping library.
My use case is: I have a webpage with links to meeting minutes I'd like to scrape, and as time passes, more links to meeting minutes are added.
My plan was to use a regular spider to scrape the links to meeting minutes, and pipeline/CsvItemExporter the list of links to a CSV.
Regular Spider 1 - webpage with links to meeting minutes I'd like to scrape, exports to csv:
class QuotesSpider(scrapy.Spider):
    name = "easthamptontown-links"
    custom_settings = {
        'ITEM_PIPELINES': {
            'themis.pipelines.ThemisPipeline': 400
        }
    }

    def start_requests(self):
        urls = [
            'http://easthamptontown.iqm2.com/Citizens/Calendar.aspx?From=1/1/1900&To=12/31/9999',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        rowtops = response.xpath('//div[@class="RowTop"]')
        for meeting in rowtops:
            yield {
                'meeting': meeting.css("a[href*='Detail_Meeting']").get(),
                'files': meeting.css("a[href*='FileView']").getall(),
            }
Pipeline 1
class ThemisPipeline:

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.csv' % spider.name, 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        file_output = {}
        _item = ItemAdapter(item).asdict()
        if len(_item['files']) > 0:
            for filelink in _item['files']:
                parser = MyHTMLParser()
                parser.feed(filelink)
                file_output['filelink'] = parser.lsHref
                file_output['filetype'] = parser.lsData
                parser.feed(_item['meeting'])
                file_output['meetinglink'] = parser.lsHref
                file_output['meetingtitle'] = parser.lsTitle
                file_output['meetingdate'] = parser.lsData.strip()
                self.exporter.export_item(file_output)
        else:
            DropItem(item)
        return item
A CsvReader()/list comprehension feeds the links from the CSV to a second regular spider via start_urls, which, using the links, scrapes the meeting minutes and pipelines/CsvItemExporter's them to a .txt file named for the link, e.g. meeting123.txt.
The second time I run the first scraper, I compare the links in the new CSV to the original CSV and scrape the meeting minutes only for links that appear in the new CSV but not the original, again exporting via the pipeline/CsvItemExporter to a .txt file named for the link, e.g. meeting124.txt.
My immediate problem is that passing the scraped minutes link to the pipeline, to name the file after the minutes link, is harder than I would have guessed - the framework doesn't seem to be designed for this.
regular spider 2 - scrapes meeting minutes from URLs supplied from a CSV:
class ASpider(scrapy.Spider):
    name = "town-text"
    custom_settings = {
        'ITEM_PIPELINES': {
            'themis.pipelines.MinutesPipeline': 400
        }
    }
    meetings = csvreader('./town-links.csv')
    # don't override start_requests, default scrapy.Request(url=url, callback=self.parse)
    start_urls = ['http://http://easthamptontown.iqm2.com/Citizens/' + meeting['filelink']
                  for meeting in meetings
                  if 'Zoning' in meeting['meetingtitle'] and
                  'Regular Meeting' in meeting['meetingtitle'] and
                  meeting['filetype'] == 'Minutes']

    def parse(self, response):
        for element in response.xpath('//div[@id="Content"]/div/*'):
            yield {
                'line': element.xpath('.//text()').getall(),
            }
pipeline:
class MinutesPipeline:

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.txt' % spider.name, 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        _item = ItemAdapter(item).asdict()
        self.exporter.export_item(_item)
        return item
I'd like to be able to pass the particular URL whose HTML I am scraping, meeting['filelink'], through to the CSV filename for the items. I tried changing scrapy.Spider to CrawlSpider to attempt to use parse_start_url(), but the selector did not return any data with CrawlSpider.
Any thoughts on design for this use case unique to the Scrapy framework would be appreciated.
If you want to use the URL as the filename, all you need to do is pass the URL along with the item, then create a new file with that filename and export to it.
For example:
In your parse method, add a url field to the dictionary and set response.url as its value.
def parse(self, response):
    for element in response.xpath('//div[@id="Content"]/div/*'):
        yield {
            'line': element.xpath('.//text()').getall(),
            'url': response.url
        }
Then in your pipeline:
def process_item(self, item, spider):
    url = item["url"]
    filename = url.split("/")[-1]
    file = open(filename, 'wb')           # CsvItemExporter needs a file object, not a path
    exporter = CsvItemExporter(file)
    exporter.start_exporting()
    text = item["line"]
    # ... do text formatting if needed
    # ... export the text to the file, then finish_exporting() and close the file
    raise DropItem
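A slightly fuller sketch of the same idea - one exporter per scraped URL, kept open until the spider closes - assuming the items carry the 'line' and 'url' keys from the parse method above (the class name MinutesPerUrlPipeline and the file-naming rule are illustrative, not from the original code):
from itemadapter import ItemAdapter
from scrapy import signals
from scrapy.exporters import CsvItemExporter

class MinutesPerUrlPipeline:

    def __init__(self):
        self.exporters = {}  # filename -> (file object, exporter)

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_closed(self, spider):
        for file, exporter in self.exporters.values():
            exporter.finish_exporting()
            file.close()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        # name the file after the last path segment of the scraped URL
        filename = adapter["url"].rstrip("/").split("/")[-1]
        if filename not in self.exporters:
            file = open('%s.csv' % filename, 'wb')
            exporter = CsvItemExporter(file)
            exporter.start_exporting()
            self.exporters[filename] = (file, exporter)
        self.exporters[filename][1].export_item({'line': adapter["line"]})
        return item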

Scrapy - What to do when no downloadable file is found?

I am currently working on a Scrapy program that can download files from the pages I'm scraping. The issue I am currently running into is that some pages have a datasheet, like this page - https://www.tyconsystems.com/rpms24-720-720 - while others, like this page - https://www.tyconsystems.com/tpdin-cable-232 - do not.
What is the proper way of passing data for when there is no file found on the page?
Additional question: is there any way to fix the issue of the CSV file having multiple lines per item when the item data is too long? Example item - rpms24-720-720.
Below is the code that I am using.
productInfo.py
from copyreg import clear_extension_cache
import scrapy
from ..items import tyconItem

class ProductInfoSpider(scrapy.Spider):
    name = "productInfo"
    allowed_domains = ['tyconsystems.com']
    start_urls = [
        'https://www.tyconsystems.com/rpms24-720-720',
        'https://www.tyconsystems.com/tpdin-cable-232',
    ]

    def parse(self, response):
        for product in response.css('section#listing'):
            items = tyconItem()  # Unique item for each iteration
            name_dirty = product.css('div.product-id span#product_id::text').get()
            product_sku = name_dirty.strip()
            product_sub_title_dirty = product.css('div.product-details h1.page_headers::text').get()
            product_sub_title = product_sub_title_dirty.strip()
            #product_store_description = product.css('p.series-card__intro').get()
            if product.xpath('//p[contains(@class, "MsoNormal")]'):
                summary = product.css('div.item > div p.MsoNormal').getall()
            elif product.xpath('//div[contains(@class, "item")]/div'):
                summary = product.css('div.item > div').getall()
            else:
                summary = product.css('div.item').getall()
            category_list = product.xpath('//div[@class="container"]//ol//li//a/span//text()').getall()
            category = category_list[-2].strip()
            description = product.css('div.item > p.MsoNormal::text').getall()
            if product.css('div.extrafieldsBlock span.info a::attr(href)').get() == '':
                datasheet = 'no-file'
            else:
                datasheet = product.css('div.extrafieldsBlock span.info a::attr(href)').get()
            file_urls = datasheet
            specification = product.css('div#tab-6 div.info > table').getall()
            price = product.css('span#price::text').get()
            products_zoom_image = name_dirty.strip() + '.jpg'
            main_image = product.css('div#addl-images a::attr(href)').getall()
            image_urls = [response.urljoin(i) for i in main_image]
            items['category'] = category,
            items['datasheet'] = datasheet,
            items['description'] = description,
            items['main_image'] = main_image,
            items['price'] = price,
            items['product_link'] = response.url,  # get the product link from response
            items['product_sku'] = product_sku,
            items['product_sub_title'] = product_sub_title,
            items['products_zoom_image'] = products_zoom_image
            items['specification'] = specification,
            items['summary'] = summary,
            items['file_urls'] = [file_urls]
            items["name"] = product_sku
            items["image_urls"] = image_urls
            yield items
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Field, Item
class tyconItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    category = scrapy.Field()
    datasheet = scrapy.Field()
    description = scrapy.Field()
    file_urls = scrapy.Field()
    files = scrapy.Field()
    name = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    main_image = scrapy.Field()
    price = scrapy.Field()
    product_link = scrapy.Field()
    product_sku = scrapy.Field()
    product_sub_title = scrapy.Field()
    products_zoom_image = scrapy.Field()
    specification = scrapy.Field()
    summary = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
# from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
from io import BytesIO
from PIL import Image
class tyconPipeline:
    def process_item(self, item, spider):
        return item

class DownfilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        file_name: str = request.url.split("/")[-1]
        return file_name

class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *args, item=None):
        filename = request.meta["filename"].strip()
        number = request.meta["file_num"]
        return filename + "_" + str(number) + ".jpg"

    def thumb_path(self, request, thumb_id, response=None, info=None):
        filename = request.meta["filename"]
        number = request.meta["file_num"]
        return f'thumbs/{thumb_id}/{filename}_{str(number)}.jpg'

    def get_media_requests(self, item, info):
        name = item["name"]
        for i, url in enumerate(item["image_urls"]):
            meta = {"filename": name, "file_num": i}
            yield Request(url, meta=meta)

    def convert_image(self, image, size=None):
        if size is not None:
            # If the size is not None then it is a thumbnail
            # so we resize it according the parameter
            image = image.resize(size, Image.ANTIALIAS)
        else:
            # otherwise we give the image to back to the superclass version of
            # this method for it to process.
            return super().convert_image(image, size=size)
        buf = BytesIO()  # These next 3 lines are from the scrapy source code.
        image.save(buf, 'JPEG', quality=72)
        return image, buf
Scrapy Error in Log
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 857, in _runCallbacks
current.result = callback( # type: ignore[misc]
File "/usr/lib/python3/dist-packages/scrapy/utils/defer.py", line 162, in f
return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
File "/usr/lib/python3/dist-packages/scrapy/pipelines/media.py", line 87, in process_item
requests = arg_to_iter(self.get_media_requests(item, info))
File "/usr/lib/python3/dist-packages/scrapy/pipelines/files.py", line 492, in get_media_requests
return [Request(u) for u in urls]
File "/usr/lib/python3/dist-packages/scrapy/pipelines/files.py", line 492, in <listcomp>
return [Request(u) for u in urls]
File "/usr/lib/python3/dist-packages/scrapy/http/request/__init__.py", line 60, in __init__
self._set_url(url)
File "/usr/lib/python3/dist-packages/scrapy/http/request/__init__.py", line 98, in _set_url
raise TypeError(f"Request url must be str, got {type(url).__name__}")
TypeError: Request url must be str, got NoneType
Thanks everyone!
Two approaches are possible:
1. Override get_media_requests
Override get_media_requests in your pipelines to check for the existence of URLs as follows:
class DownfilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        if not all(urls):
            return  # THIS - Don't return a Request if there is no URL
        return [Request(u) for u in urls]
    # Rest of the code
class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        urls = item.get("image_urls", [])
        if not all(urls):
            return None  # THIS - Don't return a Request if there is no URL
        name = item["name"]
        for i, url in enumerate(item["image_urls"]):
            meta = {"filename": name, "file_num": i}
            yield Request(url, meta=meta)
2. Return different items
You can have different item types returned from the spider based on whether you have an image to download or not. For ease, I prefer using anonymous dictionaries, as follows:
def parse(self, response):
    item = {}
    item['category'] = category
    item['datasheet'] = datasheet
    ...
    if file_to_download:
        item['file_urls'] = [file_urls]
    if image_to_download:
        item['image_urls'] = image_urls
    item["name"] = product_sku
    yield item
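Whichever approach you choose, the media pipelines also need to be enabled in settings.py together with a storage location. A minimal sketch, where the module path 'tycon.pipelines' and the priorities are placeholders, not taken from the original project:
# settings.py (sketch - adjust the module path to your project)
ITEM_PIPELINES = {
    'tycon.pipelines.DownfilesPipeline': 150,
    'tycon.pipelines.ImagePipeline': 160,
}
FILES_STORE = 'downloads/files'    # where FilesPipeline saves the datasheets
IMAGES_STORE = 'downloads/images'  # where ImagesPipeline saves the product images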
Hope it helps!

Scrapy Pipeline with MongoDB is not working

I'm trying to put all the data that I'm scraping into MongoDB to monitor property prices.
I've already done a lot of tests, but it's not working.
I will put the code here; if anyone could help me, please do.
init.py
import scrapy
from realstatedata.items import RealstatedataItem

class RsdataSpider(scrapy.Spider):
    name = 'realstatedata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        nextpageurl = response.xpath('//a[@title="Próxima página"]/@href')
        yield from self.scrape(response)
        if nextpageurl:
            path = nextpageurl.extract_first()
            #print(path)
            # Got #pagina=2 => Replace with ?pagina=2
            path = '?' + path[1:]
            #print(path)
            nextpage = response.urljoin(path)
            print("Found url: {}".format(nextpage))
            yield scrapy.Request(nextpage)

    def scrape(self, response):
        for resource in response.xpath('//article[@class="property-card__container js-property-card"]/..'):
            item = RealstatedataItem()
            item['description'] = resource.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()').extract_first()
            item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            item['price_cond'] = resource.xpath('.//strong[@class="js-condo-price"]/text()').extract_first()
            item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()
            yield item
settings.py
BOT_NAME = 'realstatedata'
SPIDER_MODULES = ['realstatedata.spiders']
NEWSPIDER_MODULE = 'realstatedata.spiders'
ITEM_PIPELINES = ['realstatedata.pipelines.MongoPipeline', ]
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = "vivareal_db"
items.py
import scrapy

class RealstatedataItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    description = scrapy.Field()
    address = scrapy.Field()
    prop_area = scrapy.Field()
    prop_rooms = scrapy.Field()
    prop_bath = scrapy.Field()
    prop_parking = scrapy.Field()
    price_rent = scrapy.Field()
    price_cond = scrapy.Field()
    realstate_name = scrapy.Field()
    pass
pipeline.py
In this part of the code I've tried two different approaches, but neither works.
import pymongo
import logging

class MongoPipeline(object):

    collection_name = 'rent_properties'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    def process_item(self, item, spider):
        ## how to handle each post
        self.db[self.collection_name].insert(dict(item))
        logging.debug("Properties added to MongoDB")
        return item
Obviously your setting for enabling the pipeline is wrong: ITEM_PIPELINES should be defined as a dict, not a list. In your code, the pipeline is not loaded at all.
ITEM_PIPELINES = {
    "realstatedata.pipelines.MongoPipeline": 100,
}
The value in the dict represents the priority when more than one pipeline is enabled.
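Separately, if you are running a recent PyMongo (insert() was deprecated in 3.0 and removed in 4.0), the write in process_item would also need to use insert_one(); a minimal sketch of that one method:
def process_item(self, item, spider):
    # insert_one() is the supported call in PyMongo 3.x/4.x; insert() was removed in 4.0
    self.db[self.collection_name].insert_one(dict(item))
    logging.debug("Properties added to MongoDB")
    return item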

output python to csv regular

Hello, I'm new to the Python/Scrapy world. I need to export my list of products to CSV like this example:
what I want
but I get this one:
what I got
/////
spider:
/////
import scrapy
import csv
from escrap.items import EscrapItem

class EscrapSpider(scrapy.Spider):
    name = "tunisianet"
    allowed_domains = ["tunisianet.com.tn"]
    start_urls = [
        "http://www.tunisianet.com.tn/385-logiciels-informatique-tunisie/"
    ]

    def parse(self, response):
        for sel in response.xpath('//*[contains(@class, "ajax_block_product")]'):
            item = EscrapItem()
            item['revendeur'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet'))
            item['produit'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/text()').extract())
            item['lien'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').extract())
            item['description'] = '\n'.join(sel.xpath('//*[contains(@class, "product_desc")]/a/text()').extract())
            item['prix'] = '\n'.join(sel.xpath('//*[contains(@class, "price")]/text()').extract())
            data = [item['revendeur'], item['produit'], item['lien'], item['description'], item['prix']]
            out = open('out.csv', 'w')
            for row in data:
                for column in row:
                    out.write(column.encode('utf-8'))
        return data
/////
items:
/////
import scrapy

class EscrapItem(scrapy.Item):
    revendeur = scrapy.Field()
    produit = scrapy.Field()
    lien = scrapy.Field()
    description = scrapy.Field()
    prix = scrapy.Field()
/////
pipelines:
/////
class EscrapPipeline(object):
    # put all words in lowercase
    words_to_filter = ['politics', 'religion']

    def process_item(self, item, spider):
        for word in self.words_to_filter:
            if word in unicode([item['revendeur'], item['produit'], item['lien'], item['description'], item['prix']]).lower():
                raise DropItem("Contains forbidden word: %s" % word)
            else:
                return item
/////
my setting:
/////
BOT_NAME = 'escrap'
SPIDER_MODULES = ['escrap.spiders']
NEWSPIDER_MODULE = 'escrap.spiders'
ITEM_PIPELINES = {'escrap.pipelines.EscrapPipeline': 1}
FEED_EXPORTERS = {
    'csv': 'escrap.escrap_csv_item_exporter.EscrapCsvItemExporter',
}
FIELDS_TO_EXPORT = [
    'revendeur',
    'produit',
    'lien',
    'description',
    'prix'
]
You don't need to create the CSV file yourself when parsing items; Scrapy can export to a CSV file by default.
So change your parse method to:
def parse(self, response):
    for sel in response.xpath('//*[contains(@class, "ajax_block_product")]'):
        item = EscrapItem()
        item['revendeur'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet'))
        item['produit'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/text()').extract())
        item['lien'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').extract())
        item['description'] = '\n'.join(sel.xpath('//*[contains(@class, "product_desc")]/a/text()').extract())
        item['prix'] = '\n'.join(sel.xpath('//*[contains(@class, "price")]/text()').extract())
        yield item
Later, when calling scrapy, you can run it with:
scrapy crawl myspider -o output.csv
Now you have all your items exported to a csv file.
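If you also want to control which columns appear and in which order, newer Scrapy versions read the FEED_EXPORT_FIELDS setting; a sketch, where the field names simply mirror the item definition above:
# settings.py
FEED_EXPORT_FIELDS = ['revendeur', 'produit', 'lien', 'description', 'prix']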
If you still want to control it with your own pipeline, check here to create your own exporter. It would look like this:
from scrapy import signals
from scrapy.exporters import CsvItemExporter

class CSVExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
To create your own pipeline make sure to read this entirely.
You should probably set the cell where you want to write your data, something like:
worksheet.write('A1', 'thing you want to write')
Otherwise it may default to writing content in column 'A'.
It exports, but not in the form I want. I want the form like this one:
http://i.imgur.com/r8LaVem.png , but I got this one http://i.imgur.com/8IVnlui.png .
Here is my final parse method:
def parse(self, response):
    item = TfawItem()
    data = []
    items = []
    out = open('out.csv', 'a')
    x = response.xpath('//*[contains(@class, "ajax_block_product")]')
    for i in range(0, len(x)):
        item['revendeur'] = response.xpath('//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet')[i]
        item['produit'] = response.xpath('//*[contains(@class, "center_block")]/h2/a/text()').extract()[i]
        item['url'] = response.xpath('//*[contains(@class, "center_block")]/h2/a/@href').extract()[i]
        item['description'] = response.xpath('//*[contains(@class, "product_desc")]/a/text()').extract()[i]
        item['prix'] = response.xpath('//*[contains(@class, "price")]/text()').extract()[i]
        data = item['revendeur'], item['produit'], item['url'], item['description'], item['prix']
        out.write(str(data))
        out.write('\n')

No output in XML file using XMLITEMEXPORTER

I am a beginner with Python and I am working with Scrapy. I have used XmlItemExporter to export my scraped data to an XML file, but I only get "</item>" in the XML file.
My items.py is as follows:
from scrapy.item import Item, Field

class WorkwithitemsItem(Item):
    title = Field()
    link = Field()
    publish = Field()
    description = Field()
And the spider is like:
from scrapy import log
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from workwithitems.items import WorkwithitemsItem

class MySpider(BaseSpider):
    name = 'spidey'
    allowed_domains = ['ekantipur.com']
    start_urls = [
        'http://www.ekantipur.com/en/rss',
    ]

    def parse(self, response):
        self.log('A response from %s just arrived!' % response.url)
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        link = sel.xpath('//link/text()').extract()
        publish = sel.xpath('//pubDate/text()').extract()
        description = sel.xpath('//description/text()').extract()
        WorkwithitemsItem(title = title[2:], link = link[2:],
                          publish = publish, description = description[1:])
And the pipelines.py is:
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter

class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
The settings.py is:
BOT_NAME = 'workwithitems'
SPIDER_MODULES = ['workwithitems.spiders']
NEWSPIDER_MODULE = 'workwithitems.spiders'
FEED_EXPORTERS_BASE = {
    'xml': 'scrapy.contrib.exporter.XmlItemExporter',
}
ITEM_PIPELINES = {
    'workwithitems.pipelines.XmlExportPipeline': 800,
}
I can't figure out where my problem is.
OK! I found the problem. What I did was just put a return on the last line in spider.py:
return WorkwithitemsItem(title = title[2:], link = link[2:],
publish = publish, description = description[1:]
)
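Yielding the item works just as well, and is the more common pattern when a callback can produce several items; a minimal sketch of the same ending:
yield WorkwithitemsItem(title=title[2:], link=link[2:],
                        publish=publish, description=description[1:])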
