Hello, I'm new to the Python/Scrapy world. I need to export my list of products to a CSV file like this example:
what I want
But I get this one:
what I got
/////
spider:
/////
import scrapy
import csv
from escrap.items import EscrapItem

class EscrapSpider(scrapy.Spider):
    name = "tunisianet"
    allowed_domains = ["tunisianet.com.tn"]
    start_urls = [
        "http://www.tunisianet.com.tn/385-logiciels-informatique-tunisie/"
    ]

    def parse(self, response):
        for sel in response.xpath('//*[contains(@class, "ajax_block_product")]'):
            item = EscrapItem()
            item['revendeur'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet'))
            item['produit'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/text()').extract())
            item['lien'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').extract())
            item['description'] = '\n'.join(sel.xpath('//*[contains(@class, "product_desc")]/a/text()').extract())
            item['prix'] = '\n'.join(sel.xpath('//*[contains(@class, "price")]/text()').extract())
            data = [item['revendeur'], item['produit'], item['lien'], item['description'], item['prix']]
            out = open('out.csv', 'w')
            for row in data:
                for column in row:
                    out.write(column.encode('utf-8'))
        return data
/////
items:
/////
import scrapy

class EscrapItem(scrapy.Item):
    revendeur = scrapy.Field()
    produit = scrapy.Field()
    lien = scrapy.Field()
    description = scrapy.Field()
    prix = scrapy.Field()
/////
pipelines:
/////
from scrapy.exceptions import DropItem

class EscrapPipeline(object):
    # put all words in lowercase
    words_to_filter = ['politics', 'religion']

    def process_item(self, item, spider):
        for word in self.words_to_filter:
            if word in unicode([item['revendeur'], item['produit'], item['lien'], item['description'], item['prix']]).lower():
                raise DropItem("Contains forbidden word: %s" % word)
        return item
/////
my settings:
/////
BOT_NAME = 'escrap'
SPIDER_MODULES = ['escrap.spiders']
NEWSPIDER_MODULE = 'escrap.spiders'
ITEM_PIPELINES = {'escrap.pipelines.EscrapPipeline': 1}
FEED_EXPORTERS = {
    'csv': 'escrap.escrap_csv_item_exporter.EscrapCsvItemExporter',
}
FIELDS_TO_EXPORT = [
    'revendeur',
    'produit',
    'lien',
    'description',
    'prix'
]
You don't need to create the CSV file yourself when parsing items; Scrapy can export to a CSV file by default.
So change your parse method to:
def parse(self, response):
    for sel in response.xpath('//*[contains(@class, "ajax_block_product")]'):
        item = EscrapItem()
        item['revendeur'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet'))
        item['produit'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/text()').extract())
        item['lien'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').extract())
        item['description'] = '\n'.join(sel.xpath('//*[contains(@class, "product_desc")]/a/text()').extract())
        item['prix'] = '\n'.join(sel.xpath('//*[contains(@class, "price")]/text()').extract())
        yield item
Later, when running Scrapy, you can call it with:
scrapy crawl myspider -o output.csv
Now you have all your items exported to a CSV file.
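If you also need to control which columns appear and in what order, note that FIELDS_TO_EXPORT on its own in settings.py has no effect (it is an exporter option); recent Scrapy versions provide the FEED_EXPORT_FIELDS setting for this. A minimal sketch for your item:

# settings.py: column selection and order for the built-in feed exports
FEED_EXPORT_FIELDS = ['revendeur', 'produit', 'lien', 'description', 'prix']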
If you still want to control it with your own pipeline, check here to create your own exporter. It would look like this:
from scrapy import signals
from scrapy.exporters import CsvItemExporter

class CSVExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
To create your own pipeline make sure to read this entirely.
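And as with any pipeline, it only runs once it is registered in settings.py; a minimal sketch, assuming the class lives in your project's pipelines.py (adjust the dotted path and priority to your project):

ITEM_PIPELINES = {
    'myproject.pipelines.CSVExportPipeline': 300,
}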
You should probably set the cell where you want to write your data, something like:
worksheet.write('A1', 'thing you want to write')
Otherwise it may default to writing content in column 'A'.
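For context, worksheet.write is the xlsxwriter API; a minimal sketch, assuming you are building the spreadsheet yourself with that library (the file name and values are illustrative):

import xlsxwriter

workbook = xlsxwriter.Workbook('products.xlsx')
worksheet = workbook.add_worksheet()
# write() accepts either 'A1'-style notation or zero-based (row, col) indices
worksheet.write('A1', 'produit')
worksheet.write(0, 1, 'prix')
workbook.close()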
It exports, but not in the form I want. I want the form like this one: http://i.imgur.com/r8LaVem.png, but I got this one: http://i.imgur.com/8IVnlui.png.
Here is my final class:
def parse(self, response):
    item = TfawItem()
    data = []
    items = []
    out = open('out.csv', 'a')
    x = response.xpath('//*[contains(@class, "ajax_block_product")]')
    for i in range(0, len(x)):
        item['revendeur'] = response.xpath('//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet')[i]
        item['produit'] = response.xpath('//*[contains(@class, "center_block")]/h2/a/text()').extract()[i]
        item['url'] = response.xpath('//*[contains(@class, "center_block")]/h2/a/@href').extract()[i]
        item['description'] = response.xpath('//*[contains(@class, "product_desc")]/a/text()').extract()[i]
        item['prix'] = response.xpath('//*[contains(@class, "price")]/text()').extract()[i]
        data = item['revendeur'], item['produit'], item['url'], item['description'], item['prix']
        out.write(str(data))
        out.write('\n')
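For what it's worth, out.write(str(data)) writes the tuple's Python repr (parentheses, quotes and all), which is why the rows don't look like the target screenshot. A minimal sketch of the same write using the standard csv module instead (Python 3 open() shown; on Python 2, open in binary mode and drop the keyword arguments):

import csv

out = open('out.csv', 'a', newline='', encoding='utf-8')
writer = csv.writer(out)
# inside the loop, instead of out.write(str(data)):
writer.writerow(data)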
Related
I am trying to save scraped items in separate JSON files, but I don't see any output files. The pipeline and the item are defined in the pipelines.py and items.py files in the Scrapy project folder. Do I have to call process_item() explicitly, or will it be called automatically when I return an item in scrape()? I enabled the pipeline in CrawlerProcess(settings={'ITEM_PIPELINES'}). Thanks.
The pipeline
import json, datetime

class JsonWriterPipeline(object):
    def process_item(self, item, spider):
        # return item
        fileName = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.json'
        try:
            with open(fileName, 'w') as fp:
                json.dump(dict(item), fp)
            return item
        except:
            return item
import scrapy
from scrapy.spiders import CrawlSpider

class ProjectItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()

class mySpider(CrawlSpider):
    name = 'mySPider'
    allowed_domains = ['allowedDOmain.org']
    start_urls = ['https://url.org']

    def parse(self, response):
        monthSelector = '//div[@class="archives-column"]/ul/li/a[contains(text(),"November 2019")]/@href'
        monthLink = response.xpath(monthSelector).extract_first()
        yield response.follow(monthLink, callback=self.scrape)

    def scrape(self, response):
        # get the links to all individual articles
        linkSelector = '.entry-title a::attr(href)'
        allLinks = response.css(linkSelector).extract()
        for link in allLinks:
            # item = articleItem()
            item = ProjectItem()
            item['url'] = link
            request = response.follow(link, callback=self.getContent)
            request.meta['item'] = item
            item = request.meta['item']
            yield item
        nextPageSelector = 'span.page-link a::attr(href)'
        nextPageLink = response.css(nextPageSelector).extract_first()
        yield response.follow(nextPageLink, callback=self.scrape)

    def getContent(self, response):
        item = response.meta['item']
        TITLE_SELECTOR = '.entry-title ::text'
        item['title'] = response.css(TITLE_SELECTOR).extract_first()
        yield item
To settings.py, add:
ITEM_PIPELINES = {
    'myproject.pipelines.JsonWriterPipeline': 100
}
where myproject is the name of your project/folder.
See the very last heading on this page: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
When running a spider inside a script, the settings need to be passed in using the method described here: Running scrapy from script not including pipeline
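A minimal sketch of that approach, with the pipeline enabled in the settings dict passed to CrawlerProcess (the dotted path assumes the project layout above):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'ITEM_PIPELINES': {'myproject.pipelines.JsonWriterPipeline': 100},
})
process.crawl(mySpider)
process.start()  # blocks until the crawl is finished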
How can I return NaN for URLs that do not have any: ".//*[@id='object']//tbody//tr//td//span//a[2]"? I tried:
def parse(self, response):
    links = response.xpath(".//*[@id='object']//tbody//tr//td//span//a[2]")
    if not links:
        item = ToyItem()
        item['link'] = 'NaN'
        item['name'] = response.url
        return item
    for links in links:
        item = ToyItem()
        item['link'] = links.xpath('@href').extract_first()
        item['name'] = response.url  # <-- see here
        yield item

list_of_dics = []
list_of_dics.append(item)
df = pd.DataFrame(list_of_dics)
print(df)
df.to_csv('/Users/user/Desktop/crawled_table.csv', index=False)
However, instead of returning (*):
'link1.com' 'NaN'
'link2.com' 'NAN'
'link3.com' 'extracted3.link.com'
I got:
'link3.com' 'extracted3.link.com'
How can I return (*)?
You can rework this to use scrapy pipelines:
from scrapy import Spider

class MySpider(Spider):
    name = 'myspider'
    start_urls = ['link1', 'link2', 'link3']

    def parse(self, response):
        links = response.xpath(".//*[@id='object']//tbody//tr//td//span//a[2]")
        if not links:
            item = ToyItem()
            item['link'] = 'NaN'
            item['name'] = response.url
            yield item
        else:
            for link in links:
                item = ToyItem()
                item['link'] = link.xpath('@href').extract_first()
                item['name'] = response.url  # <-- see here
                yield item
Now in your pipelines.py:
import pandas as pd

class PandasPipeline:
    def open_spider(self, spider):
        self.data = []

    def process_item(self, item, spider):
        self.data.append(item)
        return item

    def close_spider(self, spider):
        df = pd.DataFrame(self.data)
        print('saving dataframe')
        df.to_csv('/Users/user/Desktop/crawled_table.csv', index=False)
and for settings.py:
ITEM_PIPELINES = {
    'myproject.pipelines.PandasPipeline': 900
}
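With the pipeline registered, running the spider normally is enough; close_spider fires when the crawl ends and writes the CSV:

scrapy crawl myspider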
I have a problem where the spider prints the scraped values correctly, but it doesn't populate the XML file with any content.
The output in terminal is this:
[u'Tove'] [u'Jani'] [u'Reminder'] [u"Don't forget me this weekend!"]
However, the output site_products.xml contains this (which is wrong; no data):
<?xml version="1.0" encoding="utf-8"?>
<items></items>
spider.py
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import CrawlerItem

class SiteSpider(XMLFeedSpider):
    name = 'site'
    allowed_domains = ['www.w3schools.com']
    start_urls = ['http://www.w3schools.com/xml/note.xml']
    itertag = 'note'

    def parse_node(self, response, selector):
        to = selector.xpath('//to/text()').extract()
        who = selector.xpath('//from/text()').extract()
        heading = selector.xpath('//heading/text()').extract()
        body = selector.xpath('//body/text()').extract()
        return item
pipelines.py
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter

class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
items.py
import scrapy

class CrawlerItem(scrapy.Item):
    to = scrapy.Field()
    who = scrapy.Field()
    heading = scrapy.Field()
    body = scrapy.Field()
settings.py
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
ITEM_PIPELINES = {'crawler.pipelines.XmlExportPipeline': 300,}
Any help with this would be really appreciated.
You need to instantiate a CrawlerItem instance in your parse_node() method:
def parse_node(self, response, selector):
    item = CrawlerItem()
    item['to'] = selector.xpath('//to/text()').extract()
    item['who'] = selector.xpath('//from/text()').extract()
    item['heading'] = selector.xpath('//heading/text()').extract()
    item['body'] = selector.xpath('//body/text()').extract()
    return item
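One caveat worth noting: XPaths starting with // search the whole document rather than the current node, which happens to work here only because the feed contains a single note. With multiple itertag nodes you would want node-relative paths, sketched below:

def parse_node(self, response, selector):
    item = CrawlerItem()
    # './/' keeps each expression relative to the current <note> node
    item['to'] = selector.xpath('.//to/text()').extract()
    item['who'] = selector.xpath('.//from/text()').extract()
    item['heading'] = selector.xpath('.//heading/text()').extract()
    item['body'] = selector.xpath('.//body/text()').extract()
    return item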
I am a beginner to Python and I am working with Scrapy. I have used XmlItemExporter to export my scraped data to an XML file, but I get only "</item>" in the XML file.
My items.py is as follows:
from scrapy.item import Item, Field

class WorkwithitemsItem(Item):
    title = Field()
    link = Field()
    publish = Field()
    description = Field()
And the spider is like:
from scrapy import log
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from workwithitems.items import WorkwithitemsItem

class MySpider(BaseSpider):
    name = 'spidey'
    allowed_domains = ['ekantipur.com']
    start_urls = [
        'http://www.ekantipur.com/en/rss',
    ]

    def parse(self, response):
        self.log('A response from %s just arrived!' % response.url)
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        link = sel.xpath('//link/text()').extract()
        publish = sel.xpath('//pubDate/text()').extract()
        description = sel.xpath('//description/text()').extract()
        WorkwithitemsItem(title=title[2:], link=link[2:],
            publish=publish, description=description[1:])
And the pipelines.py is:
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter

class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
The settings.py is:
BOT_NAME = 'workwithitems'
SPIDER_MODULES = ['workwithitems.spiders']
NEWSPIDER_MODULE = 'workwithitems.spiders'
FEED_EXPORTERS_BASE = {
    'xml': 'scrapy.contrib.exporter.XmlItemExporter',
}
ITEM_PIPELINES = {
    'workwithitems.pipelines.XmlExportPipeline': 800,
}
I can't figure out where my problem is.
OK! I found the problem. What I did was just put a return on the last line in spider.py:
return WorkwithitemsItem(title=title[2:], link=link[2:],
    publish=publish, description=description[1:]
)
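Equivalently, yielding the item also works and scales to more than one item per callback; a sketch:

yield WorkwithitemsItem(title=title[2:], link=link[2:],
    publish=publish, description=description[1:]
)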
How can I call writeXML after my parser finishes crawling data? Currently I can see the crawled data, but I don't see the output file. I tried to print inside writeXML; no output there either.
Below is my code:
class FriendSpider(BaseSpider):
    # identifies of the Spider
    name = "friend"
    count = 0
    allowed_domains = ["example.com.us"]
    start_urls = [
        "http://example.com.us/biz/friendlist/"
    ]

    def start_requests(self):
        for i in range(0, 1722, 40):
            yield self.make_requests_from_url("http://example.com.us/biz/friendlist/?start=%d" % i)

    def parse(self, response):
        response = response.replace(body=response.body.replace('<br />', '\n'))
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul/li')
        items = []
        for site in sites:
            item = Item()
            self.count += 1
            item['id'] = str(self.count)
            item['name'] = site.select('.//div/div/h4/text()').extract()
            item['address'] = site.select('h4/span/text()').extract()
            item['review'] = ''.join(site.select('.//div[@class="review"]/p/text()').extract())
            item['birthdate'] = site.select('.//div/div/h5/text()').extract()
            items.append(item)
        return items

    def writeXML(self, items):
        root = ET.Element("Test")
        for item in items:
            item = ET.SubElement(root, 'item')
            item.set('id', item['id'])
            address = ET.SubElement(item, 'address')
            address.text = item['address']
            user = ET.SubElement(item, 'user')
            user.text = item['user']
            birthdate = ET.SubElement(item, 'birthdate')
            birthdate.text = item['birthdate']
            review = ET.SubElement(item, 'review')
            review.text = item['review']
        # wrap it in an ElementTree instance, and save as XML
        file = open("out.xml", 'w')
        tree = ET.ElementTree(root)
        tree.write(file, xml_declaration=True, encoding='utf-8', method="xml")
To output using the built-in XML exporter, try the following command:
scrapy crawl friend -o items.xml -t xml
If the output isn't to your liking, then you can try creating your own exporter using the XmlItemExporter class as a basis.
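A minimal sketch of that approach, assuming Scrapy's built-in XmlItemExporter (the import path moved to scrapy.exporters in newer releases; element names and the dotted path below are illustrative):

from scrapy.contrib.exporter import XmlItemExporter

class FriendXmlExporter(XmlItemExporter):
    def __init__(self, file, **kwargs):
        # root <Test> element with one <item> per scraped item
        kwargs['root_element'] = 'Test'
        kwargs['item_element'] = 'item'
        super(FriendXmlExporter, self).__init__(file, **kwargs)

# settings.py: route the xml feed format through the custom exporter
FEED_EXPORTERS = {
    'xml': 'myproject.exporters.FriendXmlExporter',
}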