output python to csv regular - python

Hello, I'm new to the Python/Scrapy world. I need to export my list of products to CSV like this example:
what i want
But I get this one:
what i got
/////
spider:
/////
import scrapy
import csv
from escrap.items import EscrapItem

class EscrapSpider(scrapy.Spider):
    name = "tunisianet"
    allowed_domains = ["tunisianet.com.tn"]
    start_urls = [
        "http://www.tunisianet.com.tn/385-logiciels-informatique-tunisie/"
    ]

    def parse(self, response):
        for sel in response.xpath('//*[contains(@class, "ajax_block_product")]'):
            item = EscrapItem()
            item['revendeur'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet'))
            item['produit'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/text()').extract())
            item['lien'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').extract())
            item['description'] = '\n'.join(sel.xpath('//*[contains(@class, "product_desc")]/a/text()').extract())
            item['prix'] = '\n'.join(sel.xpath('//*[contains(@class, "price")]/text()').extract())
            data = [item['revendeur'], item['produit'], item['lien'], item['description'], item['prix']]
            out = open('out.csv', 'w')
            for row in data:
                for column in row:
                    out.write(column.encode('utf-8'))
        return data
/////
items:
/////
import scrapy

class EscrapItem(scrapy.Item):
    revendeur = scrapy.Field()
    produit = scrapy.Field()
    lien = scrapy.Field()
    description = scrapy.Field()
    prix = scrapy.Field()
/////
pipelines:
/////
from scrapy.exceptions import DropItem

class EscrapPipeline(object):
    # drop items containing any of these words (compared in lowercase)
    words_to_filter = ['politics', 'religion']

    def process_item(self, item, spider):
        for word in self.words_to_filter:
            if word in unicode([item['revendeur'], item['produit'], item['lien'], item['description'], item['prix']]).lower():
                raise DropItem("Contains forbidden word: %s" % word)
        return item
/////
my settings:
/////
BOT_NAME = 'escrap'
SPIDER_MODULES = ['escrap.spiders']
NEWSPIDER_MODULE = 'escrap.spiders'
ITEM_PIPELINES = {'escrap.pipelines.EscrapPipeline': 1}
FEED_EXPORTERS = {
    'csv': 'escrap.escrap_csv_item_exporter.EscrapCsvItemExporter',
}
FIELDS_TO_EXPORT = [
    'revendeur',
    'produit',
    'lien',
    'description',
    'prix'
]

You don't need to create the CSV file yourself when parsing items; Scrapy can export to a CSV file by default. So change your parse method to:
def parse(self, response):
    for sel in response.xpath('//*[contains(@class, "ajax_block_product")]'):
        item = EscrapItem()
        item['revendeur'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet'))
        item['produit'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/text()').extract())
        item['lien'] = '\n'.join(sel.xpath('//*[contains(@class, "center_block")]/h2/a/@href').extract())
        item['description'] = '\n'.join(sel.xpath('//*[contains(@class, "product_desc")]/a/text()').extract())
        item['prix'] = '\n'.join(sel.xpath('//*[contains(@class, "price")]/text()').extract())
        yield item
Later, when calling Scrapy, you can run it with:
scrapy crawl myspider -o output.csv
Now you have all your items exported to a CSV file.
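If you prefer configuring the export once instead of passing -o on every run, the same thing can be set in settings.py; a minimal sketch, assuming a Scrapy version that supports the pre-2.1 FEED_URI/FEED_FORMAT settings (FEED_EXPORT_FIELDS also fixes the column order):

# settings.py (export configuration instead of the -o command-line flag)
FEED_FORMAT = 'csv'
FEED_URI = 'output.csv'
FEED_EXPORT_FIELDS = ['revendeur', 'produit', 'lien', 'description', 'prix']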
If you still want to control it with your own pipeline, check here to create your own exporter. It would look like this:
from scrapy import signals
from scrapy.exporters import CsvItemExporter

class CSVExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
To create your own pipeline make sure to read this entirely.
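Remember that a custom pipeline only runs once it is enabled; a minimal sketch for settings.py, assuming the class above lives in escrap/pipelines.py (the project name comes from the settings shown earlier):

# settings.py: enable the custom CSV pipeline
ITEM_PIPELINES = {
    'escrap.pipelines.CSVExportPipeline': 300,
}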

You should probably set the cell where you want to write your data, something like:
worksheet.write('A1', 'thing you want to write')
Otherwise it may default to writing content in column 'A'.
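For context, worksheet.write is the API of spreadsheet libraries such as xlsxwriter rather than the csv module; a minimal sketch, assuming xlsxwriter is installed and out.xlsx is a hypothetical output name:

import xlsxwriter

workbook = xlsxwriter.Workbook('out.xlsx')        # hypothetical output file
worksheet = workbook.add_worksheet()
worksheet.write('A1', 'thing you want to write')  # cell notation: column A, row 1
workbook.close()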

It exports, but not in the form I want. I want a form like this one:
http://i.imgur.com/r8LaVem.png , but I got this one: http://i.imgur.com/8IVnlui.png .
Here is my final parse method:
def parse(self, response):
    item = TfawItem()
    data = []
    items = []
    out = open('out.csv', 'a')
    x = response.xpath('//*[contains(@class, "ajax_block_product")]')
    for i in range(0, len(x)):
        item['revendeur'] = response.xpath('//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet')[i]
        item['produit'] = response.xpath('//*[contains(@class, "center_block")]/h2/a/text()').extract()[i]
        item['url'] = response.xpath('//*[contains(@class, "center_block")]/h2/a/@href').extract()[i]
        item['description'] = response.xpath('//*[contains(@class, "product_desc")]/a/text()').extract()[i]
        item['prix'] = response.xpath('//*[contains(@class, "price")]/text()').extract()[i]
        data = item['revendeur'], item['produit'], item['url'], item['description'], item['prix']
        out.write(str(data))
        out.write('\n')
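To get one field per column instead of a stringified tuple, the standard csv module handles separators and quoting for you; a minimal sketch of the same loop, assuming Python 2 (hence the manual utf-8 encoding) and that making the XPaths relative to each product block with a leading dot matches the intended behaviour:

import csv

def parse(self, response):
    with open('out.csv', 'ab') as f:
        writer = csv.writer(f)
        for sel in response.xpath('//*[contains(@class, "ajax_block_product")]'):
            row = [
                u'\n'.join(sel.xpath('.//*[contains(@class, "center_block")]/h2/a/@href').re('tunisianet')),
                u'\n'.join(sel.xpath('.//*[contains(@class, "center_block")]/h2/a/text()').extract()),
                u'\n'.join(sel.xpath('.//*[contains(@class, "center_block")]/h2/a/@href').extract()),
                u'\n'.join(sel.xpath('.//*[contains(@class, "product_desc")]/a/text()').extract()),
                u'\n'.join(sel.xpath('.//*[contains(@class, "price")]/text()').extract()),
            ]
            # one list element per CSV column; the writer handles commas and quotes
            writer.writerow([col.encode('utf-8') for col in row])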

Related

Activating a Pipeline Component in Scrapy to write JSON

I am trying to save scraped items in separate JSON files, but I don't see any output files. The pipeline and the item are defined in the pipelines.py and items.py files in the Scrapy project folder. Do I have to call process_item() explicitly, or will it be called automatically when I return an item in scrape()? I enabled the pipeline in CrawlerProcess(settings={'ITEM_PIPELINES'}). Thanks.
The pipeline
import json, datetime

class JsonWriterPipeline(object):
    def process_item(self, item, spider):
        # return item
        fileName = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.json'
        try:
            with open(fileName, 'w') as fp:
                json.dump(dict(item), fp)
                return item
        except:
            return item
import scrapy
from scrapy.spiders import CrawlSpider

class ProjectItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()

class mySpider(CrawlSpider):
    name = 'mySPider'
    allowed_domains = ['allowedDOmain.org']
    start_urls = ['https://url.org']

    def parse(self, response):
        monthSelector = '//div[@class="archives-column"]/ul/li/a[contains(text(),"November 2019")]/@href'
        monthLink = response.xpath(monthSelector).extract_first()
        yield response.follow(monthLink, callback=self.scrape)

    def scrape(self, response):
        # get the links to all individual articles
        linkSelector = '.entry-title a::attr(href)'
        allLinks = response.css(linkSelector).extract()
        for link in allLinks:
            # item = articleItem()
            item = ProjectItem()
            item['url'] = link
            request = response.follow(link, callback=self.getContent)
            request.meta['item'] = item
            item = request.meta['item']
            yield item
        nextPageSelector = 'span.page-link a::attr(href)'
        nextPageLink = response.css(nextPageSelector).extract_first()
        yield response.follow(nextPageLink, callback=self.scrape)

    def getContent(self, response):
        item = response.meta['item']
        TITLE_SELECTOR = '.entry-title ::text'
        item['title'] = response.css(TITLE_SELECTOR).extract_first()
        yield item
To settings.py, add:
ITEM_PIPELINES = {
    'myproject.pipelines.JsonWriterPipeline': 100
}
where myproject is the name of your project/folder.
See the very last heading on this page: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
When running a spider inside a script, the settings need to be imported using the method described in: Running scrapy from script not including pipeline.
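A minimal sketch of that approach, assuming the script lives inside the Scrapy project so get_project_settings() can locate settings.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # picks up ITEM_PIPELINES from settings.py
process.crawl(mySpider)
process.start()  # blocks until the crawl finishes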

How to return NaN for sites which do not have crawled info?

How can I return NaN for URLs which do not have any ".//*[@id='object']//tbody//tr//td//span//a[2]"? I tried to:
def parse(self, response):
    links = response.xpath(".//*[@id='object']//tbody//tr//td//span//a[2]")
    if not links:
        item = ToyItem()
        item['link'] = 'NaN'
        item['name'] = response.url
        return item
    for links in links:
        item = ToyItem()
        item['link'] = links.xpath('@href').extract_first()
        item['name'] = response.url  # <-- see here
        yield item
    list_of_dics = []
    list_of_dics.append(item)
    df = pd.DataFrame(list_of_dics)
    print(df)
    df.to_csv('/Users/user/Desktop/crawled_table.csv', index=False)
However, instead of returning (*):
'link1.com' 'NaN'
'link2.com' 'NAN'
'link3.com' 'extracted3.link.com'
I got:
'link3.com' 'extracted3.link.com'
How can I return (*)?
You can rework this to use Scrapy pipelines:
from scrapy import Spider
# ToyItem comes from your project's items.py

class MySpider(Spider):
    name = 'myspider'
    start_urls = ['link1', 'link2', 'link3']

    def parse(self, response):
        links = response.xpath(".//*[@id='object']//tbody//tr//td//span//a[2]")
        if not links:
            item = ToyItem()
            item['link'] = 'NaN'
            item['name'] = response.url
            yield item
        else:
            for link in links:
                item = ToyItem()
                item['link'] = link.xpath('@href').extract_first()
                item['name'] = response.url  # <-- see here
                yield item
Now in your pipelines.py:
import pandas as pd

class PandasPipeline:
    def open_spider(self, spider):
        self.data = []

    def process_item(self, item, spider):
        self.data.append(item)
        return item

    def close_spider(self, spider):
        df = pd.DataFrame(self.data)
        print('saving dataframe')
        df.to_csv('/Users/user/Desktop/crawled_table.csv', index=False)
and for settings.py:
ITEM_PIPELINES = {
    'myproject.pipelines.PandasPipeline': 900
}
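With the pipeline enabled, run the spider as usual and the CSV is written once the spider closes:

scrapy crawl myspider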

Scrapy prints fields but doesn't populate XML file

I have a problem where the spider prints the fields correctly, but it doesn't populate the XML file with any content.
The output in terminal is this:
[u'Tove'] [u'Jani'] [u'Reminder'] [u"Don't forget me this weekend!"]
However the output site_products.xml results in this (which is wrong, no data):
<?xml version="1.0" encoding="utf-8"?>
<items></items>
spider.py
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import CrawlerItem

class SiteSpider(XMLFeedSpider):
    name = 'site'
    allowed_domains = ['www.w3schools.com']
    start_urls = ['http://www.w3schools.com/xml/note.xml']
    itertag = 'note'

    def parse_node(self, response, selector):
        to = selector.xpath('//to/text()').extract()
        who = selector.xpath('//from/text()').extract()
        heading = selector.xpath('//heading/text()').extract()
        body = selector.xpath('//body/text()').extract()
        return item
pipelines.py
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter

class XmlExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
items.py
import scrapy

class CrawlerItem(scrapy.Item):
    to = scrapy.Field()
    who = scrapy.Field()
    heading = scrapy.Field()
    body = scrapy.Field()
settings.py
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
ITEM_PIPELINES = {'crawler.pipelines.XmlExportPipeline': 300,}
Any help with this would be really appreciated.
You need to instantiate a CrawlerItem instance in your parse_node() method:
def parse_node(self, response, selector):
    item = CrawlerItem()
    item['to'] = selector.xpath('//to/text()').extract()
    item['who'] = selector.xpath('//from/text()').extract()
    item['heading'] = selector.xpath('//heading/text()').extract()
    item['body'] = selector.xpath('//body/text()').extract()
    return item

No output in XML file using XMLITEMEXPORTER

I am a beginner to Python and I am working with Scrapy. I have used XmlItemExporter to export my scraped data to an XML file, but I get only "</item>" in the XML file.
My items.py is as follows:
from scrapy.item import Item, Field

class WorkwithitemsItem(Item):
    title = Field()
    link = Field()
    publish = Field()
    description = Field()
And the spider is like:
from scrapy import log
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from workwithitems.items import WorkwithitemsItem

class MySpider(BaseSpider):
    name = 'spidey'
    allowed_domains = ['ekantipur.com']
    start_urls = [
        'http://www.ekantipur.com/en/rss',
    ]

    def parse(self, response):
        self.log('A response from %s just arrived!' % response.url)
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        link = sel.xpath('//link/text()').extract()
        publish = sel.xpath('//pubDate/text()').extract()
        description = sel.xpath('//description/text()').extract()
        WorkwithitemsItem(title=title[2:], link=link[2:],
                          publish=publish, description=description[1:])
And the pipelines.py is:
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter

class XmlExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
The settings.py is:
BOT_NAME = 'workwithitems'
SPIDER_MODULES = ['workwithitems.spiders']
NEWSPIDER_MODULE = 'workwithitems.spiders'
FEED_EXPORTERS_BASE = {
    'xml': 'scrapy.contrib.exporter.XmlItemExporter',
}
ITEM_PIPELINES = {
    'workwithitems.pipelines.XmlExportPipeline': 800,
}
I can't figure out where my problem is.
OK! I found the problem. What I did was just put a 'return' on the last line in spider.py:
return WorkwithitemsItem(title=title[2:], link=link[2:],
                         publish=publish, description=description[1:])
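Equivalently, the item can be yielded rather than returned, which also works when a callback produces more than one item:

yield WorkwithitemsItem(title=title[2:], link=link[2:],
                        publish=publish, description=description[1:])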

Scrapy Spider Parser call function

How can I call writeXML after my parser finishes crawling data? Currently I can see the crawled data, but I don't see the output file. I tried printing inside writeXML, with no output either.
Below is my code:
import xml.etree.ElementTree as ET
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item

class FriendSpider(BaseSpider):
    # identifies the Spider
    name = "friend"
    count = 0
    allowed_domains = ["example.com.us"]
    start_urls = [
        "http://example.com.us/biz/friendlist/"
    ]

    def start_requests(self):
        for i in range(0, 1722, 40):
            yield self.make_requests_from_url("http://example.com.us/biz/friendlist/?start=%d" % i)

    def parse(self, response):
        response = response.replace(body=response.body.replace('<br />', '\n'))
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul/li')
        items = []
        for site in sites:
            item = Item()
            self.count += 1
            item['id'] = str(self.count)
            item['name'] = site.select('.//div/div/h4/text()').extract()
            item['address'] = site.select('h4/span/text()').extract()
            item['review'] = ''.join(site.select('.//div[@class="review"]/p/text()').extract())
            item['birthdate'] = site.select('.//div/div/h5/text()').extract()
            items.append(item)
        return items

    def writeXML(self, items):
        root = ET.Element("Test")
        for item in items:
            node = ET.SubElement(root, 'item')  # named 'node' so it doesn't shadow the loop variable
            node.set('id', item['id'])
            address = ET.SubElement(node, 'address')
            address.text = item['address']
            user = ET.SubElement(node, 'user')
            user.text = item['user']
            birthdate = ET.SubElement(node, 'birthdate')
            birthdate.text = item['birthdate']
            review = ET.SubElement(node, 'review')
            review.text = item['review']
        # wrap it in an ElementTree instance, and save as XML
        file = open("out.xml", 'w')
        tree = ET.ElementTree(root)
        tree.write(file, xml_declaration=True, encoding='utf-8', method="xml")
To output using the built-in XML exporter, try the following command:
scrapy crawl friend -o items.xml -t xml
If the output isn't to your liking, then you can try creating your own exporter using the XmlItemExporter class as a basis.
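A minimal sketch of such an exporter, using the same scrapy.contrib import path as the rest of this question; FriendXmlExporter is a hypothetical name, and root_element/item_element are real XmlItemExporter options, set here to mirror the element names from writeXML above:

from scrapy.contrib.exporter import XmlItemExporter

class FriendXmlExporter(XmlItemExporter):
    def __init__(self, file, **kwargs):
        kwargs['root_element'] = 'Test'   # <Test> instead of the default <items>
        kwargs['item_element'] = 'item'   # the default, shown for clarity
        super(FriendXmlExporter, self).__init__(file, **kwargs)

Register it under FEED_EXPORTERS in settings.py and run the same scrapy crawl friend -o items.xml -t xml command as before.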

Categories

Resources