I am trying to generate a CSV file with the results of my crawler. Because the content is German, it needs to be UTF-8 encoded (ä, ö, etc.). This is what I have so far:
spider.py
import scrapy
from scrapy.spiders import BaseSpider
from scrapy.selector import Selector
from Polizeimeldungen.items import PolizeimeldungenItem

class PoliceSpider(scrapy.Spider):
    name = "pm"
    allowed_domains = ["berlin.de"]
    start_urls = ["https://www.berlin.de/polizei/polizeimeldungen/archiv/2014/?page_at_1_0=1"]

    def parse(self, response):
        for sel in response.css('.row-fluid'):
            item = PolizeimeldungenItem()
            item['title'] = sel.css('a ::text').extract_first().encode('utf-8')
            item['link'] = sel.css('a ::text').extract_first().encode('utf-8')  # this is wrong, but it is easy to fix
            yield item
items.py
import scrapy

class PolizeimeldungenItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
pipelines.py
import csv

class PolizeimeldungenPipeline(object):
    def __init__(self):
        self.myCsv = csv.writer(open('Item.csv', 'wb'))
        self.myCsv.writerow(['title', 'link'])

    def process_item(self, item, spider):
        self.myCsv.writerow([item['title'], item['link']])
        return item
settings.py
BOT_NAME = 'Polizeimeldungen'
SPIDER_MODULES = ['Polizeimeldungen.spiders']
NEWSPIDER_MODULE = 'Polizeimeldungen.spiders'
ITEM_PIPELINES = {'Polizeimeldungen.pipelines.PolizeimeldungenPipeline': 100}
As a result, after running:
scrapy crawl pm
I get this error message:
TypeError: a bytes-like object is required, not 'str'
Thanks for your help!!
UPDATE: Python 3.6.0 :: Anaconda 4.3.1
I assume that you are using Python 3 (this solution won't work with Python 2).
You need to change two things:

1. Open the output file in text mode, with the desired output encoding. In PolizeimeldungenPipeline's constructor, write:

    self.myCsv = csv.writer(open('Item.csv', 'w', encoding='utf-8'))

2. Don't encode the cells (as in PoliceSpider.parse):

    item['title'] = sel.css('a ::text').extract_first()
etc.
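Putting both changes together, here is a minimal sketch of the corrected pipeline. The newline='' argument follows the csv module's recommendation for text-mode files (it also prevents blank alternate rows on Windows), and the close_spider hook is an addition so the file is closed cleanly:

import csv

class PolizeimeldungenPipeline(object):
    def __init__(self):
        # Text mode with an explicit encoding; newline='' lets the csv
        # module handle line endings itself (avoids blank rows on Windows).
        self.file = open('Item.csv', 'w', encoding='utf-8', newline='')
        self.myCsv = csv.writer(self.file)
        self.myCsv.writerow(['title', 'link'])

    def process_item(self, item, spider):
        # Items now carry plain str values, so no .encode() calls are needed.
        self.myCsv.writerow([item['title'], item['link']])
        return item

    def close_spider(self, spider):
        self.file.close()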
Related
I'm brand new to Python, so I apologize if there's a dumb mistake here... I've been scouring the web for days, looking at similar issues and combing through the Scrapy docs, and nothing seems to really resolve this for me...
I have a Scrapy project which successfully scrapes the source website, returns the required items, and then uses an ImagePipeline to download (and then rename accordingly) the images from the returned image links... but only when I run from the terminal with "runspider".
Whenever I use "crawl" from the terminal, or CrawlerProcess to run the spider from within the script, it returns the items but does not download the images and, I assume, completely misses the ImagesPipeline.
I read that I needed to import my settings when running this way in order to properly load the pipeline, which makes sense after looking into the differences between "crawl" and "runspider", but I still cannot get the pipeline working.
There are no error messages, but I notice that it does return "[scrapy.middleware] INFO: Enabled item pipelines: []", which I assume shows that it is still missing my pipeline?
Here's my spider.py:
import scrapy
from scrapy2.items import Scrapy2Item
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class spider1(scrapy.Spider):
    name = "spider1"
    domain = "https://www.amazon.ca/s?k=821826022317"

    def start_requests(self):
        yield scrapy.Request(url=spider1.domain, callback=self.parse)

    def parse(self, response):
        items = Scrapy2Item()
        titlevar = response.css('span.a-text-normal ::text').extract_first()
        imgvar = [response.css('img ::attr(src)').extract_first()]
        skuvar = response.xpath('//meta[@name="keywords"]/@content')[0].extract()
        items['title'] = titlevar
        items['image_urls'] = imgvar
        items['sku'] = skuvar
        yield items

process = CrawlerProcess(get_project_settings())
process.crawl(spider1)
process.start()
Here is my items.py:
import scrapy

class Scrapy2Item(scrapy.Item):
    title = scrapy.Field()
    image_urls = scrapy.Field()
    sku = scrapy.Field()
Here is my pipelines.py:
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class Scrapy2Pipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        return [scrapy.Request(x, meta={'image_name': item['sku']})
                for x in item.get('image_urls', [])]

    def file_path(self, request, response=None, info=None):
        return '%s.jpg' % request.meta['image_name']
Here is my settings.py:
BOT_NAME = 'scrapy2'
SPIDER_MODULES = ['scrapy2.spiders']
NEWSPIDER_MODULE = 'scrapy2.spiders'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
    'scrapy2.pipelines.Scrapy2Pipeline': 1,
}
IMAGES_STORE = 'images'
Thank you to anybody that looks at this or even attempts to help me out. It's greatly appreciated.
Since you are running your spider as a script, there is no Scrapy project environment, and get_project_settings won't work (aside from grabbing the default settings).
The script must be self-contained, i.e. contain everything you need to run your spider (or import it from your python search path, like any regular old python code).
I've reformatted that code for you so that it runs when you execute it with the plain Python interpreter: python3 script.py.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import scrapy
from scrapy.pipelines.images import ImagesPipeline

BOT_NAME = 'scrapy2'
ROBOTSTXT_OBEY = True
IMAGES_STORE = 'images'

class Scrapy2Item(scrapy.Item):
    title = scrapy.Field()
    image_urls = scrapy.Field()
    sku = scrapy.Field()

class Scrapy2Pipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        return [scrapy.Request(x, meta={'image_name': item['sku']})
                for x in item.get('image_urls', [])]

    def file_path(self, request, response=None, info=None):
        return '%s.jpg' % request.meta['image_name']

class spider1(scrapy.Spider):
    name = "spider1"
    domain = "https://www.amazon.ca/s?k=821826022317"

    def start_requests(self):
        yield scrapy.Request(url=spider1.domain, callback=self.parse)

    def parse(self, response):
        items = Scrapy2Item()
        titlevar = response.css('span.a-text-normal ::text').extract_first()
        imgvar = [response.css('img ::attr(src)').extract_first()]
        skuvar = response.xpath('//meta[@name="keywords"]/@content')[0].extract()
        items['title'] = titlevar
        items['image_urls'] = imgvar
        items['sku'] = skuvar
        yield items

if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess
    from scrapy.settings import Settings

    settings = Settings(values={
        'BOT_NAME': BOT_NAME,
        'ROBOTSTXT_OBEY': ROBOTSTXT_OBEY,
        'ITEM_PIPELINES': {
            '__main__.Scrapy2Pipeline': 1,
        },
        'IMAGES_STORE': IMAGES_STORE,
        'TELNETCONSOLE_ENABLED': False,
    })

    process = CrawlerProcess(settings=settings)
    process.crawl(spider1)
    process.start()
I used Scrapy to crawl comment data from the website http://club.jd.com/comment/productPageComments.action?callback=&productId=1892018&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0, whose language is Chinese. But I just got output like this.
And the CSV file's output is all messed up.
I don't know what happened. First I thought it was a JSON decode or encode problem, then I tried the approaches I found on the internet, but I got the same result. Here's my code:
#!/usr/bin/env python
# encoding: utf-8
import scrapy
from scrapy import Request
from scrapy.selector import Selector
from jd_comment.items import JdCommentItem
import json

class JdSpider(scrapy.Spider):
    name = 'comment'

    def start_requests(self):
        url = 'http://club.jd.com/comment/productPageComments.action?callback=&productId=1892018&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        jsonresponse = json.loads(response.body_as_unicode())
        items = []
        for comment in jsonresponse['comments']:
            item = JdCommentItem()
            item['username'] = comment['nickname']
            item['user_ID'] = comment['id']
            item['time'] = comment['referenceTime']
            item['good_ID'] = comment['referenceId']
            item['good_name'] = comment['referenceName']
            item['content'] = comment['content']
            item['score'] = comment['score']
            items.append(item)
            yield item
Any hint would be highly appreciated. Thanks.
I wrote code in Scrapy to scrape coffee shops from YellowPages. The total number of records is around 870, but I'm getting around 1200, with a number of duplicates. Moreover, in the CSV output the data end up in every alternate row. I'm hoping someone can provide me with a workaround. Thanks in advance.
The folder is named "yellpg", and "items.py" contains:
from scrapy.item import Item, Field

class YellpgItem(Item):
    name = Field()
    address = Field()
    phone = Field()
The spider is named "yellsp.py" and contains:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from yellpg.items import YellpgItem

class YellspSpider(CrawlSpider):
    name = "yellsp"
    allowed_domains = ["yellowpages.com"]
    start_urls = (
        'https://www.yellowpages.com/search?search_terms=Coffee%20Shops&geo_location_terms=Los%20Angeles%2C%20CA&page=1',
    )
    rules = (Rule(LinkExtractor(allow=('\&page=.*',)), callback='parse_item', follow=True),)

    def parse_item(self, response):
        page = response.xpath('//div[@class="info"]')
        for titles in page:
            item = YellpgItem()
            item["name"] = titles.xpath('.//span[@itemprop="name"]/text()').extract()
            item["address"] = titles.xpath('.//span[@itemprop="streetAddress" and @class="street-address"]/text()').extract()
            item["phone"] = titles.xpath('.//div[@itemprop="telephone" and @class="phones phone primary"]/text()').extract()
            yield item
To get the CSV output, the command line I'm using is:
scrapy crawl yellsp -o items.csv
I could recommend creating a pipeline that stores items so you can later check whether new ones are duplicates, but that isn't a real solution here, as it could create memory problems.
The real solution here is that you should avoid "storing" duplicates in your final database.
Define which field of your item is going to behave as the index in your database, and everything should work.
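For illustration, here is a minimal sketch of such a deduplication pipeline (using the phone field as the key is an assumption, and, as noted above, keeping every seen key in memory can become a problem on large crawls):

from scrapy.exceptions import DropItem

class DedupPipeline(object):
    def __init__(self):
        # Keys of all items seen so far; this set grows with the crawl.
        self.seen = set()

    def process_item(self, item, spider):
        # Assumption: the phone number uniquely identifies a coffee shop.
        key = tuple(item.get('phone', []))
        if key in self.seen:
            raise DropItem('Duplicate item: %r' % key)
        self.seen.add(key)
        return item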
The best way would be to use CsvItemExporter in your pipeline. Create a file named pipelines.py inside your Scrapy project and add the code lines below.
from scrapy import signals
from scrapy.exporters import CsvItemExporter

class CSVExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_coffer_shops.csv' % spider.name, 'w+b')  # hard-coded filename, not a good idea
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Now add these lines in settings.py:

ITEM_PIPELINES = {
    'your_project_name.pipelines.CSVExportPipeline': 300
}
This custom pipeline will export your data in CSV style. If you are not getting the data as expected, you can modify the process_item method to suit your needs.
I'm very new to Scrapy, so it's hard for me to find out what I am doing wrong when I get no results in the CSV file. I can see results in the console, though. Here is what I tried:
The main folder is named "realyp".
The spider file is named "yp.py" and contains the code:
from scrapy.selector import Selector
from scrapy.spider import BaseSpider
from realyp.items import RealypItem

class MySpider(BaseSpider):
    name = "YellowPage"
    allowed_domains = ["yellowpages.com"]
    start_urls = ["https://www.yellowpages.com/search?search_terms=Coffee%20Shops&geo_location_terms=Los%20Angeles%2C%20CA&page=2"]

    def parse(self, response):
        title = Selector(response)
        page = title.xpath('//div[@class="info"]')
        items = []
        for titles in page:
            item = RealypItem()
            item["name"] = titles.xpath('.//span[@itemprop="name"]/text()').extract()
            item["address"] = titles.xpath('.//span[@itemprop="streetAddress" and @class="street-address"]/text()').extract()
            item["phone"] = titles.xpath('.//div[@itemprop="telephone" and @class="phones phone primary"]/text()').extract()
            items.append(item)
        return items
"items.py" file includes:
from scrapy.item import Item, Field

class RealypItem(Item):
    name = Field()
    address = Field()
    phone = Field()
To get the CSV output, my command line is:
cd desktop
cd realyp
scrapy crawl YellowPage -o items.csv -t csv
Any help will be greatly appreciated.
As stated by @Granitosauros, you should use yield instead of return. The yield should be inside the for loop.
Inside the for loop, if an XPath starts with //, it selects all matching elements in the whole document, not just the descendants of the current node (see here).
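A quick illustration of that pitfall (the selectors here are just for the example):

for titles in response.xpath('//div[@class="info"]'):
    # Absolute path: ignores `titles` and matches every span on the page.
    all_spans = titles.xpath('//span/text()').extract()
    # Relative path: matches only spans inside the current `titles` node.
    own_spans = titles.xpath('.//span/text()').extract()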
Here's (rough) code that works for me:
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
from scrapy.spider import BaseSpider
from realyp.items import RealypItem

class MySpider(BaseSpider):
    name = "YellowPage"
    allowed_domains = ["yellowpages.com"]
    start_urls = ["https://www.yellowpages.com/search?search_terms=Coffee%20Shops&geo_location_terms=Los%20Angeles%2C%20CA&page=2"]

    def parse(self, response):
        for titles in response.xpath('//div[@class = "result"]/div'):
            item = RealypItem()
            item["name"] = titles.xpath('div[2]/div[2]/h2/a/span[@itemprop="name"]/text()').extract()
            item["address"] = titles.xpath('string(div[2]/div[2]/div/p[@itemprop="address"])').extract()
            item["phone"] = titles.xpath('div[2]/div[2]/div/div[@itemprop="telephone" and @class="phones phone primary"]/text()').extract()
            yield item
I am pretty new to Scrapy. I am looking into using it to crawl an entire website for links, and I would like to output the items into multiple JSON files, so I could then upload them to Amazon Cloud Search for indexing. Is it possible to split the items into multiple files instead of having just one giant file in the end? From what I've read, the Item Exporters can only output to one file per spider, but I am only using one CrawlSpider for this task. It would be nice if I could set a limit to the number of items included in each file, like 500 or 1000.
Here is the code I have set up so far (based on the Dmoz.org example used in the tutorial):
dmoz_spider.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from tutorial.items import DmozItem

class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/",
    ]
    rules = [Rule(LinkExtractor(), callback='parse_item', follow=True)]

    def parse_item(self, response):
        for sel in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()
            yield item
items.py
import scrapy

class DmozItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()
Thanks for the help.
I don't think built-in feed exporters support writing into multiple files.
One option would be to export into a single file in jsonlines format: basically, one JSON object per line, which is convenient to pipe and split.
Then, separately, after the crawling is done, you can read the file in the desired chunks and write into separate JSON files.
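For example, a minimal post-processing sketch along those lines (the file names and the chunk size of 500 are assumptions):

import json

CHUNK_SIZE = 500  # items per output file; adjust as needed

with open('items.jl', encoding='utf-8') as f:
    chunk, n = [], 0
    for line in f:
        if not line.strip():
            continue  # skip blank lines
        chunk.append(json.loads(line))
        if len(chunk) == CHUNK_SIZE:
            with open('items_%03d.json' % n, 'w', encoding='utf-8') as out:
                json.dump(chunk, out, ensure_ascii=False)
            chunk, n = [], n + 1
    if chunk:  # write the remainder
        with open('items_%03d.json' % n, 'w', encoding='utf-8') as out:
            json.dump(chunk, out, ensure_ascii=False)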
Regarding "So I could then upload them to Amazon Cloud Search for indexing": note that there is a direct Amazon S3 feed exporter (not sure it helps, just FYI).
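If that route helps, a sketch of the relevant settings (the bucket name is a placeholder, and Scrapy's S3 feed storage additionally requires botocore or boto to be installed):

# settings.py
FEED_URI = 's3://my-bucket/items.jl'  # 'my-bucket' is hypothetical
FEED_FORMAT = 'jsonlines'
AWS_ACCESS_KEY_ID = '...'       # your AWS credentials
AWS_SECRET_ACCESS_KEY = '...'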
You can add a name to each item and use a custom pipeline to output to different JSON files, like so:
from scrapy.exporters import JsonItemExporter
from scrapy import signals

class MultiOutputExporter(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.items = ['item1', 'item2']
        self.files = {}
        self.exporters = {}
        for item in self.items:
            self.files[item] = open(f'{item}.json', 'w+b')
            self.exporters[item] = JsonItemExporter(self.files[item])
            self.exporters[item].start_exporting()

    def spider_closed(self, spider):
        for item in self.items:
            self.exporters[item].finish_exporting()
            self.files[item].close()

    def process_item(self, item, spider):
        # Route each item to the exporter matching its class-level name.
        self.exporters[item.name].export_item(item)
        return item
Then add names to your items as follows:
class Item(scrapy.Item):
    name = 'item1'
Now enable the pipeline in settings.py, and voilà.
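For example, a sketch of that setting (the module path is an assumption based on the usual project layout):

ITEM_PIPELINES = {
    'your_project_name.pipelines.MultiOutputExporter': 300,  # path is hypothetical
}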