Total noob just getting started with scrapy.
In my directory structure I have like this...
#FYI: running on Scrapy 2.4.1
WebScraper/
Webscraper/
spiders/
spider.py # (NOTE: contains spider1 and spider2 classes.)
items.py
middlewares.py
pipelines.py # (NOTE: contains spider1Pipeline and spider2Pipeline)
settings.py # (NOTE: I wrote here:
#ITEM_PIPELINES = {
# 'WebScraper.pipelines.spider1_pipelines': 300,
# 'WebScraper.pipelines.spider2_pipelines': 300,
#}
scrapy.cfg
And spider2.py resembles...
class OneSpider(scrapy.Spider):
name = "spider1"
def start_requests(self):
urls = ["url1.com",]
yield scrapy.Request(
url="http://url1.com",
callback=self.parse
)
def parse(self,response):
## Scrape stuff, put it in a dict
yield dictOfScrapedStuff
class TwoSpider(scrapy.Spider):
name = "spider2"
def start_requests(self):
urls = ["url2.com",]
yield scrapy.Request(
url="http://url2.com",
callback=self.parse
)
def parse(self,response):
## Scrape stuff, put it in a dict
yield dictOfScrapedStuff
With pipelines.py looking like...
class spider1_pipelines(object):
def __init__(self):
self.csvwriter = csv.writer(open('spider1.csv', 'w', newline=''))
self.csvwriter.writerow(['header1', 'header2'])
def process_item(self, item, spider):
row = []
row.append(item['header1'])
row.append(item['header2'])
self.csvwrite.writerow(row)
class spider2_pipelines(object):
def __init__(self):
self.csvwriter = csv.writer(open('spider2.csv', 'w', newline=''))
self.csvwriter.writerow(['header_a', 'header_b'])
def process_item(self, item, spider):
row = []
row.append(item['header_a']) #NOTE: this is not the same as header1
row.append(item['header_b']) #NOTE: this is not the same as header2
self.csvwrite.writerow(row)
I have a question about running spider1 and spider2 on different urls with one terminal command:
nohup scrapy crawl spider1 -o spider1_output.csv --logfile spider1.log & scrapy crawl spider2 -o spider2_output.csv --logfile spider2.log
Note: this is an extension of a previous question specific to this stack overflow post (2018).
Desired result: spider1.csv with data from spider1, spider2.csv with data from spider2.
Current result: spider1.csv with data from spider1, spider2.csv BREAKS but error log contains spider2 data, and that there was a keyerror ['header1'], even though the item for spider2 does not include header1, it only includes header_a.
Does anyone know how to run one spider after the other on different urls, and plug data fetched by spider1, spider2, etc. into pipelines specific to that spider, as in spider1 -> spider1Pipeline -> spider1.csv, spider2 -> spider2Pipelines -> spider2.csv.
Or perhaps this is a matter of specifying the spider1_item and spider2_item from items.py? I wonder if I can specify where to insert spider2's data that way.
Thank you!
You can implement this using custom_settings spider attribute to set settings individually per spider
#spider2.py
class OneSpider(scrapy.Spider):
name = "spider1"
custom_settings = {
'ITEM_PIPELINES': {'WebScraper.pipelines.spider1_pipelines': 300}
...
class TwoSpider(scrapy.Spider):
name = "spider2"
custom_settings = {
'ITEM_PIPELINES': {'WebScraper.pipelines.spider2_pipelines': 300}
...
Related
I'm new to webscraping/scrapy and python
Scrapy version: Scrapy 2.5.1
OS: windows
IDE: pycharm
I am trying to use FEEDS option in scrapy to automatically export the scrapped data from a website to download into excel
Tried following solution but didn't work stackoverflow solution not sure what i'm doing wrong here am i missing something?
i also tried to add the same in my settings.py file after commenting custom_settings in my spider class as per example provided in documentation: https://docs.scrapy.org/en/latest/topics/feed-exports.html?highlight=feed#feeds
for now i achieved my requirement using spider_closed (signal) to write data to CSV by storing all the scraped items data in a array called result
class SpiderFC(scrapy.Spider):
name = "FC"
start_urls = [
url,
]
custom_setting = {"FEEDS": {r"C:\Users\rreddy\PycharmProjects\fcdc\webscrp\outputfinal.csv": {"format": "csv", "overwrite": True}}}
#classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(SpiderFC, cls).from_crawler(crawler, *args, **kwargs)
crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
return spider
def __init__(self, name=None):
super().__init__(name)
self.count = None
def parse(self, response, **kwargs):
# each item scrapped from parent page has links where the actual data need to be scrapped so i follow each link and scrape data
yield response.follow(notice_href_follow, callback=self.parse_item,
meta={'item': item, 'index': index, 'next_page': next_page})
def parse_item(self, response):
# logic for items to scrape goes here
# they are saved to temp list and appended to result array and then temp list is cleared
result.append(it) # result data is used at the end to write to csv
item.clear()
if next_page:
yield next(self.follow_next(response, next_page))
def follow_next(self, response, next_page):
next_page_url = urljoin(url, next_page[0])
yield response.follow(next_page_url, callback=self.parse)
spider closed signal
def spider_closed(self, spider):
with open(output_path, mode="a", newline='') as f:
writer = csv.writer(f)
for v in result:
writer.writerow([v["city"]])
when all data is scraped and all requests are completed spider_closed signal will write the data to a csv but i'm trying to avoid this logic or code and use inbuilt exporter from scrapy but I'm having trouble in exporting the data
Check your path. If you are on windows then provide the full path in the custom_settings e.g. as below
custom_settings = {
"FEEDS":{r"C:\Users\Name\Path\To\outputfinal.csv" : {"format" : "csv", "overwrite":True}}
}
If you are on linux or MAC then provide the path as below:
custom_settings = {
"FEEDS":{r"/Path/to/folder/fcdc/webscrp/outputfinal.csv" : {"format" : "csv", "overwrite":True}}
}
Alternatively provide the relative path as below which will create a folder structure of fcdc>>webscrp>>outputfinal.csv in the directory from which the spider is run from.
custom_settings = {
"FEEDS":{r"./fcdc/webscrp/outputfinal.csv" : {"format" : "csv", "overwrite":True}}
}
I'm brand new to Python so I apologize if there's a dumb mistake here...I've been scouring the web for days, looking at similar issues and combing through Scrapy docs and nothing seems to really resolve this for me...
I have a Scrapy project which successfully scrapes the source website, returns the required items, and then uses an ImagePipeline to download (and then rename accordingly) the images from the returned image links... but only when I run from the terminal with "runspider".
Whenever I use "crawl" from the terminal or CrawlProcess to run the spider from within the script, it returns the items but does not download the images and, I assume, completely misses the ImagePipeline.
I read that I needed to import my settings when running this way in order to properly load the pipeline, which makes sense after looking into the differences between "crawl" and "runspider" but I still cannot get the pipeline working.
There are no error messages but I notice that it does return "[scrapy.middleware] INFO: Enabled item pipelines: []" ... Which I assumed was showing that it is still missing my pipeline?
Here's my spider.py:
import scrapy
from scrapy2.items import Scrapy2Item
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
class spider1(scrapy.Spider):
name = "spider1"
domain = "https://www.amazon.ca/s?k=821826022317"
def start_requests(self):
yield scrapy.Request(url=spider1.domain ,callback = self.parse)
def parse(self, response):
items = Scrapy2Item()
titlevar = response.css('span.a-text-normal ::text').extract_first()
imgvar = [response.css('img ::attr(src)').extract_first()]
skuvar = response.xpath('//meta[#name="keywords"]/#content')[0].extract()
items['title'] = titlevar
items['image_urls'] = imgvar
items['sku'] = skuvar
yield items
process = CrawlerProcess(get_project_settings())
process.crawl(spider1)
process.start()
Here is my items.py:
import scrapy
class Scrapy2Item(scrapy.Item):
title = scrapy.Field()
image_urls = scrapy.Field()
sku = scrapy.Field()
Here is my pipelines.py:
import scrapy
from scrapy.pipelines.images import ImagesPipeline
class Scrapy2Pipeline(ImagesPipeline):
def get_media_requests(self, item, info):
return [scrapy.Request(x, meta={'image_name': item['sku']})
for x in item.get('image_urls', [])]
def file_path(self, request, response=None, info=None):
return '%s.jpg' % request.meta['image_name']
Here is my settings.py:
BOT_NAME = 'scrapy2'
SPIDER_MODULES = ['scrapy2.spiders']
NEWSPIDER_MODULE = 'scrapy2.spiders'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
'scrapy2.pipelines.Scrapy2Pipeline': 1,
}
IMAGES_STORE = 'images'
Thank you to anybody that looks at this or even attempts to help me out. It's greatly appreciated.
Since you are running your spider as a script, there is no scrapy project environment, get_project_settings won't work (aside from grabbing the default settings).
The script must be self-contained, i.e. contain everything you need to run your spider (or import it from your python search path, like any regular old python code).
I've reformatted that code for you, so that it runs, when you execute it with the plain python interpreter: python3 script.py.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import scrapy
from scrapy.pipelines.images import ImagesPipeline
BOT_NAME = 'scrapy2'
ROBOTSTXT_OBEY = True
IMAGES_STORE = 'images'
class Scrapy2Item(scrapy.Item):
title = scrapy.Field()
image_urls = scrapy.Field()
sku = scrapy.Field()
class Scrapy2Pipeline(ImagesPipeline):
def get_media_requests(self, item, info):
return [scrapy.Request(x, meta={'image_name': item['sku']})
for x in item.get('image_urls', [])]
def file_path(self, request, response=None, info=None):
return '%s.jpg' % request.meta['image_name']
class spider1(scrapy.Spider):
name = "spider1"
domain = "https://www.amazon.ca/s?k=821826022317"
def start_requests(self):
yield scrapy.Request(url=spider1.domain ,callback = self.parse)
def parse(self, response):
items = Scrapy2Item()
titlevar = response.css('span.a-text-normal ::text').extract_first()
imgvar = [response.css('img ::attr(src)').extract_first()]
skuvar = response.xpath('//meta[#name="keywords"]/#content')[0].extract()
items['title'] = titlevar
items['image_urls'] = imgvar
items['sku'] = skuvar
yield items
if __name__ == "__main__":
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
settings = Settings(values={
'BOT_NAME': BOT_NAME,
'ROBOTSTXT_OBEY': ROBOTSTXT_OBEY,
'ITEM_PIPELINES': {
'__main__.Scrapy2Pipeline': 1,
},
'IMAGES_STORE': IMAGES_STORE,
'TELNETCONSOLE_ENABLED': False,
})
process = CrawlerProcess(settings=settings)
process.crawl(spider1)
process.start()
I wrote a code in scrapy to scrape coffee shops from yellowpage. The total data is around 870 but I'm getting around 1200 with a minimum number of duplicates. Moreover, in the csv output the data are getting placed in every alternate row. Expecting someone to provide me with a workaround. Thanks in advance.
Folder Name "yellpg" and "items.py" contains
from scrapy.item import Item, Field
class YellpgItem(Item):
name= Field()
address = Field()
phone= Field()
Spider Name "yellsp.py" which contains:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from yellpg.items import YellpgItem
class YellspSpider(CrawlSpider):
name = "yellsp"
allowed_domains = ["yellowpages.com"]
start_urls = (
'https://www.yellowpages.com/search?search_terms=Coffee%20Shops&geo_location_terms=Los%20Angeles%2C%20CA&page=1',
)
rules = (Rule(LinkExtractor(allow=('\&page=.*',)),callback='parse_item',follow=True),)
def parse_item(self, response):
page=response.xpath('//div[#class="info"]')
for titles in page:
item = YellpgItem()
item["name"] = titles.xpath('.//span[#itemprop="name"]/text()').extract()
item["address"] = titles.xpath('.//span[#itemprop="streetAddress" and #class="street-address"]/text()').extract()
item["phone"] = titles.xpath('.//div[#itemprop="telephone" and #class="phones phone primary"]/text()').extract()
yield item
To get the CSV output, the command line I'm using:
scrapy crawl yellsp -o items.csv
I could recommend creating a pipeline that stores items to later check if the new ones are duplicates, but that isn't a real solution here, as it could create memory problems.
The real solution here is that you should avoid "storing" duplicates in your final database.
Define what field of your item is going to behave as the index in your database and everything should be working.
The best way would be to use CSVItemExporter in your pipeline. Create a file named pipeline.py inside your scrapy project and add below code lines.
from scrapy import signals
from scrapy.exporters import CsvItemExporter
class CSVExportPipeline(object):
def __init__(self):
self.files = {}
#classmethod
def from_crawler(cls, crawler):
pipeline = cls()
crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
return pipeline
def spider_opened(self, spider):
file = open('%s_coffer_shops.csv' % spider.name, 'w+b') # hard coded filename, not a good idea
self.files[spider] = file
self.exporter = CsvItemExporter(file)
self.exporter.start_exporting()
def spider_closed(self, spider):
self.exporter.finish_exporting()
file = self.files.pop(spider)
file.close()
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
Now add these lines in setting.py
ITEM_PIPELINES = {
'your_project_name.pipelines.CSVExportPipeline': 300
}
This custom CSVItemExporter will export your data in CSV styles. If you are not getting the data as expected you can modify process_item method to suits your need.
I have written two spiders in single file. When I ran scrapy runspider two_spiders.py, only the first Spider was executed. How can I run both of them without splitting the file into two files.
two_spiders.py:
import scrapy
class MySpider1(scrapy.Spider):
# first spider definition
...
class MySpider2(scrapy.Spider):
# second spider definition
...
Let's read the documentation:
Running multiple spiders in the same process
By default, Scrapy runs a
single spider per process when you run scrapy crawl. However, Scrapy
supports running multiple spiders per process using the internal API.
Here is an example that runs multiple spiders simultaneously:
import scrapy
from scrapy.crawler import CrawlerProcess
class MySpider1(scrapy.Spider):
# Your first spider definition
...
class MySpider2(scrapy.Spider):
# Your second spider definition
...
process = CrawlerProcess()
process.crawl(MySpider1)
process.crawl(MySpider2)
process.start() # the script will block here until all crawling jobs are finished
(there are few more examples in the documentation)
From your question it is not clear how have you put two spiders into one file. It was not enough to concatenate content of two files with single spiders.
Try to do what is written in the documentation. Or at least show us your code. Without it we can't help you.
Here is a full Scrapy project with 2 spiders in one file.
# quote_spiders.py
import json
import string
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Item, Field
class TextCleaningPipeline(object):
def _clean_text(self, text):
text = text.replace('“', '').replace('”', '')
table = str.maketrans({key: None for key in string.punctuation})
clean_text = text.translate(table)
return clean_text.lower()
def process_item(self, item, spider):
item['text'] = self._clean_text(item['text'])
return item
class JsonWriterPipeline(object):
def open_spider(self, spider):
self.file = open(spider.settings['JSON_FILE'], 'a')
def close_spider(self, spider):
self.file.close()
def process_item(self, item, spider):
line = json.dumps(dict(item)) + "\n"
self.file.write(line)
return item
class QuoteItem(Item):
text = Field()
author = Field()
tags = Field()
spider = Field()
class QuotesSpiderOne(scrapy.Spider):
name = "quotes1"
def start_requests(self):
urls = ['http://quotes.toscrape.com/page/1/', ]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for quote in response.css('div.quote'):
item = QuoteItem()
item['text'] = quote.css('span.text::text').get()
item['author'] = quote.css('small.author::text').get()
item['tags'] = quote.css('div.tags a.tag::text').getall()
item['spider'] = self.name
yield item
class QuotesSpiderTwo(scrapy.Spider):
name = "quotes2"
def start_requests(self):
urls = ['http://quotes.toscrape.com/page/2/', ]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for quote in response.css('div.quote'):
item = QuoteItem()
item['text'] = quote.css('span.text::text').get()
item['author'] = quote.css('small.author::text').get()
item['tags'] = quote.css('div.tags a.tag::text').getall()
item['spider'] = self.name
yield item
if __name__ == '__main__':
settings = dict()
settings['USER_AGENT'] = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
settings['HTTPCACHE_ENABLED'] = True
settings['JSON_FILE'] = 'items.jl'
settings['ITEM_PIPELINES'] = dict()
settings['ITEM_PIPELINES']['__main__.TextCleaningPipeline'] = 800
settings['ITEM_PIPELINES']['__main__.JsonWriterPipeline'] = 801
process = CrawlerProcess(settings=settings)
process.crawl(QuotesSpiderOne)
process.crawl(QuotesSpiderTwo)
process.start()
Install Scrapy and run the script
$ pip install Scrapy
$ python quote_spiders.py
No other file is needed.
This example coupled with graphical debugger of pycharm/vscode can help understand scrapy workflow and make debugging easier.
I am pretty new to Scrapy. I am looking into using it to crawl an entire website for links, in which I would output the items into multiple JSON files. So I could then upload them to Amazon Cloud Search for indexing. Is it possible to split the items into multiple files instead of having just one giant file in the end? From what I've read, the Item Exporters can only output to one file per spider. But I am only using one CrawlSpider for this task. It would be nice if I could set a limit to the number of items included in each file, like 500 or 1000.
Here is the code I have set up so far (based off the Dmoz.org used in the tutorial):
dmoz_spider.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from tutorial.items import DmozItem
class DmozSpider(CrawlSpider):
name = "dmoz"
allowed_domains = ["dmoz.org"]
start_urls = [
"http://www.dmoz.org/",
]
rules = [Rule(LinkExtractor(), callback='parse_item', follow=True)]
def parse_item(self, response):
for sel in response.xpath('//ul/li'):
item = DmozItem()
item['title'] = sel.xpath('a/text()').extract()
item['link'] = sel.xpath('a/#href').extract()
item['desc'] = sel.xpath('text()').extract()
yield item
items.py
import scrapy
class DmozItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
desc = scrapy.Field()
Thanks for the help.
I don't think built-in feed exporters support writing into multiple files.
One option would be to export into a single file in jsonlines format basically, one JSON object per line which is convenient to pipe and split.
Then, separately, after the crawling is done, you can read the file in the desired chunks and write into separate JSON files.
So I could then upload them to Amazon Cloud Search for indexing.
Note that there is a direct Amazon S3 exporter (not sure it helps, just FYI).
You can add a name to each item and use a custom pipeline to output to different json files. like so:
from scrapy.exporters import JsonItemExporter
from scrapy import signals
class MultiOutputExporter(object):
#classmethod
def from_crawler(cls, crawler):
pipeline = cls()
crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
return pipeline
def spider_opened(self, spider):
self.items = ['item1','item2']
self.files = {}
self.exporters = {}
for item in self.items:
self.files[item] = open(f'{item}.json', 'w+b')
self.exporters[item] = JsonItemExporter(self.files[item])
self.exporters[item].start_exporting()
def spider_closed(self, spider):
for item in self.items:
self.exporters[item].finish_exporting()
self.files[item].close()
def process_item(self, item, spider):
self.exporters[item.name].export_item()
return item
Then add names to your items as follows:
class Item(scrapy.Item):
name = 'item1'
Now enable the pipeline in scrapy.setting and voila.