I've defined a Crawler class for crawling multiple spiders from a script.
For the spiders, instead of using regular item pipelines, I defined a class, CrawlerPipeline, and used signals to connect its methods.
In CrawlerPipeline, some methods need to use class variables such as __ERRORS.
I can't work out the correct way to implement this. Any suggestions or ideas would be very helpful.
For reference, I'm attaching the code snippets.
from scrapy import signals
from scrapy.crawler import CrawlerProcess

from .pipeline import CrawlerPipeline


class Crawler:
    def __init__(self) -> None:
        self.process = CrawlerProcess(settings={
            'ROBOTSTXT_OBEY': False,
            'REDIRECT_ENABLED': True,
            'SPIDER_MODULES': ['engine.crawler.spiders'],
            'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
            'USER_AGENT': 'Mozilla/5.0 (Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
        })

    def spawn(self, spider: str, **kwargs) -> None:
        self.process.crawl(spider, **kwargs)
        self.__connect_signals(spider)

    def run(self) -> None:
        self.process.start()

    def __connect_signals(self, spider: str) -> None:
        pipe = CrawlerPipeline()
        for crawler in self.process.crawlers:
            _set_signal = crawler.signals.connect
            if spider == 'a':
                _set_signal(pipe.add_meta_urls, signal=signals.spider_opened)
            if spider == 'b':
                ...
            if spider == 'c':
                _set_signal(pipe.add_meta_urls, signal=signals.spider_opened)
            if spider == 'd':
                ...
            # These lines are not working, above two also not working
            _set_signal(pipe.process_item, signal=signals.item_scraped)
            _set_signal(pipe.spider_closed, signal=signals.spider_closed)
            _set_signal(pipe.spider_error, signal=signals.spider_error)
import json
from pathlib import Path
from collections import defaultdict

from api.database import Mongo


class CrawlerPipeline:
    __ITEMS = defaultdict(list)
    __ERRORS = list

    def process_item(self, item, spider):
        self.__ITEMS[spider.name].append(item)
        return item

    def add_meta_urls(self, spider):
        spider.start_urls = ['https://www.example.com']

    def spider_error(self, failure, response, spider):
        self.__ERRORS.append({
            'spider': spider.name,
            'url': response.url,
            'status': response.status,
            'error': failure.getErrorMessage(),
            'traceback': failure.getTraceback(),
        })

    def spider_closed(self, spider, reason):
        print(self.__ERRORS)
        Path("logs").mkdir(parents=True, exist_ok=True)
        ...
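One note on the snippet above, plus a minimal sketch of how the shared state could be held, assuming the goal is simply to accumulate items and errors while the process runs: __ERRORS = list binds the built-in list type itself rather than an empty list, so self.__ERRORS.append(...) would fail. Either __ERRORS = [] as a class attribute or, more conventionally, instance attributes created in __init__ will work; because the same CrawlerPipeline instance is connected to the signals, every handler sees the same containers.

from collections import defaultdict

class CrawlerPipeline:
    def __init__(self):
        # Instance-level containers, created once per pipeline object and
        # shared by every signal handler connected from this instance.
        self.items = defaultdict(list)
        self.errors = []

    def process_item(self, item, spider):
        self.items[spider.name].append(item)
        return item

    def spider_error(self, failure, response, spider):
        self.errors.append({
            'spider': spider.name,
            'url': response.url,
            'error': failure.getErrorMessage(),
        })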
Related
My Scrapy crawler collects data from the ptt website and writes the crawled data into a Google spreadsheet using gspread. My ptt spider parses the latest 40 posts on the ptt website every day, and now I would like to drop duplicate data among these latest 40 posts: for example, if the post_title or post_link is the same as yesterday's, the post should not be written to the Google spreadsheet.
I know I should use DropItem in Scrapy, but I honestly don't know how to fix my code (I am a very new beginner in Python), and would like to ask for help with this one. Thanks.
This is my ptt spider code
# -*- coding: utf-8 -*-
import scrapy
# from scrapy.exceptions import CloseSpider
from myFirstScrapyProject.items import MyfirstscrapyprojectItem


class PttSpider(scrapy.Spider):
    count_page = 1
    name = 'ptt'
    allowed_domains = ['www.ptt.cc/']
    start_urls = ['https://www.ptt.cc/bbs/e-shopping/search?q=%E8%9D%A6%E7%9A%AE']+['https://www.ptt.cc/bbs/e-seller/search?q=%E8%9D%A6%E7%9A%AE']
    # start_urls = ['https://www.ptt.cc/bbs/e-shopping/index.html']

    def parse(self, response):
        items = MyfirstscrapyprojectItem()
        for q in response.css('div.r-ent'):
            items['push'] = q.css('div.nrec > span.h1::text').extract_first()
            items['title'] = q.css('div.title > a::text').extract_first()
            items['href'] = q.css('div.title > a::attr(href)').extract_first()
            items['date'] = q.css('div.meta > div.date ::text').extract_first()
            items['author'] = q.css('div.meta > div.author ::text').extract_first()
            yield(items)
and this is my pipeline
from myFirstScrapyProject.exporters import GoogleSheetItemExporter
from scrapy.exceptions import DropItem


class MyfirstscrapyprojectPipeline(object):
    def open_spider(self, spider):
        self.exporter = GoogleSheetItemExporter()
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Thanks to Sharmiko, I rewrote it, but it doesn't seem to work. What should I fix?
from myFirstScrapyProject.exporters import GoogleSheetItemExporter
from scrapy.exceptions import DropItem


class MyfirstscrapyprojectPipeline(object):
    def open_spider(self, spider):
        self.exporter = GoogleSheetItemExporter()
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()

    # def process_item(self, item, spider):
    #     self.exporter.export_item(item)
    #     return item

    # class DuplicatesTitlePipeline(object):
    def __init__(self):
        self.article = set()

    def process_item(self, item, spider):
        href = item['href']
        if href in self.article:
            raise DropItem('duplicates href found %s', item)
        self.exporter.export_item(item)
        return(item)
This is the code for exporting to the Google sheet
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from scrapy.exporters import BaseItemExporter


class GoogleSheetItemExporter(BaseItemExporter):
    def __init__(self):
        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
        credentials = ServiceAccountCredentials.from_json_keyfile_name('pythonupload.json', scope)
        gc = gspread.authorize(credentials)
        self.spreadsheet = gc.open('Community')
        self.worksheet = self.spreadsheet.get_worksheet(1)

    def export_item(self, item):
        self.worksheet.append_row([item['push'], item['title'],
                                   item['href'], item['date'], item['author']])
You should modify your process_item function to check for duplicate elements, and if one is found, you can just drop it.
from scrapy.exceptions import DropItem
...

def process_item(self, item, spider):
    if [ your duplicate check logic goes here ]:
        raise DropItem('Duplicate element found')
    else:
        self.exporter.export_item(item)
        return item
Dropped items are no longer passed to other pipeline components. You can read more about item pipelines in the Scrapy documentation.
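As a concrete version of that placeholder, here is a minimal sketch of the duplicate check applied to this case, assuming href is the field that identifies a post; the class name is made up for the example. Note that the set has to be updated after each new item (the rewritten pipeline above never adds to self.article), and an in-memory set only deduplicates within one run; comparing against yesterday's posts would require loading the previously exported hrefs from the sheet or a file first.

from scrapy.exceptions import DropItem

class DuplicatesHrefPipeline(object):
    def __init__(self):
        # hrefs already seen during this crawl
        self.seen_hrefs = set()

    def process_item(self, item, spider):
        href = item['href']
        if href in self.seen_hrefs:
            raise DropItem('Duplicate href found: %s' % href)
        self.seen_hrefs.add(href)  # remember it so later copies are dropped
        return item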
I need to pass the URL, username, and password from one class to the Scrapy spider class to perform web scraping.
import quotes as q
import scrapy
from scrapy.crawler import CrawlerProcess


class ValidateURL:
    def checkURL(self, urls):
        try:
            if(urls):
                for key, value in urls.items():
                    if value['login_details']:
                        self.runScrap(value)
        except:
            return False

    def runScrap(self, data):
        if data:
            ''' data = "url_4": {
                    "url": ("https://quotes.toscrape.com/login",),
                    "fields_in_response": ["Quotes to Scrape","Login"],
                    "login_details": {"name":"foobar","pwd":"foobar" },
                    "fields_in_main_page": ["Quotes to Scrape","Top Ten tags"]
                }
            '''
            process = CrawlerProcess()
            process.crawl(q.QuotesSpider, start_urls=data['url'])
            process.start()
And the Scrapy spider class is
# -*- coding: utf-8 -*-
from scrapy import Spider
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
import sys
import logging
from bs4 import BeautifulSoup
# import scrapy
# from scrapy.crawler import CrawlerProcess

logging.basicConfig(filename='app.log', level=logging.INFO)


class QuotesSpider(Spider):
    name = 'quotes'
    start_urls = ('https://quotes.toscrape.com/login',)

    def parse(self, response):
        # print(self.req['url'])
        print('/'*100)
        self.start_urls = self.login_url
        # print(type(self.login_url))
        inputs = response.xpath('//form//input').extract()
        soup_dict = {}
        for key, i in enumerate(inputs):
            soup = BeautifulSoup(i, 'html.parser')
            inp_type = soup.input['type'] if soup.input.has_attr('type') else None
            inp_value = soup.input['value'] if soup.input.has_attr('value') else None
            inp_name = soup.input['name'] if soup.input.has_attr('name') else None
            soup_dict[key] = {'name': inp_name, 'value': inp_value, 'type': inp_type}
        token = response.xpath('//*[@name="csrf_token"]/@value').extract_first()
        return FormRequest.from_response(response,
                                         formdata={'csrf_token': token,
                                                   'password': 'foobar',
                                                   'username': 'foobar'},
                                         callback=self.scrape_pages)

    def fetch_form_data(self, response):
        if all(field in response for field in self.fields_in_response):
            inputs = response.xpath('//form//input').extract()
            soup_dict = {}
            for key, i in enumerate(inputs):
                soup = BeautifulSoup(i, 'html.parser')
                inp_type = soup.input['type'] if soup.input.has_attr('type') else None
                inp_value = soup.input['value'] if soup.input.has_attr('value') else None
                inp_name = soup.input['name'] if soup.input.has_attr('name') else None
                soup_dict[key] = {'name': inp_name, 'value': inp_value, 'type': inp_type}

    def scrape_pages(self, response):
        open_in_browser(response)
        # Complete your code here to scrape the pages that you are redirected to after logging in
        # ....
        # ....
However, I'm not able to update the class variable start_urls with the value passed from the ValidateURL class. I tried overriding __init__ in the QuotesSpider class, but that didn't work. start_urls is actually a class member of the base class (Spider). Could someone please help me understand how to update the class variable of the base class, and suggest what is missing?
You can pass parameters to the spider from the crawl call like this:
process.crawl(q.QuotesSpider, first='James', last='Bond')
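A short sketch of how those parameters reach the spider, for the login case in the question: keyword arguments given to process.crawl() are forwarded to the spider's __init__, and Scrapy's base Spider stores them as instance attributes, so you can either read them directly (self.first, self.last) or unpack them yourself. The parameter names url, username and password below are just illustrative:

from scrapy import Spider
from scrapy.crawler import CrawlerProcess

class QuotesSpider(Spider):
    name = 'quotes'

    def __init__(self, url=None, username=None, password=None, *args, **kwargs):
        super(QuotesSpider, self).__init__(*args, **kwargs)
        # Turn the passed-in values into the attributes the spider uses.
        self.start_urls = [url] if url else []
        self.username = username
        self.password = password

process = CrawlerProcess()
process.crawl(QuotesSpider,
              url='https://quotes.toscrape.com/login',
              username='foobar',
              password='foobar')
process.start()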
my output is as follows
0  winner    loser
1  winner1
2            loser1
3  winner2
4            loser2
5  winner3
6            loser3
How do I remove the empty cells so that the winner and loser values end up on the same row? I've tried to find newline-related parameters for the pipeline but have had no luck. Is there any way to override the pipeline so that it only writes a value to the row when the item has one, so the output lands on the same row?
spider.py
import scrapy
from scrapy_splash import SplashRequest
from scrapejs.items import SofascoreItemLoader
from scrapy import Spider
import scrapy
import json
from scrapy.http import Request, FormRequest


class MySpider(scrapy.Spider):
    name = "jsscraper"
    start_urls = ["https://www.sofascore.com/tennis/2018-02-07"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html',
                                args={'wait': 3.5})

    def parse(self, response):
        for row in response.css('.event-team'):
            il = SofascoreItemLoader(selector=row)
            il.add_css('winner', '.event-team:nth-child(2)::text')
            il.add_css('loser', '.event-team:nth-child(1)::text')
            yield il.load_item()
pipelines.py
from scrapy.exporters import CsvItemExporter


class ScrapejsPipeline(object):
    def process_item(self, item, spider):
        return item


class CsvPipeline(object):
    def __init__(self):
        self.file = open("quotedata2.csv", 'w+b')
        self.exporter = CsvItemExporter(self.file, str)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose,
from operator import methodcaller
from scrapy import Spider, Request, Selector
class SofascoreItem(scrapy.Item):
loser = scrapy.Field()
winner = scrapy.Field()
#date = scrapy.Field()
class SofascoreItemLoader(ItemLoader):
default_item_class = SofascoreItem
default_input_processor = MapCompose(methodcaller('strip'))
default_output_processor = TakeFirst()
Check this one; the problem is pinpointed there: https://stackoverflow.com/a/48859488/9270398
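To spell out the idea behind that link as a sketch: in the parse() above, the loop iterates over .event-team elements while the winner/loser selectors also target .event-team, so each yielded item ends up with only one of the two fields and the CSV gets alternating half-empty rows. Looping over the element that wraps a whole match and filling both fields in one loader yields one item, and therefore one CSV row, per match. The .cell container selector below is a placeholder I have not verified against the page:

def parse(self, response):
    # Assumption: each match is wrapped in a container that holds both
    # ".event-team" nodes; ".cell" is a placeholder selector.
    for match in response.css('.cell'):
        il = SofascoreItemLoader(selector=match)
        il.add_css('winner', '.event-team:nth-child(2)::text')
        il.add_css('loser', '.event-team:nth-child(1)::text')
        yield il.load_item()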
I'm new both to Python and Scrapy, so I'm not sure I've chosen the best method for doing this; my aim is to get two (or more) different pictures from a page and name the pictures differently.
How should I set up the pipeline: should I use one combined pipeline or separate pipelines? For now I've tried separate pipelines but can't make it work. The first picture downloads and is renamed perfectly, but the second one does not download at all (error message below).
I'm practicing on this page: http://www.allabolag.se/2321000016/STOCKHOLMS_LANS_LANDSTING
allabolagspider.py
class allabolagspider(CrawlSpider):
    name = "allabolagspider"
    # allowed_domains = ["byralistan.se"]
    start_urls = [
        "http://www.allabolag.se/2321000016/STOCKHOLMS_LANS_LANDSTING"
    ]
    pipelines = ['AllabolagPipeline', 'AllabolagPipeline2']

    rules = (
        Rule(LinkExtractor(allow="http://www.allabolag.se/2321000016/STOCKHOLMS_LANS_LANDSTING"), callback='parse_link'),
    )

    def parse_link(self, response):
        for sel in response.xpath('//*[@class="reportTable"]'):  # //TODO==king it seems that IMDB has changed the html structure for these information
            image = AllabolagItem()
            tmptitle = response.xpath('''.//tr[2]/td[2]/table//tr[13]/td/span/text()''').extract()
            tmptitle.insert(0, "logo-")
            image['title'] = ["".join(tmptitle)]
            rel = response.xpath('''.//tr[5]/td[2]/div[1]/div/a/img/@src''').extract()
            image['image_urls'] = [urljoin(response.url, rel[0])]
            yield image

        for sel in response.xpath('//*[@class="mainWindow"]'):  # //TODO==king it seems that IMDB has changed the html structure for these information
            image2 = AllabolagItem()
            tmptitle2 = response.xpath('''./div[2]/div[1]/ul/li[6]/a/text()''').extract()
            tmptitle2.insert(0, "hej-")
            image2['title2'] = ["".join(tmptitle2)]
            rel2 = response.xpath('''./div[3]/div[1]/a/img/@src''').extract()
            image2['image_urls2'] = [urljoin(response.url, rel2[0])]
            yield image2
settings.py
BOT_NAME = 'allabolag'
SPIDER_MODULES = ['allabolag.spiders']
NEWSPIDER_MODULE = 'allabolag.spiders'
DOWNLOAD_DELAY = 2.5
CONCURRENT_REQUESTS = 250
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
ITEM_PIPELINES = {
    'allabolag.pipelines.AllabolagPipeline': 1,
    'allabolag.pipelines.AllabolagPipeline2': 2,
}
IMAGES_STORE = 'Imagesfolder'
pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline
import sqlite3 as lite
from allabolag import settings
from allabolag import items

con = None


class AllabolagPipeline(ImagesPipeline):

    def set_filename(self, response):
        return 'full/{0}.jpg'.format(response.meta['title'][0])

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url, meta={'title': item['title']})

    def get_images(self, response, request, info):
        for key, image, buf in super(AllabolagPipeline, self).get_images(response, request, info):
            key = self.set_filename(response)
            yield key, image, buf


class AllabolagPipeline2(ImagesPipeline):

    def set_filename(self, response):
        return 'full/{0}.jpg'.format(response.meta['title2'][0])

    def get_media_requests(self, item, info):
        for image_url2 in item['image_urls2']:
            yield scrapy.Request(image_url2, meta={'title2': item['title2']})

    def get_images(self, response, request, info):
        for key, image, buf in super(AllabolagPipeline2, self).get_images(response, request, info):
            key = self.set_filename2(response)
            yield key, image, buf
Copy-paste from the terminal:
2016-03-08 22:15:58 [scrapy] ERROR: Error processing {'image_urls': [u'http://www.allabolag.se/img/prv/2798135.JPG'],
 'images': [{'checksum': 'a567ec7c2bd99fd7eb20db42229a1bf9',
             'path': 'full/0280bf8228087cd571e86f43859552f9880e558a.jpg',
             'url': 'http://www.allabolag.se/img/prv/2798135.JPG'}],
 'title': [u'logo-UTDELNINGSADRESS']}
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/Twisted-15.5.0-py2.7-macosx-10.6-intel.egg/twisted/internet/defer.py", line 588, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/Scrapy-1.0.3-py2.7.egg/scrapy/pipelines/media.py", line 45, in process_item
    dlist = [self._process_request(r, info) for r in requests]
  File "/Users/VickieB/Documents/Scrapy/Test1/tutorial/tandlakare/allabolag/pipelines.py", line 36, in get_media_requests
    for image_url2 in item['image_urls2']:
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/Scrapy-1.0.3-py2.7.egg/scrapy/item.py", line 56, in __getitem__
    return self._values[key]
KeyError: 'image_urls2'
There might be several bugs I haven't noticed, but I can explain one of them...
A KeyError generally signifies a failed dictionary lookup. In this case it means that, at some point during execution, you're passing an item (a dictionary) to def get_media_requests(self, item, info): that doesn't have the key "image_urls2".
Changing get_media_requests to the following will show you when that happens and should allow the script to keep executing.
def get_media_requests(self, item, info):
    if "image_urls2" not in item:
        print("ERROR - 'image_urls2' NOT IN ITEM/DICT")
    else:
        for image_url2 in item['image_urls2']:
            yield scrapy.Request(image_url2, meta={'title2': item['title2']})
If you're lazy or don't care about a few missing values, you could enclose the whole thing in try/except like so:
def get_media_requests(self, item, info):
    try:
        for image_url2 in item['image_urls2']:
            yield scrapy.Request(image_url2, meta={'title2': item['title2']})
    except Exception as e:
        print(str(e))
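The reason the key is missing in the first place, as far as I can tell from the spider above, is that every item passes through every pipeline listed in ITEM_PIPELINES, so AllabolagPipeline2 also receives the items that only carry image_urls and title. A guarded variant along the same lines (a sketch, not tested against this project) simply skips those items:

def get_media_requests(self, item, info):
    # Only request images for items that actually carry the second URL field;
    # items meant for the first pipeline are skipped silently.
    for image_url2 in item.get('image_urls2', []):
        yield scrapy.Request(image_url2, meta={'title2': item['title2']})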
I am a beginner to Python and I am working with Scrapy. I have used XmlItemExporter to export my scraped data to an XML file, but I only get "</item>" in the XML file.
My items.py is as follows:
from scrapy.item import Item, Field


class WorkwithitemsItem(Item):
    title = Field()
    link = Field()
    publish = Field()
    description = Field()
And the spider is:
from scrapy import log
from scrapy.spider import BaseSpider
from scrapy.selector import Selector

from workwithitems.items import WorkwithitemsItem


class MySpider(BaseSpider):
    name = 'spidey'
    allowed_domains = ['ekantipur.com']
    start_urls = [
        'http://www.ekantipur.com/en/rss',
    ]

    def parse(self, response):
        self.log('A response from %s just arrived!' % response.url)

        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        link = sel.xpath('//link/text()').extract()
        publish = sel.xpath('//pubDate/text()').extract()
        description = sel.xpath('//description/text()').extract()

        WorkwithitemsItem(title=title[2:], link=link[2:],
                          publish=publish, description=description[1:])
And the pipelines.py is:
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter


class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
The settings.py is:
BOT_NAME = 'workwithitems'

SPIDER_MODULES = ['workwithitems.spiders']
NEWSPIDER_MODULE = 'workwithitems.spiders'

FEED_EXPORTERS_BASE = {
    'xml': 'scrapy.contrib.exporter.XmlItemExporter',
}

ITEM_PIPELINES = {
    'workwithitems.pipelines.XmlExportPipeline': 800,
}
I can't figure out where my problem is.
OK! I found the problem. What I did was just put a 'return' on the last line in spider.py:
return WorkwithitemsItem(title=title[2:], link=link[2:],
                         publish=publish, description=description[1:]
                         )
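For what it's worth, here is a small sketch of the same fix written with yield, which is the more common idiom in Scrapy callbacks since a parse method can produce any number of items or requests; the fields are exactly those from the spider above:

def parse(self, response):
    self.log('A response from %s just arrived!' % response.url)
    sel = Selector(response)
    title = sel.xpath('//title/text()').extract()
    link = sel.xpath('//link/text()').extract()
    publish = sel.xpath('//pubDate/text()').extract()
    description = sel.xpath('//description/text()').extract()

    # Yielding hands the item on to the item pipelines (and the XML exporter).
    yield WorkwithitemsItem(title=title[2:], link=link[2:],
                            publish=publish, description=description[1:])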