I've created a custom middleware class to handle refused HTTP requests.
The middleware is attached to a specific spider, and the logs show that it is attached successfully and that the original one is detached.
Debugging this, I can see that a breakpoint on the process_response function in the original retry middleware is hit, while the one in my custom middleware never stops.
Any ideas?
EDIT:
to reproduce:
spider:
import time

import scrapy
from scrapy.crawler import CrawlerProcess


class UsptoPatentHistorySpider(scrapy.Spider):
    name = 'quotes'
    handle_httpstatus_list = [401, 429]
    start_urls = [
        'https://patentcenter.uspto.gov/retrieval/public/v1/applications/sdwp/external/metadata/7850271',
    ]
    custom_settings = {'RETRY_HTTP_CODES': [429, 401],
                       'SPIDER_MIDDLEWARES': {
                           'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
                           'chalicelib.scrapy_my.scrapy_my.middlewares.TooManyRequestsRetryMiddleware': 543,
                       }}

    def parse(self, response, **kwargs):
        yield {"response": response.body}

    def _handle_429(self, response):
        time.sleep(60)
        return response.follow()


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UsptoPatentHistorySpider)
    process.start()
middlewares.py:
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message
import time
class TooManyRequestsRetryMiddleware(RetryMiddleware):

    def __init__(self, crawler):
        super(TooManyRequestsRetryMiddleware, self).__init__(crawler.settings)
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        elif response.status == 429:
            self.crawler.engine.pause()
            time.sleep(5)
            self.crawler.engine.unpause()
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        elif response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response
the function "process_response" in the custom middleware is never called
scrapy.downloadermiddlewares.retry.RetryMiddleware is a downloader middleware (not a spider middleware; see the Scrapy Architecture overview for details).
You didn't disable the original scrapy.downloadermiddlewares.retry.RetryMiddleware, and you attached TooManyRequestsRetryMiddleware as a spider middleware. You need to modify the DOWNLOADER_MIDDLEWARES setting instead:
custom_settings = {'RETRY_HTTP_CODES': [429, 401],
                   'DOWNLOADER_MIDDLEWARES': {  # not SPIDER_MIDDLEWARES
                       'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
                       'chalicelib.scrapy_my.scrapy_my.middlewares.TooManyRequestsRetryMiddleware': 543,
                   }}
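For reference, the two middleware types expose different hooks, which is why only a downloader middleware ever sees the raw HTTP response. A minimal, illustrative sketch (the class names here are placeholders, not part of the original code):

class ExampleDownloaderMiddleware:
    def process_request(self, request, spider):
        return None  # continue handling this request normally

    def process_response(self, request, response, spider):
        # This is where retry logic can inspect response.status.
        return response


class ExampleSpiderMiddleware:
    def process_spider_input(self, response, spider):
        return None  # called for responses already on their way to the spider callback

    def process_spider_output(self, response, result, spider):
        # Post-processes items/requests yielded by the spider, never raw downloads.
        for item_or_request in result:
            yield item_or_request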
Related
Via pika I take a URL from RabbitMQ and try to create a new request for a Scrapy spider.
When I start my spider with scrapy crawl spider, the spider doesn't close (because of raise DontCloseSpider()), but it also doesn't create a request.
My custom extension:
import pika
from scrapy import signals
from scrapy.http import Request
from scrapy.exceptions import DontCloseSpider
class AddRequestExample:

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler)
        crawler.signals.connect(s.spider_idle, signal=signals.spider_idle)
        return s

    def spider_idle(self, spider):
        connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
        channel = connection.channel()
        try:
            url = channel.basic_get(queue='hello')[2]
            url = url.decode()
            crawler.engine.crawl(Request(url), self)
        except Exception:
            pass
        raise DontCloseSpider()
my spider:
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "spider"

    def parse(self, response):
        yield {
            'url': response.url
        }
It looks like you are trying to reproduce the approach from this answer.
In this case you need to define the request's callback function. Since you process the spider_idle signal from an extension (not from the spider), it should be the spider.parse method:
def spider_idle(self, spider):
    ....
    try:
        url = channel.basic_get(queue='hello')[2]
        url = url.decode()
        spider.crawler.engine.crawl(Request(url=url, callback=spider.parse), self)
    except Exception:
        ....
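Putting the fix together, a minimal sketch of the whole extension might look like this. It assumes the same local RabbitMQ 'hello' queue as your code; note that the signature of engine.crawl differs across Scrapy versions (newer releases accept only the request), so treat that line as a sketch rather than a drop-in:

import pika
from scrapy import signals
from scrapy.http import Request
from scrapy.exceptions import DontCloseSpider


class AddRequestExample:
    """Extension that pulls a URL from RabbitMQ whenever the spider goes idle."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
        return ext

    def spider_idle(self, spider):
        connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
        channel = connection.channel()
        try:
            body = channel.basic_get(queue='hello')[2]
            url = body.decode()
            # Schedule the request with an explicit callback, since this signal
            # handler lives in an extension rather than in the spider itself.
            spider.crawler.engine.crawl(Request(url=url, callback=spider.parse), spider)
        except Exception:
            pass
        # Keep the spider alive so it can pick up the next queued URL.
        raise DontCloseSpider()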
I've created a Python script using Scrapy to scrape some information available on a certain webpage. The problem is that the link I'm trying gets redirected very often. However, when I try a few times using requests, I get the desired content.
In the case of Scrapy, I'm unable to reuse the link because it keeps redirecting no matter how many times I try. I can even catch the main URL using response.meta.get("redirect_urls")[0], meant to be used recursively within the parse method. However, it always gets redirected, and as a result the callback never takes place.
This is my current attempt (the link used within the script is just a placeholder):
import scrapy
from scrapy.crawler import CrawlerProcess


class StackoverflowSpider(scrapy.Spider):
    handle_httpstatus_list = [301, 302]
    name = "stackoverflow"
    start_url = 'https://stackoverflow.com/questions/22937618/reference-what-does-this-regex-mean'

    def start_requests(self):
        yield scrapy.Request(self.start_url, meta={"lead_link": self.start_url}, callback=self.parse)

    def parse(self, response):
        if response.meta.get("lead_link"):
            self.lead_link = response.meta.get("lead_link")
        elif response.meta.get("redirect_urls"):
            self.lead_link = response.meta.get("redirect_urls")[0]

        try:
            if response.status != 200: raise
            if not response.css("[itemprop='text'] > h2"): raise
            answer_title = response.css("[itemprop='text'] > h2::text").get()
            print(answer_title)
        except Exception:
            print(self.lead_link)
            yield scrapy.Request(self.lead_link, meta={"lead_link": self.lead_link}, dont_filter=True, callback=self.parse)


if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
    })
    c.crawl(StackoverflowSpider)
    c.start()
Question: How can I force scrapy to make a callback using the url that got redirected?
As far as I understand, you want to keep scraping a link until it stops redirecting and you finally get HTTP status 200.
If so, then you first have to remove handle_httpstatus_list = [301, 302] from your code.
Then create a CustomMiddleware in middlewares.py:
import logging


class CustomMiddleware(object):

    def process_response(self, request, response, spider):
        if not response.css("[itemprop='text'] > h2"):
            logging.info('Desired text not found on %s so re-scraping' % request.url)
            req = request.copy()
            req.dont_filter = True
            return req

        if response.status in [301, 302]:
            original_url = request.meta.get('redirect_urls', [response.url])[0]
            logging.info('%s is redirecting to %s, so re-scraping it' % (request._url, request.url))
            request._url = original_url  # point the request back at the original URL
            request.dont_filter = True
            return request

        return response
Then your spider should look something like this:
class StackoverflowSpider(scrapy.Spider):
    name = "stackoverflow"
    start_url = 'https://stackoverflow.com/questions/22937618/reference-what-does-this-regex-mean'

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'YOUR_PROJECT_NAME.middlewares.CustomMiddleware': 100,
        }
    }

    def start_requests(self):
        yield scrapy.Request(self.start_url, meta={"lead_link": self.start_url}, callback=self.parse)

    def parse(self, response):
        answer_title = response.css("[itemprop='text'] > h2::text").get()
        print(answer_title)
If you tell me which site you are scraping, I can help you out; you can also email me at the address on my profile.
You may want to see this.
If you need to prevent redirects, you can do it via the request meta:
request = scrapy.Request(self.start_url, meta={"lead_link": self.start_url}, callback=self.parse)
request.meta['dont_redirect'] = True
yield request
According to the documentation, this is the way to stop redirects.
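If you would rather switch redirects off for the whole spider instead of per request, one option is Scrapy's REDIRECT_ENABLED setting. A minimal sketch, where the spider name and URL are placeholders:

import scrapy


class NoRedirectSpider(scrapy.Spider):
    # Placeholder name and URL, shown only to illustrate the setting.
    name = "no_redirect_example"
    start_urls = ['https://example.com/']

    # Disables RedirectMiddleware for every request made by this spider,
    # so 301/302 responses are passed through instead of being followed.
    custom_settings = {'REDIRECT_ENABLED': False}

    # Allow the 3xx responses to reach parse() instead of being dropped
    # by HttpErrorMiddleware.
    handle_httpstatus_list = [301, 302]

    def parse(self, response):
        yield {'status': response.status, 'url': response.url}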
I've made a spider that collects data from the cdiscount website. However, each time I scrape more than 320 pages of a category, I get a 503 error and the spider closes.
How can I deal with that problem? I've tried changing the user agent and using a proxy pool like this:
def __init__(self, *args, **kwargs):
    super(CdiscountSpider, self).__init__(*args, **kwargs)
    self.proxy_pool = ['49.236.220.238:52840', '181.112.41.50:33381', '50.235.111.161:45126']

(...)

request = scrapy.Request(url, callback=self.parse_dir_contents)  # access the category contents
request.meta["proxy"] = random.choice(self.proxy_pool)
yield request
but it didn't work. Please, any help greatly appreciated :)
You can have a downloader middleware that keeps retrying URLs that get a 503 response with a new proxy until they are successfully scraped.
Create a file named custom_middleware.py:
import random
import logging


class CustomMiddleware(object):
    proxy_pool = ['49.236.220.238:52840', '181.112.41.50:33381', '50.235.111.161:45126']

    def process_request(self, request, spider):
        request.meta['proxy'] = "http://" + random.choice(self.proxy_pool)

    def process_response(self, request, response, spider):
        if response.status in [503]:
            logging.error("%s found for %s so retrying" % (response.status, response.url))
            req = request.copy()
            req.dont_filter = True
            req.meta['proxy'] = "http://" + random.choice(self.proxy_pool)
            return req
        else:
            return response
and in your settings.py just enable that middleware:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 1,
    'YOUR_PROJECT_PATH.custom_middleware.CustomMiddleware': 200,
}
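One caveat: as written, the middleware will keep retrying a 503 URL forever. Below is a minimal sketch of a variant with a bounded retry count; the max_retries value and the 'custom_retry_times' meta key are illustrative choices, not part of the original answer:

import random
import logging


class BoundedRetryProxyMiddleware(object):
    # Illustrative cap; tune it for your own crawl.
    max_retries = 5
    proxy_pool = ['49.236.220.238:52840', '181.112.41.50:33381', '50.235.111.161:45126']

    def process_request(self, request, spider):
        request.meta['proxy'] = "http://" + random.choice(self.proxy_pool)

    def process_response(self, request, response, spider):
        if response.status == 503:
            retries = request.meta.get('custom_retry_times', 0) + 1
            if retries > self.max_retries:
                logging.error("Giving up on %s after %d retries" % (response.url, retries - 1))
                return response
            logging.error("503 for %s, retry %d with a new proxy" % (response.url, retries))
            req = request.copy()
            req.dont_filter = True
            req.meta['custom_retry_times'] = retries
            req.meta['proxy'] = "http://" + random.choice(self.proxy_pool)
            return req
        return response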
I'm using Scrapy, and I would like to check the status code before entering the parse method.
My code looks like:
class mywebsite(BaseSpider):
    # Crawling start
    CrawlSpider.started_on = datetime.now()

    # CrawlSpider
    name = 'mywebsite'
    DOWNLOAD_DELAY = 10
    allowed_domains = ['mywebsite.com']
    pathUrl = "URL/mywebsite.txt"

    # Init
    def __init__(self, local=None, *args, **kwargs):
        # Heritage
        super(mywebsite, self).__init__(*args, **kwargs)
        # On spider closed
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def start_requests(self):
        return [Request(url=start_url) for start_url in [l.strip() for l in open(self.pathUrl).readlines()]]

    def parse(self, response):
        print "==============="
        print response.headers
        print "==============="
        # Selector
        sel = Selector(response)
When my proxy is not blocked, I see the response headers, but when my IP is blocked, I just see this in the output console:
DEBUG: Ignoring response <999 https://www.mywebsite.com>: HTTP status code is not handled or not allowed.
How can I check the response headers before entering the parse method?
Edit:
Answer: this message appears when the spider is blocked/banned by an anti-crawling system. You must use an unblocked proxy system.
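As a side note, if you want responses with unusual status codes (such as the 999 above) to reach your callback at all so you can inspect their headers, one option is Scrapy's handle_httpstatus_list attribute. A minimal sketch, where the spider name, URL and decision logic are illustrative only:

import scrapy


class StatusAwareSpider(scrapy.Spider):
    # Placeholder name and URL for illustration only.
    name = "status_aware"
    start_urls = ['https://www.mywebsite.com/']

    # Let 999 responses through to parse() instead of having
    # HttpErrorMiddleware drop them before the callback runs.
    handle_httpstatus_list = [999]

    def parse(self, response):
        # Inspect status and headers before deciding what to do.
        self.logger.info("Got %s with headers %s", response.status, response.headers)
        if response.status == 200:
            yield {'url': response.url}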
I'm trying to crawl a large site. They have a rate limiting system in place. Is it possible to pause Scrapy for 10 minutes when it encounters a 403 page? I know I can set a DOWNLOAD_DELAY, but I noticed that I can scrape faster by setting a small DOWNLOAD_DELAY and then pausing Scrapy for a few minutes when it gets a 403. This way the rate limiting gets triggered only once every hour or so.
You can write your own retry middleware and put it in middlewares.py:
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message
from time import sleep
class SleepRetryMiddleware(RetryMiddleware):

    def __init__(self, settings):
        RetryMiddleware.__init__(self, settings)

    def process_response(self, request, response, spider):
        if response.status in [403]:
            sleep(120)  # few minutes
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return super(SleepRetryMiddleware, self).process_response(request, response, spider)
and don't forget to change settings.py:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    'your_project.middlewares.SleepRetryMiddleware': 100,
}
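Because SleepRetryMiddleware reuses RetryMiddleware._retry, the standard retry settings still control how many attempts are made. For example (values here are illustrative):

# settings.py (illustrative values)
RETRY_ENABLED = True  # required, since RetryMiddleware.from_crawler checks it
RETRY_TIMES = 5       # each 403 URL is retried at most 5 times before giving up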
Scrapy is a Twisted-based Python framework. So, never use time.sleep or pause.until inside it!
Instead, try using Deferred() from Twisted.
from twisted.internet.defer import Deferred

from scrapy import Spider, Request


class ScrapySpider(Spider):
    name = 'live_function'

    def start_requests(self):
        yield Request('some url', callback=self.non_stop_function)

    def non_stop_function(self, response):
        # `pause` here is the deferred-returning helper from the linked
        # "Scrapy: non-blocking pause" answer, not something defined in this snippet.
        parse_and_pause = Deferred()  # changed
        parse_and_pause.addCallback(self.second_parse_function)  # changed
        parse_and_pause.addCallback(pause, seconds=10)  # changed

        for url in ['url1', 'url2', 'url3', 'more urls']:
            yield Request(url, callback=parse_and_pause)  # changed

        yield Request('some url', callback=self.non_stop_function)  # call itself

    def second_parse_function(self, response):
        pass
More info here: Scrapy: non-blocking pause
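For a self-contained illustration of the "don't block the reactor" idea, here is a minimal sketch using Twisted's deferLater; the 10-second delay and the printed messages are placeholders, not tied to any of the spiders above:

from twisted.internet import reactor
from twisted.internet.task import deferLater


def after_delay():
    # Runs 10 seconds later without ever blocking the event loop,
    # unlike time.sleep, which would freeze every in-flight request.
    print("10 seconds passed, the reactor stayed responsive the whole time")
    reactor.stop()


def main():
    print("scheduling work")
    # deferLater returns a Deferred that fires after the delay.
    d = deferLater(reactor, 10, after_delay)
    d.addErrback(lambda failure: failure.printTraceback())
    reactor.run()


if __name__ == "__main__":
    main()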