How to Retry 503 response using Scrapy DownloadMiddleware? - python

In my retrymiddleware.py I have this:
import logging

from scrapy.downloadermiddlewares.retry import RetryMiddleware

logger = logging.getLogger(__name__)

class Retry(RetryMiddleware):

    def process_response(self, request, response, spider):
        if response.status == 503:
            logger.error("503 status returned: " + response.url)
            return self._retry(request, response, spider) or response
        logger.debug("response.status = " + str(response.status) + " from URL " + str(response.url))
        logger.debug(response.headers)
        return super(Retry, self).process_response(request, response, spider)

    def _retry(self, request, response, spider):
        logger.debug("Deleting session " + str(request.meta['sessionId']))
        self.delete_session(request.meta['sessionId'])
        logger.debug("Retrying URL: %(request)s", {'request': request})
        logger.debug("Request headers were:")
        logger.debug(request.headers)
        retryreq = request.copy()
        retryreq.headers['Authorization'] = crawlera_auth.strip()
        retryreq.headers['X-Crawlera-Session'] = 'create'
        retryreq.dont_filter = True
        return retryreq
And in my settings.py I have this
DOWNLOADER_MIDDLEWARES = {
    'craigslist_tickets.retrymiddleware.Retry': 100,
    'craigslist_tickets.crawlera_proxy_middleware.CrawleraProxyMiddleware': 200
}
I can see output like response.status = 200 for all URLs that are scraped successfully, but the URLs that return 503 are not even passing through process_response.
I can only see this in the terminal:
[scrapy] DEBUG: Retrying <GET http:website.com> (failed 1 times): 503 Service Unavailable
SHORT QUESTION:
I want to scrape URLs that return 503 again, by passing them through the process_response method of my custom Retry class.

I had RETRY_HTTP_CODES = [503] in settings.py, so that's why Scrapy was handling the 503 code by itself.
Now I changed it to RETRY_HTTP_CODES = [], and every URL that returns 503 is passed through the process_response method of the retrymiddleware.Retry class ...
Mission accomplished.
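For reference, a minimal sketch of the relevant part of settings.py after that change (the middleware paths are the same ones shown above):

# settings.py

# Leave the list empty so the built-in RetryMiddleware does not
# consume 503 responses before the custom middleware sees them.
RETRY_HTTP_CODES = []

DOWNLOADER_MIDDLEWARES = {
    'craigslist_tickets.retrymiddleware.Retry': 100,
    'craigslist_tickets.crawlera_proxy_middleware.CrawleraProxyMiddleware': 200,
}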

According to the documentation, the built-in RetryMiddleware handles the 500 codes by default, and because of its priority the response cannot reach your code (please check the base downloader middlewares). I would suggest changing the priority of your Retry middleware to 650, like:
DOWNLOADER_MIDDLEWARES = {
    'craigslist_tickets.retrymiddleware.Retry': 650,
    'craigslist_tickets.crawlera_proxy_middleware.CrawleraProxyMiddleware': 200
}

Related

Scrapy not scraping all pages

I'm new to scrapy and have been trying to develop a spider that scrapes Tripadvisor's things to do page. Tripadvisor paginates results with an offset, so I find the last page number, multiply by the number of results per page, and loop over that range with a step of 30. However, it returns only a fraction of the results it's supposed to, and get_details prints out only 7 of the 28 pages scraped. I believe what is happening is URL redirection on random pages.
Scrapy logs this 301 redirection on the other pages, and it appears to be redirecting to the first page. I tried disabling redirection but that did not work.
2021-03-28 18:46:38 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.tripadvisor.com/Attractions-g55229-Activities-a_allAttractions.true-Nashville_Davidson_County_Tennessee.html> from <GET https://www.tripadvisor.com/Attractions-g55229-Activities-a_allAttractions.true-oa90-Nashville_Davidson_County_Tennessee.html>
Here's my code for the spider:
import scrapy
import re

class TripadvisorSpider(scrapy.Spider):
    name = "tripadvisor"
    start_urls = [
        'https://www.tripadvisor.com/Attractions-g55229-Activities-a_allAttractions.true-oa{}-Nashville_Davidson_County_Tennessee.html'
    ]

    def parse(self, response):
        num_pages = int(response.css(
            '._37Nr884k .DrjyGw-P.IT-ONkaj::text')[-1].get())
        for offset in range(0, num_pages * 30, 30):
            formatted_url = self.start_urls[0].format(offset)
            yield scrapy.Request(formatted_url, callback=self.get_details)

    def get_details(self, response):
        print('url is ' + response.url)
        for listing in response.css('div._19L437XW._1qhi5DVB.CO7bjfl5'):
            yield {
                'title': listing.css('._392swiRT ._1gpq3zsA._1zP41Z7X::text')[1].get(),
                'category': listing.css('._392swiRT ._1fV2VpKV .DrjyGw-P._26S7gyB4._3SccQt-T::text').get(),
                'rating': float(re.findall(r"[-+]?\d*\.\d+|\d+", listing.css('svg.zWXXYhVR::attr(title)').get())[0]),
                'rating_count': float(listing.css('._392swiRT .DrjyGw-P._26S7gyB4._14_buatE._1dimhEoy::text').get().replace(',', '')),
                'url': listing.css('._3W_31Rvp._1nUIPWja._17LAEUXp._2b3s5IMB a::attr(href)').get(),
                'main_image': listing.css('._1BR0J4XD').attrib['src']
            }
Is there a way to get scrapy working for each page? What is causing this problem exactly?
Found a solution. I discovered I needed to handle the redirection manually and disable Scrapy's default redirect middleware (see the settings sketch after the code).
Here is the custom middleware I added to middlewares.py
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.selector import Selector
from scrapy.utils.response import get_meta_refresh

class CustomRetryMiddleware(RetryMiddleware):

    def process_response(self, request, response, spider):
        url = response.url
        if response.status in [301, 307]:
            reason = 'redirect %d' % response.status
            return self._retry(request, reason, spider) or response
        interval, redirect_url = get_meta_refresh(response)
        # handle meta redirect
        if redirect_url:
            reason = 'meta'
            return self._retry(request, reason, spider) or response
        hxs = Selector(response)
        # test for captcha page
        captcha = hxs.xpath(
            ".//input[contains(@id, 'captchacharacters')]").extract()
        if captcha:
            reason = 'captcha'
            return self._retry(request, reason, spider) or response
        return response
It is an updated version from this question's top answer.
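For completeness, the middleware still has to be enabled in settings.py and Scrapy's built-in RedirectMiddleware disabled, so that 301/307 responses reach process_response instead of being followed. A minimal sketch, where the project path is an assumption and should be adjusted to your project:

DOWNLOADER_MIDDLEWARES = {
    # disable built-in redirect handling so 301/307 responses are
    # passed to the custom middleware instead of being followed
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    'YOUR_PROJECT_NAME.middlewares.CustomRetryMiddleware': 543,  # hypothetical path
}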
Scrapy retry or redirect middleware

Python scrapy middleware not working as expected

I've created a custom middleware class to handle refused HTTP requests.
The middleware is attached to a specific spider, and the logs show it is attached successfully and that the original one is detached.
Debugging this, I can see that a breakpoint in the process_response function of the original retry middleware is hit, while in my custom one it is never hit.
Any ideas?
EDIT:
to reproduce:
spider:
import time
import scrapy
from scrapy.crawler import CrawlerProcess

class UsptoPatentHistorySpider(scrapy.Spider):
    name = 'quotes'
    handle_httpstatus_list = [401, 429]
    start_urls = [
        'https://patentcenter.uspto.gov/retrieval/public/v1/applications/sdwp/external/metadata/7850271',
    ]
    custom_settings = {'RETRY_HTTP_CODES': [429, 401],
                       'SPIDER_MIDDLEWARES': {
                           'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
                           'chalicelib.scrapy_my.scrapy_my.middlewares.TooManyRequestsRetryMiddleware': 543,
                       }}

    def parse(self, response, **kwargs):
        yield {"response": response.body}

    def _handle_429(self, response):
        time.sleep(60)
        return response.follow()

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UsptoPatentHistorySpider)
    process.start()
middlewares.py:
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message
import time

class TooManyRequestsRetryMiddleware(RetryMiddleware):

    def __init__(self, crawler):
        super(TooManyRequestsRetryMiddleware, self).__init__(crawler.settings)
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        elif response.status == 429:
            self.crawler.engine.pause()
            time.sleep(5)
            self.crawler.engine.unpause()
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        elif response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response
The process_response function in the custom middleware is never called.
scrapy.downloadermiddlewares.retry.RetryMiddleware is a downloader middleware (not a spider middleware; see the Scrapy Architecture overview for details).
You didn't disable the original scrapy.downloadermiddlewares.retry.RetryMiddleware, and you attached TooManyRequestsRetryMiddleware as a spider middleware. You need to modify the DOWNLOADER_MIDDLEWARES setting instead:
custom_settings = {'RETRY_HTTP_CODES': [429, 401],
                   'DOWNLOADER_MIDDLEWARES': {  # not SPIDER_MIDDLEWARES
                       'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
                       'chalicelib.scrapy_my.scrapy_my.middlewares.TooManyRequestsRetryMiddleware': 543,
                   }}

Unable to force scrapy to make a callback using the url that got redirected

I've created a python script using scrapy to scrape some information available on a certain webpage. The problem is that the link I'm trying gets redirected very often. However, when I try a few times using requests, I get the desired content.
In the case of scrapy, I'm unable to reuse the link because it gets redirected no matter how many times I try. I can even catch the main url using response.meta.get("redirect_urls")[0], which is meant to be used recursively within the parse method. However, it always gets redirected and as a result the callback never takes place.
This is my current attempt (the link used within the script is just a placeholder):
import scrapy
from scrapy.crawler import CrawlerProcess

class StackoverflowSpider(scrapy.Spider):
    handle_httpstatus_list = [301, 302]
    name = "stackoverflow"
    start_url = 'https://stackoverflow.com/questions/22937618/reference-what-does-this-regex-mean'

    def start_requests(self):
        yield scrapy.Request(self.start_url, meta={"lead_link": self.start_url}, callback=self.parse)

    def parse(self, response):
        if response.meta.get("lead_link"):
            self.lead_link = response.meta.get("lead_link")
        elif response.meta.get("redirect_urls"):
            self.lead_link = response.meta.get("redirect_urls")[0]
        try:
            if response.status != 200: raise
            if not response.css("[itemprop='text'] > h2"): raise
            answer_title = response.css("[itemprop='text'] > h2::text").get()
            print(answer_title)
        except Exception:
            print(self.lead_link)
            yield scrapy.Request(self.lead_link, meta={"lead_link": self.lead_link}, dont_filter=True, callback=self.parse)

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
    })
    c.crawl(StackoverflowSpider)
    c.start()
Question: How can I force scrapy to make a callback using the url that got redirected?
As far as I understand, you want to keep scraping a link until it stops redirecting and you finally get HTTP status 200.
If so, then you first have to remove handle_httpstatus_list = [301, 302] from your code.
Then create a CustomMiddleware in middlewares.py:
import logging

class CustomMiddleware(object):

    def process_response(self, request, response, spider):
        if not response.css("[itemprop='text'] > h2"):
            logging.info('Desired text not found so re-scraping %s' % request.url)
            req = request.copy()
            req.dont_filter = True
            return req
        if response.status in [301, 302]:
            original_url = request.meta.get('redirect_urls', [response.url])[0]
            logging.info('%s is redirecting to %s, so re-scraping it' % (request._url, request.url))
            request._url = original_url
            request.dont_filter = True
            return request
        return response
Then your spider should look something like this:
class StackoverflowSpider(scrapy.Spider):
    name = "stackoverflow"
    start_url = 'https://stackoverflow.com/questions/22937618/reference-what-does-this-regex-mean'
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'YOUR_PROJECT_NAME.middlewares.CustomMiddleware': 100,
        }
    }

    def start_requests(self):
        yield scrapy.Request(self.start_url, meta={"lead_link": self.start_url}, callback=self.parse)

    def parse(self, response):
        answer_title = response.css("[itemprop='text'] > h2::text").get()
        print(answer_title)
If you tell me which site you are scraping, I can help you out further; you can also email me (the address is on my profile).
You may want to see this.
If you need to prevent redirecting, it is possible via the request meta:
request = scrapy.Request(self.start_url,meta={"lead_link":self.start_url},callback=self.parse)
request.meta['dont_redirect'] = True
yield request
According to the documentation, this is the way to stop redirecting.

Error 503 while scraping cdiscount (scrapy) Python

I've made a spider that collects data on the cdiscount website. However, each time I scrape more than 320 pages of a category, I get a 503 error and the spider closes.
How can I deal with that problem? I've tried changing the user agent and using a proxy pool like this:
def __init__(self, *args, **kwargs):
    super(CdiscountSpider, self).__init__(*args, **kwargs)
    self.proxy_pool = ['49.236.220.238:52840', '181.112.41.50:33381', '50.235.111.161:45126']
(...)
    request = scrapy.Request(url, callback=self.parse_dir_contents)  # access the content of the categories
    request.meta["proxy"] = random.choice(self.proxy_pool)
    yield request
but it didn't work. Please, any help greatly appreciated :)
You can have a downloader middleware that keeps retrying the URLs that get a 503 response with a new proxy until they are successfully scraped.
Create a file named custom_middleware.py:
import random
import logging

class CustomMiddleware(object):
    proxy_pool = ['49.236.220.238:52840', '181.112.41.50:33381', '50.235.111.161:45126']

    def process_request(self, request, spider):
        request.meta['proxy'] = "http://" + random.choice(self.proxy_pool)

    def process_response(self, request, response, spider):
        if response.status in [503]:
            logging.error("%s found for %s so retrying" % (response.status, response.url))
            req = request.copy()
            req.dont_filter = True
            req.meta['proxy'] = "http://" + random.choice(self.proxy_pool)
            return req
        else:
            return response
And in your settings.py, just enable that middleware:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 1,
    'YOUR_PROJECT_PATH.custom_middleware.CustomMiddleware': 200,
}

Scrapy - Correct way to change User Agent in Request

I have created a custom middleware in Scrapy by overriding the RetryMiddleware, which changes both proxy and User-Agent before retrying. It looks like this:
class CustomRetryMiddleware(RetryMiddleware):

    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1
        if retries <= self.max_retry_times:
            Proxy_UA_Middleware.switch_proxy()
            Proxy_UA_Middleware.switch_ua()
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust
            return retryreq
        else:
            logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
The Proxy_UA_Middleware class is quite long. Basically it contains methods that change the proxy and user agent. I have both of these middlewares configured properly in my settings.py file. The proxy part works okay, but the User-Agent doesn't change. The code I've used to change the User-Agent looks like this:
request.headers.setdefault('User-Agent', self.user_agent)
where self.user_agent is a random value taken from an array of user agents. This doesn't work. However, if I do this
request.headers['User-Agent'] = self.user_agent
then it works just fine and the user agent changes successfully for each retry. But I haven't seen anyone use this method to change the User-Agent. My question is whether changing the User-Agent this way is okay, and if not, what am I doing wrong?
If you always want to control which user agent to use in that middleware, then it is okay. What setdefault does is check whether no User-Agent has been assigned before, which is possible because other middlewares could be doing it, or it could even be assigned from the spider.
Also, I think you should disable the default UserAgentMiddleware, or give your middleware a higher priority: UserAgentMiddleware's priority is 400, so set yours to come before it (some number below 400).
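For illustration, a minimal sketch of what that could look like in settings.py (the module path and the priority number are assumptions, not taken from the question):

DOWNLOADER_MIDDLEWARES = {
    # disable the default user agent middleware so it cannot touch the header ...
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # ... or keep it and give your own middleware a number below 400, so its
    # process_request runs first and setdefault then finds a User-Agent already set
    'YOUR_PROJECT_NAME.middlewares.Proxy_UA_Middleware': 350,  # hypothetical path
}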
First, you are overriding a function with _ (an underscore) in front, which is meant to be a "private" function in Python. The function might change in a different version of Scrapy and your override will hinder an upgrade/downgrade. It's risky to override it. It's better to change the user agent in another function that wraps _retry.
I've made a version of this using the scrapy-fake-useragent module. I found two functions calling _retry, so retries happen both on exceptions and on retry statuses, and we need to change the user agent on the request in both functions before it is retried. This is the code:
from scrapy.downloadermiddlewares.retry import *
from scrapy.spidermiddlewares.httperror import *
from fake_useragent import UserAgent

class Retry500Middleware(RetryMiddleware):

    def __init__(self, settings):
        super(Retry500Middleware, self).__init__(settings)
        fallback = settings.get('FAKEUSERAGENT_FALLBACK', None)
        self.ua = UserAgent(fallback=fallback)
        self.ua_type = settings.get('RANDOM_UA_TYPE', 'random')

    def get_ua(self):
        '''Gets random UA based on the type setting (random, firefox…)'''
        return getattr(self.ua, self.ua_type)

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            request.headers['User-Agent'] = self.get_ua()
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
                and not request.meta.get('dont_retry', False):
            request.headers['User-Agent'] = self.get_ua()
            return self._retry(request, exception, spider)
Don't forget to enable the middleware via settings.py and disable the standard retry and user agent middleware.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    'my_project.middlewares.Retry500Middleware': 401,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
}
FAKEUSERAGENT_FALLBACK = "<your favorite user agent>"
