How can I implement a custom proxy in Scrapy? - python

I'm trying to integrate a custom scraper API, but as a beginner I think I'm doing something wrong, even though I followed their documentation to set everything up. Here is a documentation link
from scrapy import Spider
from scrapy.http import Request
from .config import API
from scraper_api import ScraperAPIClient

client = ScraperAPIClient(API)

class GlassSpider(Spider):
    name = 'glass'
    allowed_domains = ['glassdoor.co.uk']
    start_urls = [client.scrapyGet(url='https://www.glassdoor.co.uk/Job/russian-jobs-SRCH_KE0,7.htm?fromAge=1')]

    def parse(self, response):
        jobs = response.xpath('//*[contains(@class, "react-job-listing")]')
        for job in jobs:
            job_url = job.xpath('.//*[contains(@class, "jobInfoItem jobTitle")]/@href').extract_first()
            absolute_job_url = response.urljoin(job_url)
            yield Request(client.scrapyGet(url=absolute_job_url),
                          callback=self.parse_jobpage,
                          meta={"Job URL": absolute_job_url})

    def parse_jobpage(self, response):
        absolute_job_url = response.meta.get('Job URL')
        job_description = "".join(response.xpath('//*[contains(@class, "desc")]//text()').extract())
        yield {
            "Job URL": absolute_job_url,
            "Job Description": job_description
        }
That's the output I'm receiving. What's wrong with my code? Please fix it for me so I can follow along and get the point. Thank you.
2020-10-01 23:01:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://api.scraperapi.com/?url=https%3A%2F%2Fwww.glassdoor.co.uk%2FJob%2Frussian-jobs-SRCH_KE0%2C7.htm%3FfromAge%3D1&api_key=bec9dd9f2be095dfc6158a7e609&scraper_sdk=python> (referer: None)
2020-10-01 23:01:45 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'api.scraperapi.com': <GET https://api.scraperapi.com/?url=https%3A%2F%2Fapi.scraperapi.com%2Fpartner%2FjobListing.htm%3Fpos%3D101%26ao%3D1044074%26s%3D149%26guid%3D00000174e51ccd8988e2e5420e67cf0d%26src%3DGD_JOB_AD%26t%3DSRFJ%26vt%3Dw%26cs%3D1_94f59ee8%26cb%3D1601571704401%26jobListingId%3D3696480795&api_key=bec9d9f82b0955c615c8a7e639&scraper_sdk=python>

I'm not familiar with this particular lib, but from your execution logs the issue is that your request is being filtered, since it's considered offsite:
[scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'api.scraperapi.com': <GET https://api.scraperapi.com/?url=https%3A%2F%2Fapi.scraperapi.com%2Fpartner%2FjobListing.htm%3Fpos%3D101%26ao%3D1044074%26s%3D149%26guid%3D00000174e51ccd8988e2e5420e67cf0d%26src%3DGD_JOB_AD%26t%3DSRFJ%26vt%3Dw%26cs%3D1_94f59ee8%26cb%3D1601571704401%26jobListingId%3D3696480795&api_key=bec9d9f82b0955c615c8a7e639&scraper_sdk=python>
Since ScraperAPI routes your request through their domain, which is outside what you defined in your allowed_domains, it's filtered as an offsite request. To avoid this issue you can remove this line entirely:
allowed_domains = ['glassdoor.co.uk']
or try including 'api.scraperapi.com' in it.
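For example, a minimal sketch of the second option, keeping the rest of the spider unchanged:
class GlassSpider(Spider):
    name = 'glass'
    # include the proxy's domain so requests routed through ScraperAPI
    # are not dropped by the offsite spider middleware
    allowed_domains = ['glassdoor.co.uk', 'api.scraperapi.com']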

Related

Scrapy bypass data usage consent wall

I am scraping yahoo finance news using the code below.
class YfinNewsSpider(scrapy.Spider):
    name = 'yfin_news_spider'
    custom_settings = {'DOWNLOAD_DELAY': '0.5', 'COOKIES_ENABLED': True, 'COOKIES_DEBUG': True}

    def __init__(self, month, year, **kwargs):
        self.start_urls = ['https://finance.yahoo.com/sitemap/2020_03_all']
        self.allowed_domains = ['finance.yahoo.com']
        super().__init__(**kwargs)

    def parse(self, response):
        all_news_urls = response.xpath('//ul/li[@class="List(n) Py(3px) Lh(1.2)"]')
        for news in all_news_urls:
            news_url = news.xpath('.//a[@class="Td(n) Td(u):h C($c-fuji-grey-k)"]/@href').extract_first()
            yield scrapy.Request(news_url, callback=self.parse_news, dont_filter=True)

    def parse_news(self, response):
        news_url = str(response.url)
        title = response.xpath('//title/text()').extract_first()
        paragraphs = response.xpath('//div[@class="caas-body"]/p/text()').extract()
        date_time = response.xpath('//div[@class="caas-attr-time-style"]/time/@datetime').extract_first()
        yield {'title': title, 'url': news_url, 'body_text': paragraphs, 'timestamp': date_time}
However, when I run my spider it gives me the results below.
2020-11-28 20:42:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://consent.yahoo.com/v2/collectConsent?sessionId=3_cc-session_05cc09ea-0bc0-439d-8b4c-2d6f20f52d6e> (referer: https://finance.yahoo.com/sitemap/2020_03_all)
2020-11-28 20:42:40 [scrapy.downloadermiddlewares.cookies] DEBUG: Sending cookies to: <GET https://finance.yahoo.com/news/onegold-becomes-first-company-offer-110000241.html>
Cookie: B=cnmvgrdfs5a0r&b=3&s=o1; GUCS=ASXMbR9p
2020-11-28 20:42:40 [scrapy.core.scraper] DEBUG: Scraped from <200 https://consent.yahoo.com/v2/collectConsent?sessionId=3_cc-session_05cc09ea-0bc0-439d-8b4c-2d6f20f52d6e>
{'title': 'Yahoo er nu en del af Verizon Media', 'url': 'https://consent.yahoo.com/v2/collectConsent?sessionId=3_cc-session_05cc09ea-0bc0-439d-8b4c-2d6f20f52d6e', 'body_text': [], 'timestamp': None}
2020-11-28 20:42:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://consent.yahoo.com/v2/collectConsent?sessionId=3_cc-session_d6731ce6-78bc-4222-914f-24cf98f874b8> (referer: https://finance.yahoo.com/sitemap/2020_03_all)
This seems to indicate that when my spider goes to https://finance.yahoo.com/news/onegold-becomes-first-company-offer-110000241.html (found in https://finance.yahoo.com/sitemap/2020_03_all), it tries sending cookies to that URL but is redirected to the consent wall at https://consent.yahoo.com/v2/collectConsent?sessionId=3_cc-session_05cc09ea-0bc0-439d-8b4c-2d6f20f52d6e.
I opened this consent wall https://consent.yahoo.com/v2/collectConsent?sessionId=3_cc-session_05cc09ea-0bc0-439d-8b4c-2d6f20f52d6e in a browser and found a data-consent screen. When I clicked accept, it brought me to the correct site that I want to scrape. The scraped results are also exactly the content of this consent screen.
I have tried setting COOKIES_ENABLED to True, but it did not work. So, is there any way to bypass this consent screen in Scrapy?
Thank you.
You can try one thing:
Open the consent page with the network tab open, then click the give-consent button. There you can identify the request that is sent when you give your consent. You can try replicating the same request using Scrapy. Maybe this way your issue will be solved.
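A rough sketch of that idea, assuming the consent wall is a plain HTML form (the 'agree' field name and value below are placeholders; copy the real names and values from the request you see in the network tab):
import scrapy

class YfinNewsSpider(scrapy.Spider):
    name = 'yfin_news_spider'

    def parse(self, response):
        # If we landed on the consent wall instead of the target page,
        # submit the consent form and re-enter parsing afterwards.
        if 'consent.yahoo.com' in response.url:
            yield scrapy.FormRequest.from_response(
                response,
                formdata={'agree': 'agree'},  # placeholder field/value, verify in devtools
                callback=self.parse,
            )
            return
        # ... normal parsing of the real page continues here ...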
Another option would be to use scrapy-selenium to click that button, and then Scrapy can take over from there.

scrapy 503 Service Unavailable on starturl

I modified this spider but it gives these errors:
Gave up retrying <GET https://lib.maplelegends.com/robots.txt> (failed 3 times): 503 Service Unavailable
2019-01-06 23:43:56 [scrapy.core.engine] DEBUG: Crawled (503) <GET https://lib.maplelegends.com/robots.txt> (referer: None)
2019-01-06 23:43:56 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://lib.maplelegends.com/?p=etc&id=4004003> (failed 1 times): 503 Service Unavailable
2019-01-06 23:43:56 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://lib.maplelegends.com/?p=etc&id=4004003> (failed 2 times): 503 Service Unavailable
2019-01-06 23:43:56 [scrapy.downloadermiddlewares.retry] DEBUG: Gave up retrying <GET https://lib.maplelegends.com/?p=etc&id=4004003> (failed 3 times): 503 Service Unavailable
2019-01-06 23:43:56 [scrapy.core.engine] DEBUG: Crawled (503) <GET https://lib.maplelegends.com/?p=etc&id=4004003> (referer: None)
2019-01-06 23:43:56 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <503 https://lib.maplelegends.com/?p=etc&id=4004003>: HTTP status code is not handled or not allowed
Crawler code:
#!/usr/bin/env python3
import scrapy
import time

start_url = 'https://lib.maplelegends.com/?p=etc&id=4004003'

class MySpider(scrapy.Spider):
    name = 'MySpider'
    start_urls = [start_url]

    def parse(self, response):
        # print('url:', response.url)
        products = response.xpath('.//div[@class="table-responsive"]/table/tbody')
        for product in products:
            item = {
                #'name': product.xpath('./tr/td/b[1]/a/text()').extract(),
                'link': product.xpath('./tr/td/b[1]/a/@href').extract_first(),
            }
            # url = response.urljoin(item['link'])
            # yield scrapy.Request(url=url, callback=self.parse_product, meta={'item': item})
            yield response.follow(item['link'], callback=self.parse_product, meta={'item': item})
            time.sleep(5)

        # re-queue the start url with low priority
        yield scrapy.Request(start_url, dont_filter=True, priority=-1)

    def parse_product(self, response):
        # print('url:', response.url)
        name = response.xpath('(//strong)[1]/text()').re(r'(\w+)')
        hp = response.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "image", " " ))] | //img').re(r':(\d+)')
        scrolls = response.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "image", " " ))] | //strong+//a//img/@title').re(r'\bScroll\b')
        for name, hp, scroll in zip(name, hp, scrolls):
            yield {'name': name.strip(), 'hp': hp.strip(), 'scroll': scroll.strip()}
--- it runs without a project and saves results in output.csv ---
from scrapy.crawler import CrawlerRunner

def _run_crawler(spider_cls, settings):
    """
    spider_cls: Scrapy Spider class
    returns: Twisted Deferred
    """
    runner = CrawlerRunner(settings)
    return runner.crawl(spider_cls)  # return Deferred

def test_scrapy_crawler():
    deferred = _run_crawler(MySpider, settings)

    @deferred.addCallback
    def _success(results):
        """
        After the crawler completes, this function will execute.
        Do your assertions in this function.
        """

    @deferred.addErrback
    def _error(failure):
        raise failure.value

    return deferred
Robots.txt
Your crawler is trying to check the robots.txt file, but the website doesn't have one.
To avoid this you can set the ROBOTSTXT_OBEY setting to False in your settings.py file.
By default it's False, but new Scrapy projects generated with the scrapy startproject command have ROBOTSTXT_OBEY = True set from the template.
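For example, in your project's settings.py:
# settings.py
# Do not fetch or obey robots.txt (it returns 503 on this site anyway).
ROBOTSTXT_OBEY = False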
503 responses
Further, the website seems to respond with 503 to every first request. The website is using some sort of bot protection:
The first request gets a 503, then some JavaScript is executed to make an AJAX request that generates a __shovlshield cookie.
It seems like https://shovl.io/ DDoS protection is being used.
To solve this you need to reverse engineer how the JavaScript generates the cookie, or employ JavaScript rendering techniques/services such as Selenium or Splash.
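For instance, a minimal sketch of the Splash route using the scrapy-splash package, assuming a Splash instance is running locally and the middleware settings from the package's README are configured:
import scrapy
from scrapy_splash import SplashRequest

class MySpider(scrapy.Spider):
    name = 'MySpider'

    def start_requests(self):
        # Render the page in Splash so the protection script can run
        # and set the __shovlshield cookie before the HTML is parsed.
        yield SplashRequest(
            'https://lib.maplelegends.com/?p=etc&id=4004003',
            self.parse,
            args={'wait': 5},
        )

    def parse(self, response):
        self.logger.info('Rendered page length: %d', len(response.text))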

Issue with scrapy spider

I am trying to get volume-weighted average prices for stocks from the moneycontrol.com website. The parse function runs without any issues, but the parse_links function is not getting called. Am I missing something here?
# -*- coding: utf-8 -*-
import scrapy

class MoneycontrolSpider(scrapy.Spider):
    name = "moneycontrol"
    allowed_domains = ["https://www.moneycontrol.com"]
    start_urls = ["https://www.moneycontrol.com/india/stockpricequote"]

    def parse(self, response):
        for link in response.css('td.last > a::attr(href)').extract():
            if link:
                yield scrapy.Request(link, callback=self.parse_links, method='GET')

    def parse_links(self, response):
        VWAP = response.xpath('//*[@id="n_vwap_val"]/text()').extract_first()
        print(VWAP)
        with open('quotes.txt', 'a+') as f:
            f.write('VWAP: {}'.format(VWAP) + '\n')
If you read the log output, the error becomes obvious.
2018-09-08 19:52:38 [py.warnings] WARNING: c:\program files\python37\lib\site-packages\scrapy\spidermiddlewares\offsite.py:59: URLWarning: allowed_domains accepts only domains, not URLs. Ignoring URL entry https://www.moneycontrol.com in allowed_domains.
warnings.warn("allowed_domains accepts only domains, not URLs. Ignoring URL entry %s in allowed_domains." % domain, URLWarning)
2018-09-08 19:52:38 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2018-09-08 19:52:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.moneycontrol.com/india/stockpricequote> (referer: None)
2018-09-08 19:52:40 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.moneycontrol.com': <GET http://www.moneycontrol.com/india/stockpricequote/chemicals/aartiindustries/AI45>
So just fix your allowed_domains, and you should be fine:
allowed_domains = ["moneycontrol.com"]
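In context, the top of the spider would look like this (everything else unchanged):
class MoneycontrolSpider(scrapy.Spider):
    name = "moneycontrol"
    # domains only: no scheme, and the "www." prefix is not needed
    allowed_domains = ["moneycontrol.com"]
    start_urls = ["https://www.moneycontrol.com/india/stockpricequote"]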

How many items have been scraped per start_url

I use Scrapy to crawl 1000 URLs and store the scraped items in MongoDB. I'd like to know how many items have been found for each URL. From the Scrapy stats I can see 'item_scraped_count': 3500.
However, I need this count for each start_url separately. There is also a referer field for each item that I might use to count each URL's items manually:
2016-05-24 15:15:10 [scrapy] DEBUG: Crawled (200) <GET https://www.youtube.com/watch?v=6w-_ucPV674> (referer: https://www.youtube.com/results?q=billys&sp=EgQIAhAB)
But I wonder if there is built-in support for this in Scrapy.
challenge accepted!
There isn't anything in Scrapy that directly supports this, but you could separate it from your spider code with a spider middleware:
middlewares.py
from scrapy.http.request import Request

class StartRequestsCountMiddleware(object):
    start_urls = {}

    def process_start_requests(self, start_requests, spider):
        for i, request in enumerate(start_requests):
            self.start_urls[i] = request.url
            request.meta.update(start_request_index=i)
            yield request

    def process_spider_output(self, response, result, spider):
        for output in result:
            if isinstance(output, Request):
                output.meta.update(
                    start_request_index=response.meta['start_request_index'],
                )
            else:
                spider.crawler.stats.inc_value(
                    'start_requests/item_scraped_count/{}'.format(
                        self.start_urls[response.meta['start_request_index']],
                    ),
                )
            yield output
Remember to activate it in settings.py:
SPIDER_MIDDLEWARES = {
    ...
    'myproject.middlewares.StartRequestsCountMiddleware': 200,
}
Now you should be able to see something like this in your spider stats:
'start_requests/item_scraped_count/START_URL1': ITEMCOUNT1,
'start_requests/item_scraped_count/START_URL2': ITEMCOUNT2,
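If you also want the counts inside the spider rather than only in the final stats dump, you can read them back from the stats collector, for example in a closed() hook (a sketch, assuming the middleware above is enabled):
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['http://example.com/a', 'http://example.com/b']

    def parse(self, response):
        yield {'url': response.url}

    def closed(self, reason):
        # Log the per-start_url counters collected by the middleware.
        prefix = 'start_requests/item_scraped_count/'
        for key, value in self.crawler.stats.get_stats().items():
            if key.startswith(prefix):
                self.logger.info('%s -> %s', key[len(prefix):], value)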

python method is not called

I have the following class methods in a Scrapy spider. parse_category yields a Request object that has a callback to parse_product. Sometimes a category page redirects to a product page, so here I detect whether the category page is actually a product page. If it is, I just call the parse_product method. But for some reason it does not call the method.
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    anchors = hxs.select('//div[@id="panelMfr"]/div/ul/li[position() != last()]/a')
    for anchor in anchors[2:3]:
        url = anchor.select('@href').extract().pop()
        cat = anchor.select('text()').extract().pop().strip()
        yield Request(urljoin(get_base_url(response), url), callback=self.parse_category, meta={"category": cat})

def parse_category(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # check if it's a redirected product page
    if hxs.select(self.product_name_xpath):
        self.log("Category-To-Product Redirection")
        self.parse_product(response)  # <<---- This line is not called.
        self.log("Product Parsed")
        return

    products_xpath = '//div[@class="productName"]/a/@href'
    products = hxs.select(products_xpath).extract()
    for url in products:
        yield Request(urljoin(base_url, url), callback=self.parse_product, meta={"category": response.meta['category']})

    next_page = hxs.select('//table[@class="nav-back"]/tr/td/span/a[contains(text(), "Next")]/text()').extract()
    if next_page:
        url = next_page[0]
        yield Request(urljoin(base_url, url), callback=self.parse_category, meta={"category": response.meta['category']})

def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    self.log("Inside parse_product")
In the log I see that Category-To-Product Redirection and Product Parsed are printed, but Inside parse_product is missing. What did I do wrong here?
2013-07-12 21:31:34+0100 [example.com] DEBUG: Crawled (200) <GET http://www.example.com/category.aspx> (referer: None)
2013-07-12 21:31:34+0100 [example.com] DEBUG: Redirecting (302) to <GET http://www.example.com/productinfo.aspx?catref=AM6901> from <GET http://www.example.com/products/Inks-Toners/Apple>
2013-07-12 21:31:35+0100 [example.com] DEBUG: Crawled (200) <GET http://www.example.com/productinfo.aspx?catref=AM6901> (referer: http://www.example.com/category.aspx)
2013-07-12 21:31:35+0100 [example.com] DEBUG: Category-To-Product Redirection
2013-07-12 21:31:35+0100 [example.com] DEBUG: Product Parsed
2013-07-12 21:31:35+0100 [example.com] INFO: Closing spider (finished)
2013-07-12 21:31:35+0100 [-] ERROR: ERROR:root:SPIDER CLOSED: No. of products: 0
