My complete code:
import re
from bs4 import BeautifulSoup
import json
from typing import Any, Optional, cast
from inline_requests import inline_requests
from scrapy import Spider, Request
import asyncio


class QuotesSpider(Spider):
    name = "scraper"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def start_requests(self):
        codes = ["A", "B"]
        url = "https://somesite.com/"
        for i, code in enumerate(codes):
            yield Request(url=url, callback=self.handle, meta={'cookiejar': i, "code": code})

    @inline_requests
    async def handle(self, response):
        code = response.meta["code"]
        cookiejar_ref = response.meta["cookiejar"]
        # Parse csrfToken from the html
        soup = BeautifulSoup(response.text, "html.parser")
        relevant_script = [script.text for script in soup.find_all("script") if "csrfToken" in script.text]
        matched_group = re.search(r'"csrfToken":"(.+?)"', relevant_script[0]) if len(relevant_script) > 0 else None
        if matched_group is None:
            raise Exception("Failed to extract csrfToken")
        csrf_token = matched_group.group(1)
        await asyncio.sleep(1)  # <-- Need async because of this (and for more async-related tasks afterwards, like calling a websocket, etc.)
        # Initiate search
        api = "https://somesite.com/search"
        headers = {"x-csrf-token": csrf_token, 'Content-Type': 'application/json'}
        payload = {"a": 1}
        response = yield Request(api, method='POST', headers=headers, meta={'cookiejar': cookiejar_ref}, body=json.dumps(payload))
        lots_url = json.loads(response.text)["redirect"]
        yield {
            "lots_url": lots_url,
        }
The issue is here (adding the async keyword causes the function not to wait anymore):
async def handle(self, response: Response):
Don't want to do it the callback way, as the code was becoming unreadable (there are lots of other functions that I have omitted here for brevity). The only way I found to call Scrapy requests sequentially was to use scrapy-inline-requests, but it stops working as soon as I add the async keyword to the function definition. Remove that and it works as expected (i.e. it waits for the request to finish before proceeding further). Is there any way to make it wait with the async keyword?
One alternative I know is to ditch Scrapy entirely and use aiohttp, but doing that would mean losing all the awesome features that Scrapy provides (like rate limiting, logging, stats via scrapyd, etc.).
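For reference, one direction I have not fully tested yet: Scrapy 2.6+ documents awaiting the engine's download Deferred directly from an async callback via scrapy.utils.defer.maybe_deferred_to_future, which would let me drop the @inline_requests decorator entirely. A rough sketch of what handle could become (reusing the imports above; csrf extraction simplified to a plain regex; assumes a recent Scrapy with the asyncio reactor, which my settings already select):
from scrapy.utils.defer import maybe_deferred_to_future

class QuotesSpider(Spider):
    # name, custom_settings and start_requests unchanged from the code above

    async def handle(self, response):
        cookiejar_ref = response.meta["cookiejar"]
        matched = re.search(r'"csrfToken":"(.+?)"', response.text)
        if matched is None:
            raise Exception("Failed to extract csrfToken")
        csrf_token = matched.group(1)
        await asyncio.sleep(1)
        api_request = Request(
            "https://somesite.com/search",
            method="POST",
            headers={"x-csrf-token": csrf_token, "Content-Type": "application/json"},
            meta={"cookiejar": cookiejar_ref},
            body=json.dumps({"a": 1}),
        )
        # Hand the request to the engine and await its Deferred instead of yielding it.
        # (Older Scrapy versions also expect the spider as a second argument to download().)
        api_response = await maybe_deferred_to_future(self.crawler.engine.download(api_request))
        yield {"lots_url": json.loads(api_response.text)["redirect"]}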
Thanks!
Related
I've been trying to make a Discord 'bot' of sorts with an integrated Scrapy spider that scrapes data from a website (that has no APIs) and outputs the parsed data depending on the command sent on Discord to the bot.
I've managed to nail down the scraping part, as I can get the data I need from a list and output it to a file with Scrapy's commands:
import scrapy

class locg(scrapy.Spider):
    name = 'spiderbot'
    start_urls = ['https://leagueofcomicgeeks.com/comics/new-comics']

    def start_requests(self):
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers)

    def parse(self, response):
        for items in response.xpath('.//ul[@class="comic-list-thumbs item-list-thumbs item-list"]'):
            publishers = items.xpath('.//div[@class="publisher color-offset"]/text()').extract()
            issues = items.xpath(".//div[@class='title color-primary']//a/text()").extract()
            i = 0
            for publisher in publishers:
                if "Marvel Comics" in publisher:
                    print(issues[i])
                i = i + 1
With this code I can match in the list every issue name that is with publisher Marvel Comics.
I changed the code to integrate it with the Discord Python API. The problem is that I can't call the parse function of the Scrapy class from outside of it, so I can't make sure it runs whenever a function gets called by a Discord command.
So this is what I currently have:
import discord
import os
import scrapy

marvel = []
dc = []

class locg(scrapy.Spider):
    name = 'spiderbot'
    start_urls = ['https://leagueofcomicgeeks.com/comics/new-comics']

    def start_requests(self):
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers)

    def parse(self, response):
        for items in response.xpath('.//ul[@class="comic-list-thumbs item-list-thumbs item-list"]'):
            publishers = items.xpath('.//div[@class="publisher color-offset"]/text()').extract()
            issues = items.xpath(".//div[@class='title color-primary']//a/text()").extract()
            i = 0
            for publisher in publishers:
                if "Marvel Comics" in publisher:
                    marvel.append(issues[i])
                if "DC Comics" in publisher:
                    dc.append(issues[i])
                i = i + 1

client = discord.Client()

@client.event
async def on_ready():
    print('We have logged in as {0.user}'.format(client))

@client.event
async def on_message(message):
    if message.author == client.user:
        return
    test = locg()
    test.parse()
    if message.content.startswith('-marvel'):
        await message.channel.send(marvel)
    if message.content.startswith('-dc'):
        await message.channel.send(dc)

client.run('token')
When I run this and call the command on my Discord server, the bot just gives me [], i.e. the empty list from the global variable I defined at the start of the code. Which makes me think the Scrapy class isn't running when I call the command, so the variable stays empty.
I tried to instantiate the class with
test = locg()
test.parse()
But I get No value for argument 'response' in method call, and I'm not really sure how to provide the response value here.
Does anyone have any pointers into this?
I realize that maybe this isn't really what Scrapy was designed to do, so please do tell me if that's the case and I'll look for other ways to do this.
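One workaround I'm considering (untested; the file and field names below are placeholders) is to keep Scrapy in its own process: have the bot shell out to the scrapy CLI, export the items to a JSON feed, and read that file back when a command arrives. It would require the spider to yield items like {'publisher': ..., 'issue': ...} instead of appending to globals. A rough sketch:
import asyncio
import json
import discord

client = discord.Client()

async def run_spider():
    # Run the spider in a separate process so Scrapy's reactor never has to share
    # the event loop with discord.py. 'locg_spider.py' and 'items.json' are placeholders.
    # -O overwrites the feed file on recent Scrapy; on older versions use -o and delete the file first.
    proc = await asyncio.create_subprocess_exec(
        'scrapy', 'runspider', 'locg_spider.py', '-O', 'items.json'
    )
    await proc.wait()
    with open('items.json') as f:
        return json.load(f)

@client.event
async def on_message(message):
    if message.author == client.user:
        return
    if message.content.startswith('-marvel'):
        items = await run_spider()
        marvel = [item['issue'] for item in items if item.get('publisher') == 'Marvel Comics']
        await message.channel.send(marvel)

client.run('token')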
I appreciate any and all help, please let me know if I should give more info or if I should've done anything different with my first question. This is my first day on the website.
Thank you.
I've created a python script using scrapy to scrape some information available on a certain webpage. The problem is that the link I'm trying gets redirected very often. However, when I try a few times using requests, I get the desired content.
In the case of scrapy, I'm unable to reuse the link because I found it redirecting no matter how many times I try. I can even catch the main url using response.meta.get("redirect_urls")[0], meant to be reused recursively within the parse method. However, it always gets redirected and as a result the callback never takes place.
This is my current attempt (the link used within the script is just a placeholder):
import scrapy
from scrapy.crawler import CrawlerProcess

class StackoverflowSpider(scrapy.Spider):
    handle_httpstatus_list = [301, 302]
    name = "stackoverflow"
    start_url = 'https://stackoverflow.com/questions/22937618/reference-what-does-this-regex-mean'

    def start_requests(self):
        yield scrapy.Request(self.start_url, meta={"lead_link": self.start_url}, callback=self.parse)

    def parse(self, response):
        if response.meta.get("lead_link"):
            self.lead_link = response.meta.get("lead_link")
        elif response.meta.get("redirect_urls"):
            self.lead_link = response.meta.get("redirect_urls")[0]
        try:
            if response.status != 200: raise
            if not response.css("[itemprop='text'] > h2"): raise
            answer_title = response.css("[itemprop='text'] > h2::text").get()
            print(answer_title)
        except Exception:
            print(self.lead_link)
            yield scrapy.Request(self.lead_link, meta={"lead_link": self.lead_link}, dont_filter=True, callback=self.parse)

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
    })
    c.crawl(StackoverflowSpider)
    c.start()
Question: How can I force scrapy to make a callback using the url that got redirected?
As far as I understand, you want to keep scraping a link until it stops redirecting and you finally get HTTP status 200.
If yes, then you first have to remove handle_httpstatus_list = [301, 302] from your code.
Then create a CustomMiddleware in middlewares.py
import logging

class CustomMiddleware(object):
    def process_response(self, request, response, spider):
        if not response.css("[itemprop='text'] > h2"):
            logging.info('Desired text not found in %s, so re-scraping it' % request.url)
            req = request.copy()
            req.dont_filter = True
            return req
        if response.status in [301, 302]:
            original_url = request.meta.get('redirect_urls', [response.url])[0]
            logging.info('%s is redirecting, so re-scraping %s' % (request.url, original_url))
            request._url = original_url
            request.dont_filter = True
            return request
        return response
Then your spider should look something like this:
class StackoverflowSpider(scrapy.Spider):
    name = "stackoverflow"
    start_url = 'https://stackoverflow.com/questions/22937618/reference-what-does-this-regex-mean'

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'YOUR_PROJECT_NAME.middlewares.CustomMiddleware': 100,
        }
    }

    def start_requests(self):
        yield scrapy.Request(self.start_url, meta={"lead_link": self.start_url}, callback=self.parse)

    def parse(self, response):
        answer_title = response.css("[itemprop='text'] > h2::text").get()
        print(answer_title)
If you tell me which site you are scraping, then I can help you out; you can also email me, my address is on my profile.
You may want to see this.
If you need to prevent redirecting, it is possible via the request meta:
request = scrapy.Request(self.start_url,meta={"lead_link":self.start_url},callback=self.parse)
request.meta['dont_redirect'] = True
yield request
According to the documentation, this is the way to stop redirecting.
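Equivalently, the same flag can go straight into the meta dict passed to the Request constructor:
yield scrapy.Request(self.start_url, meta={"lead_link": self.start_url, "dont_redirect": True}, callback=self.parse)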
The following code is my attempt at doing Python requests through Tor. This works fine; however, I am interested in adding multithreading to this.
So I would like to simultaneously do about 10 different requests and process their outputs. What is the simplest and most efficient way to do this?
import requests

listofallonions = []

def onionrequest(url, onionid):
    onionid = onionid
    session = requests.session()
    session.proxies = {}
    session.proxies['http'] = 'socks5h://localhost:9050'
    session.proxies['https'] = 'socks5h://localhost:9050'
    #r = session.get('http://google.com')
    onionurlforrequest = "http://" + url
    try:
        r = session.get(onionurlforrequest, timeout=15)
    except:
        return None
    if r.status_code == 200:
        listofallonions.append(url)
I would recommend using the following packages to achieve this: asyncio, aiohttp, aiohttp_socks
Example code:
import asyncio
import aiohttp
from aiohttp_socks import ProxyConnector

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def main(urls):
    tasks = []
    connector = ProxyConnector.from_url('socks5://localhost:9150', rdns=True)
    async with aiohttp.ClientSession(connector=connector) as session:
        for url in urls:
            tasks.append(fetch(session, url))
        htmls = await asyncio.gather(*tasks)
        for html in htmls:
            print(html)

if __name__ == '__main__':
    urls = [
        'http://python.org',
        'https://google.com',
        ...
    ]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(urls))
Using asyncio can get a bit daunting at first, so you might need to practice for a while before you get the hang of it.
If you want a more in-depth explanation of the difference between synchronous and asynchronous, check out this question.
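If you would rather keep your existing requests-based function and just add threads, a simpler (though less scalable) alternative is a thread pool from the standard library. A minimal sketch, assuming the onionrequest function from your question and a list of onion hostnames (placeholder values):
from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl_all(urls):
    # Run up to 10 onionrequest calls at the same time in worker threads;
    # onionrequest itself appends the reachable urls to listofallonions.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(onionrequest, url, i) for i, url in enumerate(urls)]
        for future in as_completed(futures):
            future.result()  # re-raises any unexpected exception from the worker

crawl_all(['exampleaddress1.onion', 'exampleaddress2.onion'])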
I've written a script in python's scrapy to make proxied requests using any of the proxies newly generated by the get_proxies() method. I used the requests module to fetch the proxies in order to reuse them in the script. However, the problem is that the proxy my script chooses to use may not always be a good one, so sometimes it doesn't fetch a valid response.
How can I let my script keep trying with different proxies until there is a valid response?
My script so far:
import scrapy
import random
import requests
from itertools import cycle
from bs4 import BeautifulSoup
from scrapy.http.request import Request
from scrapy.crawler import CrawlerProcess

class ProxySpider(scrapy.Spider):
    name = "sslproxies"
    check_url = "https://stackoverflow.com/questions/tagged/web-scraping"
    proxy_link = "https://www.sslproxies.org/"

    def start_requests(self):
        proxylist = self.get_proxies()
        random.shuffle(proxylist)
        proxy_ip_port = next(cycle(proxylist))
        print(proxy_ip_port)  #Checking out the proxy address
        request = scrapy.Request(self.check_url, callback=self.parse, errback=self.errback_httpbin, dont_filter=True)
        request.meta['proxy'] = "http://{}".format(proxy_ip_port)
        yield request

    def get_proxies(self):
        response = requests.get(self.proxy_link)
        soup = BeautifulSoup(response.text, "lxml")
        proxy = [':'.join([item.select_one("td").text, item.select_one("td:nth-of-type(2)").text]) for item in soup.select("table.table tbody tr") if "yes" in item.text]
        return proxy

    def parse(self, response):
        print(response.meta.get("proxy"))  #Compare this to the earlier one whether they both are the same

    def errback_httpbin(self, failure):
        print("Failure: " + str(failure))

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'DOWNLOAD_TIMEOUT': 5,
    })
    c.crawl(ProxySpider)
    c.start()
PS: My intention is to seek a solution along the lines of what I've started here.
As we know, an HTTP response needs to pass through all the middlewares in order to reach the spider's methods.
It means that only requests with valid proxies can proceed to the spider's callback functions.
In order to use valid proxies we need to check ALL of the proxies first and after that choose only from the valid ones.
When our previously chosen proxy doesn't work anymore, we mark it as not valid and choose a new one from the remaining valid proxies in the spider's errback.
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.http.request import Request

class ProxySpider(scrapy.Spider):
    name = "sslproxies"
    check_url = "https://stackoverflow.com/questions/tagged/web-scraping"
    proxy_link = "https://www.sslproxies.org/"
    current_proxy = ""
    proxies = {}

    def start_requests(self):
        yield Request(self.proxy_link, callback=self.parse_proxies)

    def parse_proxies(self, response):
        for row in response.css("table#proxylisttable tbody tr"):
            if "yes" in row.extract():
                td = row.css("td::text").extract()
                self.proxies["http://{}".format(td[0] + ":" + td[1])] = {"valid": False}
        for proxy in self.proxies.keys():
            yield Request(self.check_url, callback=self.parse, errback=self.errback_httpbin,
                          meta={"proxy": proxy,
                                "download_slot": proxy},
                          dont_filter=True)

    def parse(self, response):
        if "proxy" in response.request.meta.keys():
            # As the script reaches this parse method we can mark the current proxy as valid
            self.proxies[response.request.meta["proxy"]]["valid"] = True
            print(response.meta.get("proxy"))
            if not self.current_proxy:
                # The scraper reaches this code line on the first valid response
                self.current_proxy = response.request.meta["proxy"]
                # yield Request(next_url, callback=self.parse_next,
                #               meta={"proxy": self.current_proxy,
                #                     "download_slot": self.current_proxy})

    def errback_httpbin(self, failure):
        if "proxy" in failure.request.meta.keys():
            proxy = failure.request.meta["proxy"]
            if proxy == self.current_proxy:
                # If the current proxy stops working after our usage,
                # mark it as not valid
                self.proxies[proxy]["valid"] = False
                for ip_port in self.proxies.keys():
                    # And choose a new valid proxy from self.proxies
                    if self.proxies[ip_port]["valid"]:
                        failure.request.meta["proxy"] = ip_port
                        failure.request.meta["download_slot"] = ip_port
                        self.current_proxy = ip_port
                        return failure.request
        print("Failure: " + str(failure))

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'COOKIES_ENABLED': False,
        'DOWNLOAD_TIMEOUT': 10,
        'DOWNLOAD_DELAY': 3,
    })
    c.crawl(ProxySpider)
    c.start()
You need to write a downloader middleware to install a process_exception hook; Scrapy calls this hook when an exception is raised. In the hook, you can return a new Request object with the dont_filter=True flag to let Scrapy reschedule the request until it succeeds.
In the meantime, you can verify the response extensively in a process_response hook: check the status code, the response content, etc., and reschedule the request as necessary.
In order to change the proxy easily, you should use the built-in HttpProxyMiddleware instead of tinkering with environ:
request.meta['proxy'] = proxy_address
Take a look at this project as an example.
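A minimal sketch of what such a middleware could look like. The class name, the settings path, and the assumption that the spider keeps a list of proxy URLs on self.proxies are placeholders, not Scrapy built-ins:
import random

class RetryChangeProxyMiddleware(object):
    # Enable via settings, e.g.:
    # DOWNLOADER_MIDDLEWARES = {'myproject.middlewares.RetryChangeProxyMiddleware': 550}

    def _retry_with_new_proxy(self, request, spider):
        # Assumes spider.proxies is a list like ["http://ip:port", ...]
        meta = dict(request.meta, proxy=random.choice(spider.proxies))
        return request.replace(meta=meta, dont_filter=True)

    def process_response(self, request, response, spider):
        # Reschedule anything that doesn't look like a valid page
        if response.status != 200 or not response.body:
            return self._retry_with_new_proxy(request, spider)
        return response

    def process_exception(self, request, exception, spider):
        # Connection errors, timeouts, etc. end up here; try again with another proxy
        return self._retry_with_new_proxy(request, spider)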
In order to create a scraper for a page with dynamically loaded content, requests-html provides modules to get the rendered page after the JS execution. However, when trying to use the AsyncHTMLSession by calling the arender() method in a multithreaded implementation, the generated HTML doesn't change.
E.g. in the URL provided in the source code, the table's HTML values are empty by default; after the script execution emulated by the arender() method, the values are expected to be inserted into the markup, yet no visible changes are noticed in the output.
from pprint import pprint
#from bs4 import BeautifulSoup
import asyncio
from timeit import default_timer
from concurrent.futures import ThreadPoolExecutor
from requests_html import AsyncHTMLSession, HTML

async def fetch(session, url):
    r = await session.get(url)
    await r.html.arender()
    return r.content

def parseWebpage(page):
    print(page)

async def get_data_asynchronous():
    urls = [
        'http://www.fpb.pt/fpb2014/!site.go?s=1&show=jog&id=258215'
    ]
    with ThreadPoolExecutor(max_workers=20) as executor:
        with AsyncHTMLSession() as session:
            # Set any session parameters here before calling `fetch`
            # Initialize the event loop
            loop = asyncio.get_event_loop()
            # Use list comprehension to create a list of
            # tasks to complete. The executor will run the `fetch`
            # function for each url in the urls list
            tasks = [
                await loop.run_in_executor(
                    executor,
                    fetch,
                    *(session, url)  # Allows us to pass in multiple arguments to `fetch`
                )
                for url in urls
            ]
            # Initializes the tasks to run and awaits their results
            for response in await asyncio.gather(*tasks):
                parseWebpage(response)

def main():
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(get_data_asynchronous())
    loop.run_until_complete(future)

main()
The rendered source after the execution of the rendering method is not under the content attribute of the response, but under raw_html on the HTML object. In this case, the value returned should be r.html.raw_html.
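In other words, a one-line change to the fetch coroutine from the question:
async def fetch(session, url):
    r = await session.get(url)
    await r.html.arender()   # run the page's JavaScript
    return r.html.raw_html   # rendered markup lives here, not in r.content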