In order to create a scraper for a page with dynamic loaded content, requests-html provides modules to get the rendered page after the JS execution. However, when trying to use the AsyncHTMLSession by calling the arender() method in a multithreaded implementation, the HTML generated doesn't change.
E.g. in the URL provided in the source code, the tables HTML values are empty by default and after the script execution, emulated by the arender() method it is expected to insert the values into the markup, though no visible changes are noticed in the source code.
from pprint import pprint
#from bs4 import BeautifulSoup
import asyncio
from timeit import default_timer
from concurrent.futures import ThreadPoolExecutor
from requests_html import AsyncHTMLSession, HTML
async def fetch(session, url):
r = await session.get(url)
await r.html.arender()
return r.content
def parseWebpage(page):
print(page)
async def get_data_asynchronous():
urls = [
'http://www.fpb.pt/fpb2014/!site.go?s=1&show=jog&id=258215'
]
with ThreadPoolExecutor(max_workers=20) as executor:
with AsyncHTMLSession() as session:
# Set any session parameters here before calling `fetch`
# Initialize the event loop
loop = asyncio.get_event_loop()
# Use list comprehension to create a list of
# tasks to complete. The executor will run the `fetch`
# function for each url in the urlslist
tasks = [
await loop.run_in_executor(
executor,
fetch,
*(session, url) # Allows us to pass in multiple arguments to `fetch`
)
for url in urls
]
# Initializes the tasks to run and awaits their results
for response in await asyncio.gather(*tasks):
parseWebpage(response)
def main():
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(get_data_asynchronous())
loop.run_until_complete(future)
main()
The source code representation post the execution of the rendering method is not under the content attribute of the session, but under raw_html in the HTML object. In this case, the value returned should be r.html.raw_html.
Related
My complete code:
import re
from bs4 import BeautifulSoup
import json
from typing import Any, Optional, cast
from inline_requests import inline_requests
from scrapy import Spider, Request
import asyncio
class QuotesSpider(Spider):
name = "scraper"
custom_settings = {
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
}
def start_requests(self):
codes= ["A", "B"]
url = "https://somesite.com/"
for i, code in enumerate(codes):
yield Request(url=url, callback=self.handle, meta={'cookiejar': i, "code": code})
#inline_requests
async def handle(self, response):
code = response.meta["code"]
cookiejar_ref = response.meta["cookiejar"]
# Parse csrfToken from the html
soup = BeautifulSoup(response.text, "html.parser")
relevant_script = [script.text for script in soup.find_all("script") if "csrfToken" in script.text]
matched_group = re.search(r'"csrfToken":"(.+?)"', relevant_script[0]) if len(relevant_script) > 0 else None
if matched_group is None:
raise Exception("Failed to extract csrfToken")
csrf_token = matched_group.group(1)
await asyncio.sleep(1) # <-- Need async because of this (and for more async related tasks after wards like calling websocket, etc)
# Initiate search
api = "https://somesite.com/search"
headers = { "x-csrf-token": csrf_token, 'Content-Type':'application/json' }
payload = {"a": 1}
response = yield Request(api, method='POST', headers=headers, meta={'cookiejar': cookiejar_ref}, body=json.dumps(payload))
lots_url = json.loads(response.text)["redirect"]
yield {
"lots_url": lots_url,
}
The issue is here (adding async keyword causes the function not to wait anymore):
async def handle(self, response: Response):
Don't want to do it the callback way as the code was becoming unreadble (there are lots of other functions that i have omitted here for breivety). Only way i found to call scrapy requests sequentially was to use scrapy-requets-inline but it stops working as soon as i add the async keyword on the function definition. Remove that and it works as expected (i.e waits for request to finish before proceeding further). Any way to make it wait with the async keyword?.
One alternative i know is to ditch scrapy entirely and use aiohttp but doing that will mean losing all the awesome features that scrapy provides (like rate limiting, logginh stats via scrapymd etc).
Thanks!
For my project I need to request a api and to store the result in a list. But the no. of requests I need to give more than 5000 with different body values. So, it take huge amount of time to complete. Is there is any way to parallely send the requests to complete the process quickly. I tried some threading code in this but I can't be able to figure out the ay to solve this.
import requests
res_list=[]
l=[19821, 29674 , 41983, 40234 ,.....] # Nearly 5000 items for now and the count may increase in future
for i in l:
URL ="https://api.something.com/?key=xxx-xxx-xxx&job_id={0}".format(i)
res = requests.get(url=URL)
res_list.append(res.text)
Probably, you just need to make your queries asynchronously. Something like that:
import asyncio
import aiohttp
NUMBERS = [1, 2, 3]
async def call():
async with aiohttp.ClientSession() as session:
for num in NUMBERS:
async with session.get(f'http://httpbin.org/get?{num}') as resp:
print(resp.status)
print(await resp.text())
if __name__ == '__main__':
loop = asyncio.new_event_loop()
loop.run_until_complete(call())
first time trying asyncio and aiohttp.
I have the following code that gets urls from the MySQL database for GET requests. Gets the responses and pushes them to MySQL database.
if __name__ == "__main__":
database_name = 'db_name'
company_name = 'company_name'
my_db = Db(database=database_name) # wrapper class for mysql.connector
urls_dict = my_db.get_rest_api_urls_for_specific_company(company_name=company_name)
update_id = my_db.get_updateid()
my_db.get_connection(dictionary=True)
for url in urls_dict:
url_id = url['id']
url = url['url']
table_name = my_db.make_sql_table_name_by_url(url)
insert_query = my_db.get_sql_for_insert(table_name)
r = requests.get(url=url).json() # make the request
args = [json.dumps(r), update_id, url_id]
my_db.db_execute_one(insert_query, args, close_conn=False)
my_db.close_conn()
This works fine but to speed it up How can I run it asynchronously?
I have looked here, here and here but can't seem to get my head around it.
Here is what I have tried based on #Raphael Medaer's answer.
async def fetch(url):
async with ClientSession() as session:
async with session.request(method='GET', url=url) as response:
json = await response.json()
return json
async def process(url, update_id):
table_name = await db.make_sql_table_name_by_url(url)
result = await fetch(url)
print(url, result)
if __name__ == "__main__":
"""Get urls from DB"""
db = Db(database="fuse_src")
urls = db.get_rest_api_urls() # This returns list of dictionary
update_id = db.get_updateid()
url_list = []
for url in urls:
url_list.append(url['url'])
print(update_id)
asyncio.get_event_loop().run_until_complete(
asyncio.gather(*[process(url, update_id) for url in url_list]))
I get an error in the process method:
TypeError: object str can't be used in 'await' expression
Not sure whats the problem?
Any code example specific to this would be highly appreciated.
Make this code asynchronous will not speed it up at all. Except if you consider to run a part of your code in "parallel". For instance you can run multiple (SQL or HTTP) queries in "same time". By doing asynchronous programming you will not execute code in "same time". Although you will get benefit of long IO tasks to execute other part of your code while you're waiting for IOs.
First of all, you'll have to use asynchronous libraries (instead of synchronous one).
mysql.connector could be replaced by aiomysql from aio-libs.
requests could be replaced by aiohttp
To execute multiple asynchronous tasks in "parallel" (for instance to replace your loop for url in urls_dict:), you have to read carefully about asyncio tasks and function gather.
I will not (re)write your code in an asynchronous way, however here are a few lines of pseudo code which could help you:
async def process(url):
result = await fetch(url)
await db.commit(result)
if __name__ == "__main__":
db = MyDbConnection()
urls = await db.fetch_all_urls()
asyncio.get_event_loop().run_until_complete(
asyncio.gather(*[process(url) for url in urls]))
I am trying to open a multiple web session and save the data into CSV, Have written my code using for loop & requests.get options, But it's taking so long to access 90 number of Web location. Can anyone let me know how the whole process run in parallel for loc_var:
The code is working fine, only the issue is running one by one for loc_var, and took so long time.
Want to access all the for loop loc_var URL in parallel and write operation of CSV
Below is the Code:
import pandas as pd
import numpy as np
import os
import requests
import datetime
import zipfile
t=datetime.date.today()-datetime.timedelta(2)
server = [("A","web1",":5000","username=usr&password=p7Tdfr")]
'''List of all web_ips'''
web_1 = ["Web1","Web2","Web3","Web4","Web5","Web6","Web7","Web8","Web9","Web10","Web11","Web12","Web13","Web14","Web15"]
'''List of All location'''
loc_var =["post1","post2","post3","post4","post5","post6","post7","post8","post9","post10","post11","post12","post13","post14","post15","post16","post17","post18"]
for s,web,port,usr in server:
login_url='http://'+web+port+'/api/v1/system/login/?'+usr
print (login_url)
s= requests.session()
login_response = s.post(login_url)
print("login Responce",login_response)
#Start access the Web for Loc_variable
for mkt in loc_var:
#output is CSV File
com_actions_url='http://'+web+port+'/api/v1/3E+date(%5C%22'+str(t)+'%5C%22)and+location+%3D%3D+%27'+mkt+'%27%22&page_size=-1&format=%22csv%22'
print("com_action_url",com_actions_url)
r = s.get(com_actions_url)
print("action",r)
if r.ok == True:
with open(os.path.join("/home/Reports_DC/", "relation_%s.csv"%mkt),'wb') as f:
f.write(r.content)
# If loc is not aceesble try with another Web_1 List
if r.ok == False:
while r.ok == False:
for web_2 in web_1:
login_url='http://'+web_2+port+'/api/v1/system/login/?'+usr
com_actions_url='http://'+web_2+port+'/api/v1/3E+date(%5C%22'+str(t)+'%5C%22)and+location+%3D%3D+%27'+mkt+'%27%22&page_size=-1&format=%22csv%22'
login_response = s.post(login_url)
print("login Responce",login_response)
print("com_action_url",com_actions_url)
r = s.get(com_actions_url)
if r.ok == True:
with open(os.path.join("/home/Reports_DC/", "relation_%s.csv"%mkt),'wb') as f:
f.write(r.content)
break
There are multiple approaches that you can take to make concurrent HTTP requests. Two that I've used are (1) multiple threads with concurrent.futures.ThreadPoolExecutor or (2) send the requests asynchronously using asyncio/aiohttp.
To use a thread pool to send your requests in parallel, you would first generate a list of URLs that you want to fetch in parallel (in your case generate a list of login_urls and com_action_urls), and then you would request all of the URLs concurrently as follows:
from concurrent.futures import ThreadPoolExecutor
import requests
def fetch(url):
page = requests.get(url)
return page.text
# Catch HTTP errors/exceptions here
pool = ThreadPoolExecutor(max_workers=5)
urls = ['http://www.google.com', 'http://www.yahoo.com', 'http://www.bing.com'] # Create a list of urls
for page in pool.map(fetch, urls):
# Do whatever you want with the results ...
print(page[0:100])
Using asyncio/aiohttp is generally faster than the threaded approach above, but the learning curve is more complicated. Here is a simple example (Python 3.7+):
import asyncio
import aiohttp
urls = ['http://www.google.com', 'http://www.yahoo.com', 'http://www.bing.com']
async def fetch(session, url):
async with session.get(url) as resp:
return await resp.text()
# Catch HTTP errors/exceptions here
async def fetch_concurrent(urls):
loop = asyncio.get_event_loop()
async with aiohttp.ClientSession() as session:
tasks = []
for u in urls:
tasks.append(loop.create_task(fetch(session, u)))
for result in asyncio.as_completed(tasks):
page = await result
#Do whatever you want with results
print(page[0:100])
asyncio.run(fetch_concurrent(urls))
But unless you are going to be making a huge number of requests, the threaded approach will likely be sufficient (and way easier to implement).
In python I want to create an async method in a class that create a thread without blocking the main thread. When the new thread finish, I return a value from that function/thread.
For example the class is used for retrieve some information from web pages. I want run parallel processing in a function that download the page and return a object.
class WebDown:
def display(self, url):
print 'display(): ' + content
def download(self, url):
thread = Thread(target=self.get_info)
# thread join
print 'download(): ' + content
# return the info
def get_info(self, url):
# download page
# retrieve info
return info
if __name__ == '__main__':
wd = WebDown()
ret = wd.download('http://...')
wd.display('http://...')
I this example, in order I call download() for retrieve the info, after display() for print others information. The print output should be
display(): foo, bar, ....
download(): blue, red, ....
One way to write asynchronous, non blocking code in python involves using Python's Twisted. Twisted does not rely on multithreading but uses multiprocessing instead. It gives you convenient way to create Deferred objects, adding callbacks and errbacks to them. The example you give would look like this in Twisted, I'm using treq (Twisted Requests) library which makes generating requests a little quicker and easier:
from treq import get
from twisted.internet import reactor
class WebAsync(object):
def download(self, url):
request = get(url)
request.addCallback(self.deliver_body)
def deliver_body(self, response):
deferred = response.text()
deferred.addCallback(self.display)
return deferred
def display(self, response_body):
print response_body
reactor.stop()
if __name__ == "__main__":
web_client = WebAsync()
web_client.download("http://httpbin.org/html")
reactor.run()
Both 'download' and 'deliver_body' methods return deferreds, you add callbacks to them that are going to be executed when results is available.
I would simply use request and gevent called grequests.
import grequests
>>> urls = [
'http://...',
'http://...'
]
>>> rs = (grequests.get(u) for u in urls)
>>> grequests.map(rs)
[<Response [200]>, <Response [200]>]