How can I make my Python requests code use asyncio and aiohttp - python

def get_ship_position(ship_id):
    import requests
    url = "https://www.marinetraffic.com/en/vesselDetails/vesselInfo/shipid:{}".format(ship_id)
    headers = {
        "accept": "application/json",
        "accept-encoding": "gzip, deflate",
        "user-agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest"
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return response.json()

def main():
    from time import perf_counter
    start = perf_counter()
    i = 7550
    while i <= 9999:
        i += 1
        try:
            data = get_ship_position(i)
            with open("marinetraffic.txt", "a", encoding="utf-8") as bos:
                print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}x{}\t{}\t{}\t{}\t{}\t{}\t{}".format(data["mmsi"], data["imo"],data["name"],data["nameAis"],data["type"],data["typeSpecific"],data["yearBuilt"],data["length"],data["breadth"],data["callsign"],data["country"],data["deadweight"],data["grossTonnage"],data["homePort"],data["status"]), file=bos)
            print(i, "Yazdı")
        except Exception:
            print(i, "Hata")
            with open("marinetraffichata.txt", "a", encoding="utf-8") as hata:
                print("Hata", i, file=hata)
    stop = perf_counter()
    print("çalışılan süre:", stop - start, "saniye")
    # return 0

if __name__ == "__main__":
    import sys
    sys.exit(main())
I am progressing very slowly with the requests module; how can I make the code run faster? I've seen the aiohttp and asyncio modules and they are really fast. How can I adapt my own code?

Using asyncio and aiohttp is certainly one way of doing concurrent URL retrievals. But I am wondering if it is the best way, given that (1) you are already using requests and (2) you want to retrieve 2450 URLs, but not necessarily all at the same time.
By using a multithreading pool of size N, you would have N threads concurrently retrieving up to N URLs. By setting an "appropriate" value for N you can control the degree of concurrency. Performance could improve by increasing N, but at some point, as N grows larger, performance could start to decrease. There is also the possibility that the website might think you are performing a denial-of-service attack by making so many concurrent requests.
In the code below I am using a value of 64 for N and creating a requests.Session instance for doing the retrievals, which should also improve performance. I am using method multiprocessing.pool.ThreadPool.imap to process the returned data elements as they become available. This method returns an iterator that, when iterated, returns the next return value from your worker function, get_ship_position. However, I am explicitly calling next to iterate so that I can individually handle exceptions raised by get_ship_position. If I were instead to iterate with for data in pool.imap(worker, range(7551, 10_001)), then once an exception was raised by an invocation of get_ship_position I would not be able to continue iterating over the subsequent results.
def get_ship_position(session, ship_id):
    url = "https://www.marinetraffic.com/en/vesselDetails/vesselInfo/shipid:{}".format(ship_id)
    response = session.get(url)
    response.raise_for_status()
    return response.json()

def main():
    from time import perf_counter
    import requests
    from multiprocessing.pool import ThreadPool
    from functools import partial

    start = perf_counter()
    with requests.Session() as session:
        headers = {
            "accept": "application/json",
            "accept-encoding": "gzip, deflate",
            "user-agent": "Mozilla/5.0",
            "x-requested-with": "XMLHttpRequest"
        }
        session.headers = headers
        with ThreadPool(64) as pool:
            worker = partial(get_ship_position, session)
            it = pool.imap(worker, range(7551, 10_001))
            i = 7550
            with open("marinetraffic.txt", "a", encoding="utf-8") as f:
                while True:
                    i += 1
                    try:
                        data = next(it)
                        print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}x{}\t{}\t{}\t{}\t{}\t{}\t{}".format(data["mmsi"], data["imo"],data["name"],data["nameAis"],data["type"],data["typeSpecific"],data["yearBuilt"],data["length"],data["breadth"],data["callsign"],data["country"],data["deadweight"],data["grossTonnage"],data["homePort"],data["status"]), file=f)
                        print(i, "Yazdı")
                    except StopIteration:
                        break
                    except Exception:
                        print(i, "Hata")
                        print("Hata", i, file=f)
    stop = perf_counter()
    print("çalışılan süre:", stop - start, "saniye")
    return 0

if __name__ == "__main__":
    import sys
    sys.exit(main())
Using asyncio and aiohttp
The following code uses asyncio and aiohttp. A semaphore set to 64 limits the number of coroutines that can run concurrently, so you can control how many concurrent GET requests are made. Again, this number can be adjusted to see how performance varies.
import asyncio

async def get_ship_position(session, ship_id):
    url = "https://www.marinetraffic.com/en/vesselDetails/vesselInfo/shipid:{}".format(ship_id)
    async with session.get(url) as response:
        status = response.status
        if status != 200:
            raise Exception(f'Bad status: {status}')
        return await response.json()

async def bounded_fetch(sem, session, ship_id):
    async with sem:
        result = await get_ship_position(session, ship_id)
        return result

async def main():
    from time import perf_counter
    import aiohttp

    start = perf_counter()
    headers = {
        "accept": "application/json",
        "accept-encoding": "gzip, deflate",
        "user-agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest"
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        sem = asyncio.Semaphore(64)
        responses = await asyncio.gather(*(bounded_fetch(sem, session, i) for i in range(7551, 10_001)), return_exceptions=True)
        with open("marinetraffic.txt", "a", encoding="utf-8") as f:
            for i, data in enumerate(responses, start=7551):
                if isinstance(data, Exception):
                    print(i, "Hata")
                    print("Hata", i, file=f)
                else:
                    print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}x{}\t{}\t{}\t{}\t{}\t{}\t{}".format(data["mmsi"], data["imo"],data["name"],data["nameAis"],data["type"],data["typeSpecific"],data["yearBuilt"],data["length"],data["breadth"],data["callsign"],data["country"],data["deadweight"],data["grossTonnage"],data["homePort"],data["status"]), file=f)
                    print(i, "Yazdı")
    stop = perf_counter()
    print("çalışılan süre:", stop - start, "saniye")
    return 0

if __name__ == "__main__":
    import sys
    rc = asyncio.get_event_loop().run_until_complete(main())
    sys.exit(rc)
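As a side note, on recent Python versions the preferred entry point is asyncio.run, which creates and closes the event loop for you instead of relying on asyncio.get_event_loop() implicitly creating one (behavior that is deprecated). A minimal sketch, reusing main() from above:

if __name__ == "__main__":
    import sys
    # asyncio.run sets up the event loop, runs main() to completion and closes the loop
    sys.exit(asyncio.run(main()))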
Note
With either version successive runs can produce widely different run times.
Update
If you want to write results to the output file as they are returned, instead of after all coroutines have completed, then try:
... # code omitted
import aiofiles

async with aiohttp.ClientSession(headers=headers) as session:
    sem = asyncio.Semaphore(64)
    tasks = [asyncio.create_task(bounded_fetch(sem, session, i)) for i in range(7551, 10_001)]
    async with aiofiles.open("marinetraffic.txt", "w", encoding="utf-8") as f:
        for i, task in enumerate(tasks, start=7551):
            try:
                await task
                data = task.result()
                record = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}x{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(data["mmsi"], data["imo"],data["name"],data["nameAis"],data["type"],data["typeSpecific"],data["yearBuilt"],data["length"],data["breadth"],data["callsign"],data["country"],data["deadweight"],data["grossTonnage"],data["homePort"],data["status"])
                await f.write(record)
                print(i, "Yazdı")
            except Exception:
                print(i, "Hata")
                # aiofiles file objects must be awaited; print(..., file=f) would not work here
                await f.write("Hata {}\n".format(i))
... # code omitted

Related

Python aiohttp how to handle client session token timeout

I am making several hundred HTTP requests using aiohttp. I am relatively new to the async world but have managed to get the basic code working.
First I generate a token. Then I make aiohttp calls using this token.
The token has a validity of 30 minutes, so I am assuming that if my calls run for more than 30 minutes they will start failing.
How do I update my code to plug in a new token after 30 minutes and then resume the remaining calls? This is my first time implementing async calls, so I'm relatively clueless on how to handle this.
async def a_get_all_user_details(urls):
    results = []
    connector = aiohttp.TCPConnector(limit=70)
    timeout = aiohttp.ClientTimeout(total=None, connect=300, sock_connect=300, sock_read=None)
    auth_token = get_token()  # token expires in 30 mins
    headers = {
        'accept': 'application/json',
        'Authorization': 'Bearer ' + auth_token
    }
    async with aiohttp.ClientSession(trust_env=True, headers=headers, connector=connector, timeout=timeout) as session:
        for url in urls:
            result = asyncio.ensure_future(a_get_user_details(url, session))
            results.append(result)
        responses = await asyncio.gather(*results)
        return responses

def main():
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(a_get_all_user_details(search_urls))
    user_details = loop.run_until_complete(future)
Maybe there's a simpler way to do it, but here's my take:
The problem is that there are many connections in flight when you want to refresh the session. If you close the session and create a new one, the active connections that are still waiting for data throw an exception.
In my example I keep a list of all sessions, and when the time arrives I simply create a new session (with a new token) and append it to the list. New connections will use the last (freshest) session.
At the end of the script I close all sessions.
import aiohttp
import asyncio

sessions = []

async def get_token():
    return "XYZ"

async def refresh_session():
    # this function periodically refreshes the token every X sec
    connector = aiohttp.TCPConnector(limit=3)
    timeout = aiohttp.ClientTimeout(
        total=None, connect=300, sock_connect=300, sock_read=None
    )
    while True:
        headers = {
            "accept": "application/json",
            "Authorization": "Bearer " + await get_token(),
        }
        sessions.append(
            aiohttp.ClientSession(
                trust_env=True,
                headers=headers,
                connector=connector,
                timeout=timeout,
            )
        )
        print("New session created")
        await asyncio.sleep(5)  # every 5 seconds refresh session

async def get_user_detail(url):
    # wait for session to show up:
    while not sessions:
        await asyncio.sleep(1)
    # use last (freshest) session:
    async with sessions[-1].get(url) as resp:
        assert resp.status == 200
        html = await resp.text()
        return f"some result for {url} length of data {len(html)}"

async def get_user_details(urls):
    results = []
    for url in urls:
        results.append(asyncio.ensure_future(get_user_detail(url)))
    responses = await asyncio.gather(*results)
    return responses

async def main():
    # some urls to gather:
    urls = [
        "https://www.google.com",
        "https://www.microsoft.com",
        "https://www.yahoo.com",
    ] * 30
    t1 = asyncio.create_task(refresh_session())
    t2 = asyncio.create_task(get_user_details(urls))
    # finish when first task ends (in this case get_user_details())
    done, _ = await asyncio.wait([t1, t2], return_when=asyncio.FIRST_COMPLETED)
    # close all opened sessions:
    for s in sessions:
        await s.close()
    # print the result
    print("Domains gathered ", len(done.pop().result()))

if __name__ == "__main__":
    asyncio.run(main())
This prints:
New session created
New session created
Domains gathered 90

Async check of several hundred proxies

I need to check several hundred proxy servers and get the number that are not working. My script for this:
import urllib.request
import socket

net = ['http://192.168.1.1:8080',
       'http://192.168.1.2:8080',
       'http://192.168.1.3:8080',
       'http://192.168.1.4:8080',
       'http://192.168.1.5:8080',
       'http://192.168.1.6:8080',
       'http://192.168.1.7:8080',
       'http://192.168.1.8:8080',
       'http://192.168.1.9:8080',
       'http://192.168.1.10:8080']

fail = 0
socket.setdefaulttimeout(3)
for x in net:
    try:
        print(x)
        proxy = urllib.request.ProxyHandler({'http': (x)})
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve('http://google.com')
    except IOError:
        print("Connection error")
        fail += 1
print(fail)
The proxies are in the list; I have given a simplified version.
It takes 55 seconds to check 250 working proxies. I can't wait that long; I need to increase the execution speed.
How can this be done using async?
This should give you an idea of how to approach it. You have to wrap the various connection blocks in try/except yourself (see the sketch after the code below).
NOTE: This code is not tested, as I do not have any way of doing so.
import asyncio, aiohttp

def returnPartionedList(inputlist, x=100):
    # Returns: Original list split into segments of x.
    return [inputlist[i:i + x] for i in range(0, len(inputlist), x)]

async def TestProxy(url, proxy, session):
    async with session.get(url, proxy=proxy, timeout=3) as response:
        if response.status == 200:
            _ = await response.text()
            return proxy

async def TestProxies(listofproxies):
    returnResults = []
    url = "https://google.com"  # Test proxy with this url
    ProxyPartitions = returnPartionedList(listofproxies, 20)  # Rate limit 20 per second
    for partition in ProxyPartitions:
        ProxyTasks = []
        async with aiohttp.ClientSession() as session:
            for proxy in partition:
                ProxyTasks.append(asyncio.create_task(TestProxy(url, proxy, session)))
            results = await asyncio.gather(*ProxyTasks, return_exceptions=False)
            if results:
                for result in results:
                    if result:
                        returnResults.append(result)
        await asyncio.sleep(1)
    return returnResults

async def main():
    listofproxies = [
        'http://10.10.1.1:8080',
        'http://10.10.1.2:8080',
        'http://10.10.1.3:8080',
        'http://10.10.1.4:8080',
        'http://10.10.1.5:8080',
        'http://10.10.1.6:8080',
        'http://10.10.1.7:8080',
        'http://10.10.1.8:8080',
        'http://10.10.1.9:8080',
        'http://10.10.1.10:8080'
    ]
    test_proxies = await TestProxies(listofproxies)
    print(test_proxies)

if __name__ == "__main__":
    asyncio.run(main())
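As for the try/except wrapping mentioned above, here is one minimal sketch of how TestProxy could be hardened; the exception types chosen (asyncio.TimeoutError and aiohttp.ClientError) and the None return for dead proxies are my assumptions, not part of the answer:

# Hypothetical variant of TestProxy with the error handling filled in.
async def TestProxy(url, proxy, session):
    try:
        async with session.get(url, proxy=proxy, timeout=3) as response:
            if response.status == 200:
                _ = await response.text()
                return proxy
    except (asyncio.TimeoutError, aiohttp.ClientError):
        return None  # treat timeouts and connection failures as a non-working proxy

Since the original question asks for the number of proxies that are not working, that count would then be len(listofproxies) minus the number of non-None results returned by TestProxies.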

Python requests.get and threading with different servers

I am working on a simple web scraper and right now I'm trying to implement some multithreading. While my code works as intended with some servers (reducing execution time vastly), my primary goal is to make it work with a few specific ones. So when I try it with the ones in the sites list, I get performance as if I were still using sequential code. Any guesses what can cause this?
import requests, time
from bs4 import BeautifulSoup
from threading import Thread
from random import choice

# Enable to get some logging info
# ---------------------------------
# import logging
# import http.client
# http.client.HTTPConnection.debuglevel = 1
# logging.basicConfig()
# logging.getLogger().setLevel(logging.DEBUG)
# requests_log = logging.getLogger("requests.packages.urllib3")
# requests_log.setLevel(logging.DEBUG)
# requests_log.propagate = True

sites = [
    "https://pikabu.ru/community/blackhumour",
    "https://www.pikabu.ru/tag/%D0%9C%D0%B5%D0%BC%D1%8B/hot"
]

class Pikabu_Downloader(Thread):
    def __init__(self, url, name, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url = url
        self.name = name
        self.begin = time.time()

    def run(self):
        print("Beginning with thread number", self.name, ",", round(time.time() - self.begin, 4), " seconds has passed")
        html_data = self._get_html()
        print("After requests.get with thread number", self.name, ",", round(time.time() - self.begin, 4), " seconds has passed")
        if html_data is None:
            return
        self.soup = BeautifulSoup(html_data, "html.parser")
        print("After making soup with thread number", self.name, ",", round(time.time() - self.begin, 4), " seconds has passed")

    def _get_html(self):
        try:
            user_agents = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64)', 'AppleWebKit/537.36 (KHTML, like Gecko)', 'Chrome/74.0.3729.169', 'Safari/537.36')
            print(f"Go {self.url}...")
            res = requests.get(self.url, headers={'User-Agent': choice(user_agents)}, stream=True)  # , allow_redirects=False)
        except Exception as exc:
            print(exc)
        else:
            return res.text

test = "https://readingbooks.site/read/?name=1984&"

def download():
    pikabu_urls = []
    for url in sites:
        pikabu = [url + "?page=" + str(x) for x in range(1, 10)]
        pikabu_urls = pikabu_urls + pikabu
    pikabu_dls = [Pikabu_Downloader(url=page, name=str(i)) for i, page in enumerate(pikabu_urls)]
    # Comment out the line above and enable the two lines below to get results from the test server
    # tests = [test + "page=" + str(x) for x in range(1, pages)]
    # pikabu_dls = [Pikabu_Downloader(url=page, name=str(i)) for i, page in enumerate(tests)]
    for pikabu_dl in pikabu_dls:
        pikabu_dl.start()
    for pikabu_dl in pikabu_dls:
        pikabu_dl.join()

download()
And the result is something like
...
After requests.get with thread number 1 , 1.6904 seconds has passed
After making soup with thread number 1 , 1.7554 seconds has passed
After requests.get with thread number 2 , 2.9805 seconds has passed
After making soup with thread number 2 , 3.0455 seconds has passed
After requests.get with thread number 3 , 4.3225 seconds has passed
After making soup with thread number 3 , 4.3895 seconds has passed
...
What can cause such latency between thread executions? I was hoping each thread would finish almost simultaneously and produce more asynchronous output, like with the test server. If I set a timeout of 5 seconds inside requests.get, most of the requests won't even work.
After investigating your case, I would point out some issues you have encountered:
Do not print inside parallel tasks; it creates a bottleneck rendering output to the screen.
A large number of tasks is not always good for performance; it depends on how much your memory can handle. Imagine you have 1000 links: do you have to create 1000 task objects? No, only a placeholder for 5-20 of them, by leveraging a thread pool.
The server is also a factor. Download size, low bandwidth, network conditions, distance, and so on cause late responses that slow your machine down. Your sites are heavy; each request seems to take 1-3000 ms, so when you test with a small batch (20 links) it feels like it runs sequentially.
Your code is running in parallel, since you did a little trick to put it on different threads, but it is not quite right, because we need a fully async library, such as asyncio and aiohttp. aiohttp takes care of numerous async requests in coroutines, whereas asyncio provides the syntax and operates on your main thread.
I did a small experiment on Colab. Please note that I didn't use asyncio and aiohttp on Colab because it got stuck there, but I have implemented them on several projects before and they worked faster than the fastest method below.
The second function is your implementation.
import urllib.request
import concurrent.futures  # needed for running_thread_pool()
from threading import Thread
import time, requests
from random import choice

user_agents = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64)', 'AppleWebKit/537.36 (KHTML, like Gecko)', 'Chrome/74.0.3729.169', 'Safari/537.36')
timeout = 5
sites = [
    "https://pikabu.ru/community/blackhumour",
    "https://www.pikabu.ru/tag/%D0%9C%D0%B5%D0%BC%D1%8B/hot"
]
URLS = []
for url in sites:
    pikabu = [url + "?page=" + str(x) for x in range(25)]
    URLS.extend(pikabu)

def convert_to_threads():
    return [Thread(target=load_url, args=(page, timeout)) for page in URLS]

def running_threads():
    threads = convert_to_threads()
    start = time.time()
    for i in threads:
        i.start()
    for i in threads:
        i.join()
    print(f'Finish with {len(URLS)} requests {time.time() - start}')

def load_url(url, timeout):
    res = requests.get(url, headers={'User-Agent': choice(user_agents)}, stream=True)  # , allow_redirects=False)
    return res.text

def running_sequence():
    start = time.time()
    for url in URLS:
        load_url(url, timeout)
    print(f'Finish with {len(URLS)} requests {time.time() - start}')

def running_thread_pool():
    start = time.time()
    # We can use a with statement to ensure threads are cleaned up promptly
    with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
        # Start the load operations and mark each future with its URL
        future_to_url = {executor.submit(load_url, url, timeout): url for url in URLS}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
            # else:
            #     print('%r page is %d length' % (url, len(data)))
    print(f'Finish with {len(URLS)} requests {time.time() - start}')
In short, I recommend you use a thread pool (preferred on Colab), or asyncio and aiohttp (not on Colab), to gain speed; a minimal aiohttp sketch of the latter follows.
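Since asyncio and aiohttp are recommended but not shown above, here is a minimal sketch of how the same URLS list could be fetched that way, reusing user_agents, URLS, choice and time from the snippet above; the function names, the semaphore and the limit of 15 concurrent requests are my assumptions for illustration:

import asyncio
import aiohttp

async def load_url_async(session, url, sem):
    # fetch one page while respecting the concurrency cap
    async with sem:
        async with session.get(url, headers={'User-Agent': choice(user_agents)}) as resp:
            return await resp.text()

async def running_aiohttp():
    start = time.time()
    sem = asyncio.Semaphore(15)  # at most 15 requests in flight, like max_workers=15
    async with aiohttp.ClientSession() as session:
        tasks = [load_url_async(session, url, sem) for url in URLS]
        await asyncio.gather(*tasks, return_exceptions=True)
    print(f'Finish with {len(URLS)} requests {time.time() - start}')

# asyncio.run(running_aiohttp())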

Problem in using async and await in python

I am trying to use async and await. I am still new to it and I cannot figure out what I am doing wrong.
import requests
import bs4
import colorama
from colorama import Fore
import time
import datetime
import asyncio

async def get_html(episode_number: int) -> str:
    print(Fore.YELLOW + f"Getting HTML for episode {episode_number}", flush=True)
    url = f'https://talkpython.fm/{episode_number}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            resp.raise_for_status()
            return await resp.text
            await resp.raise_for_status()
            # return await resp.text

def get_title(html: str, episode_number: int) -> str:
    print(colorama.Fore.CYAN + f"Getting TITLE for episode {episode_number}", flush=True)
    soup = bs4.BeautifulSoup(html, 'html.parser')
    header = soup.select_one('h1')
    if not header:
        return "MISSING"
    return header.text.strip()

def main():
    t0 = datetime.datetime.now()
    print(colorama.Fore.WHITE + ' App started.', flush=True)
    loop = asyncio.get_event_loop()
    final_task = asyncio.gather(loop)
    # get_title_range()
    dt = datetime.datetime.now() - t0
    loop.run_until_complete(final_task)
    print(colorama.Fore.CYAN + "Done. " + ' App exiting total time: {:,.2f} sec.'.format(dt.total_seconds()), flush=True)

def get_title_range():
    for n in range(150, 170):
        html = get_html(n)
        title = get_title(html, n)
        print(Fore.CYAN + f"Title found: {title}", flush=True)

if __name__ == '__main__':
    main()
It looks like you're not initializing tasks for your event loop to run on. I typically follow this pattern:
async def main():
    headers = {'Connection': 'keep-alive', 'Content-Type': 'application/json', 'Authorization': auth}
    url = 'some-api.com/post-request-something'
    # We use a session to take advantage of tcp keep-alive
    timeout = aiohttp.ClientTimeout(total=10000)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        tasks = [async_wrap(session, q, url, headers) for q in queue]
        # gather literally 'gathers' all the tasks and schedules them in the event loop
        await asyncio.gather(*tasks, return_exceptions=True)

if __name__ == '__main__':
    ts = time()
    # Create the asyncio event loop - from the main function
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        # Lets avoid an unclosed loop running a DDoS attack on ourselves
        loop.close()
    logger.info('Took %s seconds to complete', time() - ts)
Note the line that takes the gathered tasks and schedules them as coroutines in the main event loop:
loop.run_until_complete(main())
and then this, which calls my function async_wrap() for each record I wanted to send with the HTTP client (which I had stored in a list); in your case it would call your asynchronous function get_html() for each episode number from get_title_range():
tasks = [async_wrap(session, q, url, headers) for q in queue] # -> mine
await asyncio.gather(*tasks, return_exceptions=True) # -> gather those tasks!
tasks = [get_html(episode_number=episode) for episode in list_of_episode_nums] # -> yours
await asyncio.gather(*tasks, return_exceptions=True) # -> gather those tasks!
Hope this helps you shore up some details, but unfortunately asynchronous code can be quite a headache, requiring lots of trial and error. A fuller sketch of how your code could be restructured along these lines follows.
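Putting the pieces together for the original code, a minimal sketch might look like this; the refactored get_title_range() and the use of asyncio.run are my assumptions, following the pattern above rather than the answer verbatim:

import asyncio
import aiohttp
import bs4

async def get_html(session: aiohttp.ClientSession, episode_number: int) -> str:
    url = f'https://talkpython.fm/{episode_number}'
    async with session.get(url) as resp:
        resp.raise_for_status()
        return await resp.text()  # note: text() is a coroutine method, not an attribute

def get_title(html: str) -> str:
    header = bs4.BeautifulSoup(html, 'html.parser').select_one('h1')
    return header.text.strip() if header else "MISSING"

async def get_title_range():
    async with aiohttp.ClientSession() as session:
        tasks = [get_html(session, n) for n in range(150, 170)]
        pages = await asyncio.gather(*tasks, return_exceptions=True)
    for n, html in zip(range(150, 170), pages):
        if isinstance(html, Exception):
            print(f"Episode {n} failed: {html}")
        else:
            print(f"Title found: {get_title(html)}")

if __name__ == '__main__':
    asyncio.run(get_title_range())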

aiohttp: rate limiting requests-per-second by domain

I am writing a web crawler that is running parallel fetches for many different domains. I want to limit the number of requests-per-second that are made to each individual domain, but I do not care about the total number of connections that are open, or the total requests per second that are made across all domains. I want to maximize the number of open connections and requests-per-second overall, while limiting the number of requests-per-second made to individual domains.
All of the currently existing examples I can find either (1) limit the number of open connections or (2) limit the total number of requests-per-second made in the fetch loop. Examples include:
aiohttp: rate limiting parallel requests
aiohttp: set maximum number of requests per second
Neither of them does what I am asking, which is to limit requests-per-second on a per-domain basis. The first question only answers how to limit requests-per-second overall. The second one doesn't even have answers to the actual question (the OP asks about requests per second and the answers all talk about limiting the number of connections).
Here is the code that I tried, using a simple rate limiter I made for a synchronous version, which doesn't work when the DomainTimer code is run in an async event loop:
from collections import defaultdict
from datetime import datetime, timedelta
import asyncio
import async_timeout
import aiohttp
from urllib.parse import urlparse
from queue import Queue, Empty

from HTMLProcessing import processHTML
import URLFilters

SEED_URLS = ['http://www.bbc.co.uk', 'http://www.news.google.com']
url_queue = Queue()
for u in SEED_URLS:
    url_queue.put(u)

# number of pages to download per run of crawlConcurrent()
BATCH_SIZE = 100
DELAY = timedelta(seconds=1.0)  # delay between requests from single domain, in seconds
HTTP_HEADERS = {'Referer': 'http://www.google.com',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}

class DomainTimer():
    def __init__(self):
        self.timer = None

    def resetTimer(self):
        self.timer = datetime.now()

    def delayExceeded(self, delay):
        if not self.timer:  # We haven't fetched this before
            return True
        if (datetime.now() - self.timer) >= delay:
            return True
        else:
            return False

crawl_history = defaultdict(dict)  # given a URL, when is last time crawled?
domain_timers = defaultdict(DomainTimer)

async def fetch(session, url):
    domain = urlparse(url).netloc
    print('here fetching ' + url + "\n")
    dt = domain_timers[domain]

    if dt.delayExceeded(DELAY) or not dt:
        with async_timeout.timeout(10):
            try:
                dt.resetTimer()  # reset domain timer
                async with session.get(url, headers=HTTP_HEADERS) as response:
                    if response.status == 200:
                        crawl_history[url] = datetime.now()
                        html = await response.text()
                        return {'url': url, 'html': html}
                    else:
                        # log HTTP response, put into crawl_history so
                        # we don't attempt to fetch again
                        print(url + " failed with response: " + str(response.status) + "\n")
                        return {'url': url, 'http_status': response.status}
            except aiohttp.ClientConnectionError as e:
                print("Connection failed " + str(e))
            except aiohttp.ClientPayloadError as e:
                print("Received bad data from server # " + url + "\n")
    else:  # Delay hasn't passed yet: skip for now & put at end of queue
        url_queue.put(url)
        return None

async def fetch_all(urls):
    """Launch requests for all web pages."""
    tasks = []
    async with aiohttp.ClientSession() as session:
        for url in urls:
            task = asyncio.ensure_future(fetch(session, url))
            tasks.append(task)  # create list of tasks
        return await asyncio.gather(*tasks)  # gather task responses

def batch_crawl():
    """Launch requests for all web pages."""
    start_time = datetime.now()

    # Here we build the list of URLs to crawl for this batch
    urls = []
    for i in range(BATCH_SIZE):
        try:
            next_url = url_queue.get_nowait()  # get next URL from queue
            urls.append(next_url)
        except Empty:
            print("Processed all items in URL queue.\n")
            break

    loop = asyncio.get_event_loop()
    asyncio.set_event_loop(loop)
    pages = loop.run_until_complete(fetch_all(urls))
    crawl_time = (datetime.now() - start_time).seconds
    print("Crawl completed. Fetched " + str(len(pages)) + " pages in " + str(crawl_time) + " seconds.\n")
    return pages

def parse_html(pages):
    """Parse the HTML for each page downloaded in this batch."""
    start_time = datetime.now()
    results = {}
    for p in pages:
        if not p or not p['html']:
            print("Received empty page")
            continue
        else:
            url, html = p['url'], p['html']
            results[url] = processHTML(html)
    processing_time = (datetime.now() - start_time).seconds
    print("HTML processing finished. Processed " + str(len(results)) + " pages in " + str(processing_time) + " seconds.\n")
    return results

def extract_new_links(results):
    """Extract links from the parsed results."""
    # later we could track where links were from here, anchor text, etc,
    # and weight queue priority based on that
    links = []
    for k in results.keys():
        new_urls = [l['href'] for l in results[k]['links']]
        for u in new_urls:
            if u not in crawl_history.keys():
                links.append(u)
    return links

def filterURLs(urls):
    urls = URLFilters.filterDuplicates(urls)
    urls = URLFilters.filterBlacklistedDomains(urls)
    return urls

def run_batch():
    pages = batch_crawl()
    results = parse_html(pages)
    links = extract_new_links(results)
    for l in filterURLs(links):
        url_queue.put(l)
    return results
There are no errors or exceptions thrown, and the rate-limiting code works fine for synchronous fetches, but the DomainTimer has no apparent effect when run in the async event loop. The delay of one request per second per domain is not upheld...
How would I modify this synchronous rate-limiting code to work within the async event loop? Thanks!
It's hard to debug your code since it contains a lot of unrelated stuff; it's easier to show the idea on a new, simple example.
Main idea:
write your own Semaphore-like class using __aenter__ and __aexit__ that accepts a url (domain)
use a domain-specific Lock to prevent multiple simultaneous requests to the same domain
sleep before allowing the next request, based on the domain's last request time and its allowed RPS
track the time of each domain's last request
Code:
import asyncio
import aiohttp
from urllib.parse import urlparse
from collections import defaultdict

class Limiter:
    # domain -> req/sec:
    _limits = {
        'httpbin.org': 4,
        'eu.httpbin.org': 1,
    }

    # domain -> its lock:
    _locks = defaultdict(lambda: asyncio.Lock())

    # domain -> its last request time
    _times = defaultdict(lambda: 0)

    def __init__(self, url):
        self._host = urlparse(url).hostname

    async def __aenter__(self):
        await self._lock.acquire()  # released in __aexit__
        to_wait = self._to_wait_before_request()
        print(f'Wait {to_wait} sec before next request to {self._host}')
        await asyncio.sleep(to_wait)

    async def __aexit__(self, *args):
        print(f'Request to {self._host} just finished')
        self._update_request_time()
        self._lock.release()

    @property
    def _lock(self):
        """Lock that prevents multiple requests to same host."""
        return self._locks[self._host]

    def _to_wait_before_request(self):
        """What time we need to wait before request to host."""
        request_time = self._times[self._host]
        request_delay = 1 / self._limits[self._host]
        now = asyncio.get_event_loop().time()
        to_wait = request_time + request_delay - now
        to_wait = max(0, to_wait)
        return to_wait

    def _update_request_time(self):
        now = asyncio.get_event_loop().time()
        self._times[self._host] = now

# request that uses Limiter instead of Semaphore:
async def get(url):
    async with Limiter(url):
        async with aiohttp.ClientSession() as session:  # TODO reuse session for different requests.
            async with session.get(url) as resp:
                return await resp.text()

# main:
async def main():
    coros = [
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
    ]
    await asyncio.gather(*coros)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
I developed a library named octopus-api (https://pypi.org/project/octopus-api/) that enables you to rate limit and set the number of connections to the endpoint, using aiohttp under the hood. Its goal is to simplify all the aiohttp setup needed.
Here is an example of how to use it, where get_ethereum is the user-defined request function. It could also have been a web crawler request function or whatever fits:
from octopus_api import TentacleSession, OctopusApi
from typing import Dict, List

if __name__ == '__main__':
    async def get_ethereum(session: TentacleSession, request: Dict):
        async with session.get(url=request["url"], params=request["params"]) as response:
            body = await response.json()
            return body

    client = OctopusApi(rate=50, resolution="sec", connections=6)
    result: List = client.execute(requests_list=[{
        "url": "https://api.pro.coinbase.com/products/ETH-EUR/candles?granularity=900&start=2021-12-04T00:00:00Z&end=2021-12-04T00:00:00Z",
        "params": {}}] * 1000, func=get_ethereum)
    print(result)
The TentacleSession works the same way as writing POST, GET, PUT and PATCH requests with aiohttp.ClientSession; a small hypothetical POST example follows.
Let me know if it helps your issue related to rate limits and connections for crawling.
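Based on that claim, a POST request would presumably be written like the GET example above, mirroring aiohttp.ClientSession.post; the request shape below ("url" and "data" keys) is purely a hypothetical sketch, not taken from the library's documentation:

async def post_example(session: TentacleSession, request: Dict):
    # hypothetical POST, assuming the same calling convention as aiohttp.ClientSession.post(url, data=...)
    async with session.post(url=request["url"], data=request["data"]) as response:
        return await response.json()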
