Python async API requests in batches - python

I m tryin to make async API calls this way:
func to send request:
async def get_data(client, postdata):
res = await client.post(url=_url, headers=_headers, data=postdata)
return res
func to parse JSON:
async def parse_res(client, postdata):
res = await get_data(client, postdata)
if bool(json.loads(res.text)['suggestions']):
_oks = <...grab some JSON fields...>
else:
_oks = {}
return _oks
I wrap this two funcs in MAIN():
async def main(_jobs):
async with httpx.AsyncClient() as client:
batch = []
calls = []
for job in _jobs:
_postdata = '{ "query": "'+ job + '" }'
calls.append(asyncio.create_task(parse_res(client, _postdata)))
batch = await asyncio.gather(*calls)
return batch
and then just run MAIN()
But the API can handle about 30-50 fast (nearly simultaneous requests or throws 429 HTTP error).
So i need to send batches of 30 calls and process 10 000 requests in chunks.
How do i process 10 000 (ten thousand) API calls in batches of 30 ?

One library that comes in handy here is funcy. It offers various helper for working with sequences. One of that would be chunks. This allows you to split a sequence into chunks of equal size or fewer in the end if the totalsize does not divide.
from funcy import chunks
result = []
for job_chunk in chunks(30, _jobs):
calls = [parse_res(client, '{ "query": "'+ job + '" }') for job un job_chunk]
batch = await asyncio.gather(*calls)
result.extend(batch)

You could use Simon Hawe's answer, however here's a different approach without the usage of external libraries
Use asyncio.Semaphore to limit the amount of calls made concurrently, when the semaphore is released it will let another function to run.
import asyncio
sem = asyncio.Semaphore(30) # no. of simultaneous requests
async def get_data(client, postdata):
async with sem:
res = client.post(url=_url, headers=_headers, data=postdata)
return res
async def parse_res(client, postdata):
res = await get_data(client, postdata)
if bool(json.loads(res.text)['suggestions']):
_oks = <...grab some JSON fields...>
else:
_oks = {}
return _oks
async def main(_jobs: int):
async with httpx.AsyncClient() as client:
postdata = '{"query": "' + job + '"}'
calls = [
asyncio.create_task(parse_res(client, postdata)
for _ in range(_jobs)
]
return await asyncio.gather(*calls)

Related

Python aiohttp how to handle client session token timeout

I am making several 100's of http request using aiohttp. I am relatively new to the async world but have managed to get the basic code working.
First I am generating a token. Then, making aiohttp calls using this token.
Token has a validity of 30 mins. So I am assuming if my calls run for more than 30 mins then they will start failing.
How do I update my code to plug-in a new token after 30 mins then resume the remaining calls. This my first time implementing async calls, so relatively clueless on how to handle this.
async def a_get_all_user_details(urls):
results = []
connector = aiohttp.TCPConnector(limit=70)
timeout = aiohttp.ClientTimeout(total=None, connect=300, sock_connect=300, sock_read=None)
auth_token = get_token() # token expires in 30 mins
headers = {
'accept': 'application/json',
'Authorization': 'Bearer ' + auth_token
}
async with aiohttp.ClientSession(trust_env=True, headers=headers, connector=connector, timeout=timeout) as session:
for url in urls:
result = asyncio.ensure_future(a_get_user_details(url, session))
results.append(result)
responses = await asyncio.gather(*results)
return responses
def main():
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(a_get_all_user_details(search_urls))
user_details = loop.run_until_complete(future)
Maybe there's simpler way to do it but here's my take:
The problem is that there are many connections in the fly when you want to refresh session. When you close the session and create new one, active connections which are waiting for data throw an exception.
In my example I have a list of all sessions and when time arrives I simply create new session (with new token) and append it to the list. The new connections will use the last (freshest) session.
At the end of script I close all sessions.
import aiohttp
import asyncio
sessions = []
async def get_token():
return "XYZ"
async def refresh_session():
# this function periodically refreshes the token every X sec
connector = aiohttp.TCPConnector(limit=3)
timeout = aiohttp.ClientTimeout(
total=None, connect=300, sock_connect=300, sock_read=None
)
while True:
headers = {
"accept": "application/json",
"Authorization": "Bearer " + await get_token(),
}
sessions.append(
aiohttp.ClientSession(
trust_env=True,
headers=headers,
connector=connector,
timeout=timeout,
)
)
print("New session created")
await asyncio.sleep(5) # every 5 seconds refresh session
async def get_user_detail(url):
# wait for session to show up:
while not sessions:
await asyncio.sleep(1)
# use last (freshest) session:
async with sessions[-1].get(url) as resp:
assert resp.status == 200
html = await resp.text()
return f"some result for {url} length of data {len(html)}"
async def get_user_details(urls):
results = []
for url in urls:
results.append(asyncio.ensure_future(get_user_detail(url)))
responses = await asyncio.gather(*results)
return responses
async def main():
# some urls to gather:
urls = [
"https://www.google.com",
"https://www.microsoft.com",
"https://www.yahoo.com",
] * 30
t1 = asyncio.create_task(refresh_session())
t2 = asyncio.create_task(get_user_details(urls))
# finish when first task ends (in this case get_user_details())
done, _ = await asyncio.wait([t1, t2], return_when=asyncio.FIRST_COMPLETED)
# close all opened sessions:
for s in sessions:
await s.close()
# print the result
print("Domains gathered ", len(done.pop().result()))
if __name__ == "__main__":
asyncio.run(main())
This prints:
New session created
New session created
Domains gathered 90

Using boto3 to await a synchronous lambda invocation

I want to invoke a lambda function synchronously (request - response) but want to use python async-await to await the response.
response = await client.invoke('my-func', InvocationType='RequestResponse', Payload='...'))
I found a kind of solution here but it is cumbersome and from 2016.
Is there a better approach today?
I found a way of doing it by manually running the invoke function on the asyncio event loop:
import asyncio
import concurrent
import boto3
import json
import botocore
class LambdaClient():
def __init__(self, concurrency: int = 20):
self.executor = concurrent.futures.ThreadPoolExecutor(
max_workers=concurrency,
)
client_config = botocore.config.Config(
max_pool_connections=concurrency
)
self.client = boto3.client('lambda', config=client_config)
async def invoke_async(self, snapshot):
loop = asyncio.get_running_loop()
result = await loop.run_in_executor(self.executor, lambda: self.invoke(snapshot))
return result
def invoke(self, snapshot):
payload = {
'path': '/calculate/value',
'body': json.dumps(snapshot)
}
b = bytes(json.dumps(payload), encoding='utf8')
response = self.client.invoke(
FunctionName='function-name',
InvocationType='RequestResponse',
LogType='None',
Payload=b)
if 'StatusCode' not in response or response['StatusCode'] != 200:
raise ValueError(f'Lambda invocation failed with response {response}')
output = response["Payload"].read()
return output

Run two concurrent task groups asynchronously with asyncio

I am trying to write a program using asyncio and was oriented towards this blog post. What I am trying to do is fetch some JSON data concurrently. For one input data frame. however, I would like to process the requested data further as soon as it becomes available.
So basically there are two groups of tasks:
process data in df1 concurrently and do some calc once JSON returned
process data in df2 concurrently
They are more or less independent of each other, but I want to run the group of tasks concurrently as well. Once both task groups are finished I want to further process them.
My question is if my implementation is properly designed in terms of asyncio patterns, where I just used two gather statements? Or whether this is the wrong concept? Here is a scatch:
import asyncio
import aiohttp
from aiohttp import ClientSession
async def fetch_json(url: str, session: ClientSession, data: json.dumps) -> Dict:
resp = await session.get(url=url, headers={"content-type": "application/json"}, data=data)
resp.raise_for_status()
logger.info("Got response [%s] for URL: %s", resp.status, url)
json = await resp.json()
return json
async def some_calc(url: str, session: ClientSession, data: json.dumps):
res = await fetch_json(url=url, session=session, data=data)
return [float(x) for x in res]
async def process_data(df: Dict, url: str, session: ClientSession):
async with session:
tasks = []
for data in df:
try:
if df1:
task = some_calc(url=url, session=session, data=data)
else:
task = fetch_json(url=url, session=session, data=data)
except Exception as e:
# ...
tasks.append(
task
)
res = await asyncio.gather(*tasks)
return res
async def bulk_execute(df1, df2):
url = "http://some.url/"
async with ClientSession() as session:
res = await asyncio.gather(process_data(df1, url, session), process_data(df2, url, session))
return res
if __name__ == "__main__":
res = asyncio.run(bulk_execute(df1, df2))

aiohttp: rate limiting requests-per-second by domain

I am writing a web crawler that is running parallel fetches for many different domains. I want to limit the number of requests-per-second that are made to each individual domain, but I do not care about the total number of connections that are open, or the total requests per second that are made across all domains. I want to maximize the number of open connections and requests-per-second overall, while limiting the number of requests-per-second made to individual domains.
All of the currently existing examples I can find either (1) limit the number of open connections or (2) limit the total number of requests-per-second made in the fetch loop. Examples include:
aiohttp: rate limiting parallel requests
aiohttp: set maximum number of requests per second
Neither of them do what I am requesting which is to limit requests-per-second on a per domain basis. The first question only answers how to limit requests-per-second overall. The second one doesn't even have answers to the actual question (the OP asks about requests per second and the answers all talk about limiting # of connections).
Here is the code that I tried, using a simple rate limiter I made for a synchronous version, which doesn't work when the DomainTimer code is run in an async event loop:
from collections import defaultdict
from datetime import datetime, timedelta
import asyncio
import async_timeout
import aiohttp
from urllib.parse import urlparse
from queue import Queue, Empty
from HTMLProcessing import processHTML
import URLFilters
SEED_URLS = ['http://www.bbc.co.uk', 'http://www.news.google.com']
url_queue = Queue()
for u in SEED_URLS:
url_queue.put(u)
# number of pages to download per run of crawlConcurrent()
BATCH_SIZE = 100
DELAY = timedelta(seconds = 1.0) # delay between requests from single domain, in seconds
HTTP_HEADERS = {'Referer': 'http://www.google.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}
class DomainTimer():
def __init__(self):
self.timer = None
def resetTimer(self):
self.timer = datetime.now()
def delayExceeded(self, delay):
if not self.timer: #We haven't fetched this before
return True
if (datetime.now() - self.timer) >= delay:
return True
else:
return False
crawl_history = defaultdict(dict) # given a URL, when is last time crawled?
domain_timers = defaultdict(DomainTimer)
async def fetch(session, url):
domain = urlparse(url).netloc
print('here fetching ' + url + "\n")
dt = domain_timers[domain]
if dt.delayExceeded(DELAY) or not dt:
with async_timeout.timeout(10):
try:
dt.resetTimer() # reset domain timer
async with session.get(url, headers=HTTP_HEADERS) as response:
if response.status == 200:
crawl_history[url] = datetime.now()
html = await response.text()
return {'url': url, 'html': html}
else:
# log HTTP response, put into crawl_history so
# we don't attempt to fetch again
print(url + " failed with response: " + str(response.status) + "\n")
return {'url': url, 'http_status': response.status}
except aiohttp.ClientConnectionError as e:
print("Connection failed " + str(e))
except aiohttp.ClientPayloadError as e:
print("Recieved bad data from server # " + url + "\n")
else: # Delay hasn't passed yet: skip for now & put # end of q
url_queue.put(url);
return None
async def fetch_all(urls):
"""Launch requests for all web pages."""
tasks = []
async with aiohttp.ClientSession() as session:
for url in urls:
task = asyncio.ensure_future(fetch(session, url))
tasks.append(task) # create list of tasks
return await asyncio.gather(*tasks) # gather task responses
def batch_crawl():
"""Launch requests for all web pages."""
start_time = datetime.now()
# Here we build the list of URLs to crawl for this batch
urls = []
for i in range(BATCH_SIZE):
try:
next_url = url_queue.get_nowait() # get next URL from queue
urls.append(next_url)
except Empty:
print("Processed all items in URL queue.\n")
break;
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)
pages = loop.run_until_complete(fetch_all(urls))
crawl_time = (datetime.now() - start_time).seconds
print("Crawl completed. Fetched " + str(len(pages)) + " pages in " + str(crawl_time) + " seconds.\n")
return pages
def parse_html(pages):
""" Parse the HTML for each page downloaded in this batch"""
start_time = datetime.now()
results = {}
for p in pages:
if not p or not p['html']:
print("Received empty page")
continue
else:
url, html = p['url'], p['html']
results[url] = processHTML(html)
processing_time = (datetime.now() - start_time).seconds
print("HTML processing finished. Processed " + str(len(results)) + " pages in " + str(processing_time) + " seconds.\n")
return results
def extract_new_links(results):
"""Extract links from """
# later we could track where links were from here, anchor text, etc,
# and weight queue priority based on that
links = []
for k in results.keys():
new_urls = [l['href'] for l in results[k]['links']]
for u in new_urls:
if u not in crawl_history.keys():
links.append(u)
return links
def filterURLs(urls):
urls = URLFilters.filterDuplicates(urls)
urls = URLFilters.filterBlacklistedDomains(urls)
return urls
def run_batch():
pages = batch_crawl()
results = parse_html(pages)
links = extract_new_links(results)
for l in filterURLs(links):
url_queue.put(l)
return results
There are no errors or exceptions thrown, and the rate-limiting code works fine in for synchronous fetches, but the DomainTimer has no apparent effect when run in async loop. The delay of one request-per-second per domain is not upheld...
How would I modify this synchronous rate limiting code to work within the async event loop? Thanks!
It's hard to debug your code since it contains many unrelated stuff, it's easier to show idea on a new simple example.
Main idea:
write your Semaphore-like class using __aenter__, __aexit__
that accepts url (domain)
use domain-specific Lock to prevent multiple requests to the same domain
sleep before allowing next request according to domain's last request and RPS
track time of last request for each domain
Code:
import asyncio
import aiohttp
from urllib.parse import urlparse
from collections import defaultdict
class Limiter:
# domain -> req/sec:
_limits = {
'httpbin.org': 4,
'eu.httpbin.org': 1,
}
# domain -> it's lock:
_locks = defaultdict(lambda: asyncio.Lock())
# domain -> it's last request time
_times = defaultdict(lambda: 0)
def __init__(self, url):
self._host = urlparse(url).hostname
async def __aenter__(self):
await self._lock
to_wait = self._to_wait_before_request()
print(f'Wait {to_wait} sec before next request to {self._host}')
await asyncio.sleep(to_wait)
async def __aexit__(self, *args):
print(f'Request to {self._host} just finished')
self._update_request_time()
self._lock.release()
#property
def _lock(self):
"""Lock that prevents multiple requests to same host."""
return self._locks[self._host]
def _to_wait_before_request(self):
"""What time we need to wait before request to host."""
request_time = self._times[self._host]
request_delay = 1 / self._limits[self._host]
now = asyncio.get_event_loop().time()
to_wait = request_time + request_delay - now
to_wait = max(0, to_wait)
return to_wait
def _update_request_time(self):
now = asyncio.get_event_loop().time()
self._times[self._host] = now
# request that uses Limiter instead of Semaphore:
async def get(url):
async with Limiter(url):
async with aiohttp.ClientSession() as session: # TODO reuse session for different requests.
async with session.get(url) as resp:
return await resp.text()
# main:
async def main():
coros = [
get('http://httpbin.org/get'),
get('http://httpbin.org/get'),
get('http://httpbin.org/get'),
get('http://httpbin.org/get'),
get('http://httpbin.org/get'),
get('http://eu.httpbin.org/get'),
get('http://eu.httpbin.org/get'),
get('http://eu.httpbin.org/get'),
get('http://eu.httpbin.org/get'),
get('http://eu.httpbin.org/get'),
]
await asyncio.gather(*coros)
if __name__ == '__main__':
loop = asyncio.get_event_loop()
try:
loop.run_until_complete(main())
finally:
loop.run_until_complete(loop.shutdown_asyncgens())
loop.close()
I developed a library named octopus-api (https://pypi.org/project/octopus-api/), that enables you to rate limit and set the number of connections to the endpoint using aiohttp under the hood. The goal of it is to simplify all the aiohttp setup needed.
Here is an example of how to use it, where the get_ethereum is the user-defined request function. It could have also been a web crawler function request or what ever fits:
from octopus_api import TentacleSession, OctopusApi
from typing import Dict, List
if __name__ == '__main__':
async def get_ethereum(session: TentacleSession, request: Dict):
async with session.get(url=request["url"], params=request["params"]) as response:
body = await response.json()
return body
client = OctopusApi(rate=50, resolution="sec", connections=6)
result: List = client.execute(requests_list=[{
"url": "https://api.pro.coinbase.com/products/ETH-EUR/candles?granularity=900&start=2021-12-04T00:00:00Z&end=2021-12-04T00:00:00Z",
"params": {}}] * 1000, func=get_ethereum)
print(result)
The TentacleSession works the same as how you write POST, GET, PUT and PATCH for aiohttp.ClientSession.
Let me know if it helps your issue related to rate limits and connection for crawling.

Making 1 milion requests with aiohttp/asyncio - literally

I followed up this tutorial: https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html and everything works fine when I am doing like 50 000 requests. But I need to do 1 milion API calls and then I have problem with this code:
url = "http://some_url.com/?id={}"
tasks = set()
sem = asyncio.Semaphore(MAX_SIM_CONNS)
for i in range(1, LAST_ID + 1):
task = asyncio.ensure_future(bound_fetch(sem, url.format(i)))
tasks.add(task)
responses = asyncio.gather(*tasks)
return await responses
Because Python needs to create 1 milion tasks, it basically just lags and then prints Killed message in terminal. Is there any way to use a generator insted of pre-made set (or list) of urls? Thanks.
Schedule all 1 million tasks at once
This is the code you are talking about. It takes up to 3 GB RAM so it is easily possible that it will be terminated by the operating system if you have low free memory.
import asyncio
from aiohttp import ClientSession
MAX_SIM_CONNS = 50
LAST_ID = 10**6
async def fetch(url, session):
async with session.get(url) as response:
return await response.read()
async def bound_fetch(sem, url, session):
async with sem:
await fetch(url, session)
async def fetch_all():
url = "http://localhost:8080/?id={}"
tasks = set()
async with ClientSession() as session:
sem = asyncio.Semaphore(MAX_SIM_CONNS)
for i in range(1, LAST_ID + 1):
task = asyncio.create_task(bound_fetch(sem, url.format(i), session))
tasks.add(task)
return await asyncio.gather(*tasks)
if __name__ == '__main__':
asyncio.run(fetch_all())
Use queue to streamline the work
This is my suggestion how to use asyncio.Queue to pass URLs to worker tasks. The queue is filled as-needed, there is no pre-made list of URLs.
It takes only 30 MB RAM :)
import asyncio
from aiohttp import ClientSession
MAX_SIM_CONNS = 50
LAST_ID = 10**6
async def fetch(url, session):
async with session.get(url) as response:
return await response.read()
async def fetch_worker(url_queue):
async with ClientSession() as session:
while True:
url = await url_queue.get()
try:
if url is None:
# all work is done
return
response = await fetch(url, session)
# ...do something with the response
finally:
url_queue.task_done()
# calling task_done() is necessary for the url_queue.join() to work correctly
async def fetch_all():
url = "http://localhost:8080/?id={}"
url_queue = asyncio.Queue(maxsize=100)
worker_tasks = []
for i in range(MAX_SIM_CONNS):
wt = asyncio.create_task(fetch_worker(url_queue))
worker_tasks.append(wt)
for i in range(1, LAST_ID + 1):
await url_queue.put(url.format(i))
for i in range(MAX_SIM_CONNS):
# tell the workers that the work is done
await url_queue.put(None)
await url_queue.join()
await asyncio.gather(*worker_tasks)
if __name__ == '__main__':
asyncio.run(fetch_all())
asyncio is memory bound (like any other program). You can not spawn more task that memory can hold. My guess is that you hit a memory limit. Check dmesg for more information.
1 millions RPS doesn't mean there is 1M tasks. A task can do several request in the same second.

Categories

Resources