I am trying to write a program using asyncio, following this blog post. What I am trying to do is fetch some JSON data concurrently. For one of the input data frames, however, I would like to process the requested data further as soon as it becomes available.
So basically there are two groups of tasks:
process data in df1 concurrently and do some calc once JSON returned
process data in df2 concurrently
They are more or less independent of each other, but I want to run the group of tasks concurrently as well. Once both task groups are finished I want to further process them.
My question is whether my implementation is properly designed in terms of asyncio patterns, given that I just used two gather statements, or whether this is the wrong concept. Here is a sketch:
import asyncio
import aiohttp
from aiohttp import ClientSession
async def fetch_json(url: str, session: ClientSession, data: str) -> Dict:
    """Send a GET request carrying a JSON body and return the decoded JSON reply.

    Args:
        url: Endpoint to query.
        session: Open aiohttp session. It is owned by the caller and is not
            closed here.
        data: Already-serialized JSON payload (e.g. the output of
            ``json.dumps``).

    Returns:
        The decoded JSON document of the response.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an error
            status (raised by ``raise_for_status``).
    """
    request_headers = {"content-type": "application/json"}
    # ``async with`` releases the connection back to the pool even when
    # ``raise_for_status`` or the JSON decoding fails.
    async with session.get(url=url, headers=request_headers, data=data) as resp:
        resp.raise_for_status()
        logger.info("Got response [%s] for URL: %s", resp.status, url)
        # Named ``payload`` rather than ``json`` so the stdlib module of the
        # same name is not shadowed inside this function.
        payload = await resp.json()
    return payload
async def some_calc(url: str, session: ClientSession, data: json.dumps):
    """Fetch the JSON reply for *data* and convert each of its items to ``float``."""
    raw_values = await fetch_json(url=url, session=session, data=data)
    return list(map(float, raw_values))
async def process_data(df: Dict, url: str, session: ClientSession, worker=None):
    """Issue one request per entry of *df* concurrently and collect the results.

    Args:
        df: Iterable of request payloads.
        url: Endpoint every payload is sent to.
        session: Open aiohttp session shared with other callers. It is NOT
            closed here, because a sibling ``process_data`` call may still be
            using it.
        worker: Coroutine function invoked as
            ``worker(url=..., session=..., data=...)`` for each payload.
            Defaults to ``fetch_json``.

    Returns:
        List of per-payload results, in the iteration order of *df*.
    """
    if worker is None:
        worker = fetch_json
    # Creating a coroutine object cannot fail with a request error; errors
    # only surface when gather() awaits it, so no try/except is needed here.
    pending = [worker(url=url, session=session, data=data) for data in df]
    return await asyncio.gather(*pending)


async def bulk_execute(df1, df2):
    """Process both data frames concurrently over one shared session.

    The df1 group post-processes every reply with ``some_calc``; the df2
    group only fetches the JSON.

    Returns:
        A two-item list: the df1 results followed by the df2 results.
    """
    url = "http://some.url/"
    # The session is opened and closed exactly once, here, after both groups
    # have finished.
    async with ClientSession() as session:
        res = await asyncio.gather(
            process_data(df1, url, session, worker=some_calc),
            process_data(df2, url, session),
        )
    return res
if __name__ == "__main__":
    # NOTE(review): df1 and df2 are not defined anywhere in this sketch; they
    # must be created before this point.
    # res holds the df1 group results followed by the df2 group results.
    res = asyncio.run(bulk_execute(df1, df2))
Related
Please explain to me when should I use asyncio.ensure_future and create_task and what's the difference.
We can write like this:
async def run(r):
    """Fetch ``r`` numbered URLs concurrently through one session and print the results."""
    url = "http://localhost:8080/{}"
    # A single ClientSession is used for every request so that connections
    # are kept alive and reused.
    async with ClientSession() as session:
        tasks = [
            asyncio.ensure_future(fetch(url.format(i), session))
            for i in range(r)
        ]
        responses = await asyncio.gather(*tasks)
        # every response body is now available in ``responses``
        print(responses)
Or like this:
async def get_all(*names: str):
    """Request every named pokemon concurrently and print one line per result."""
    started_at = time.time()
    # Wrapping each coroutine in a task starts all the requests concurrently.
    tasks = []
    for name in names:
        tasks.append(asyncio.create_task(get_pokemon(name)))
    # Wait until ALL of them have finished.
    results = await asyncio.gather(*tasks)
    for result in results:
        if not result:
            print(f"❌ No data found for...")
            continue
        pokemon = parse_pokemon(result)
        print(f"💁 {pokemon.name} is of type(s) {','.join(pokemon.types)}")
I have a block of code that works well for fetching data via API requests to a specific site. The issue is that the site returns at most 50 objects per call, so I have to make multiple calls. As a result, the fetching takes too long (sometimes I have to wait nearly 20 minutes). Here is my code:
import concurrent.futures
import requests
supply = 3000
offset = 0
token_ids = []


def get_token_ids(an):
    """Append the asset's token id to ``token_ids`` if it has no sell orders, no last sale and zero sales."""
    if str(an['sell_orders']) != 'None':
        return
    if str(an['last_sale']) != 'None':
        return
    if str(an['num_sales']) != '0':
        return
    token_ids.append(str(an['token_id']))


# Pages are requested strictly one after another: each iteration blocks on
# the HTTP call and on the thread pool before the next offset is requested.
while offset < supply:
    url = "url_1" + str(offset)
    response = requests.request("GET", url)
    a = response.json()
    assets = a["assets"]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Leaving the ``with`` block waits for every submitted call.
        results = [executor.submit(get_token_ids, asset) for asset in assets]
    offset += 50

print(token_ids)
The problem is that the code waits for all the work on one request to finish before making the next request. I am thinking of an improvement where, as soon as a request is sent, the offset is increased and the loop proceeds to the next request, so that I don't have to wait. I don't know how to do it; I studied 'asyncio', but it is still a challenge for me. Can anyone help me with this?
The problem is that Requests is not asynchronous code, so each of its network calls blocks the loop until its completion.
https://docs.python-requests.org/en/latest/user/advanced/#blocking-or-non-blocking
Therefore, it is better to try asynchronous libraries, for example, aiohttp:
https://github.com/aio-libs/aiohttp
Example
Create session for all connections:
async with aiohttp.ClientSession() as session:
and run all desired requests:
results = await asyncio.gather(
*[get_data(session, offset) for offset in range(0, supply, step)]
)
Here the requests are executed asynchronously: session.get(url) retrieves only the response headers, and the body is read by await response.json():
async with session.get(url) as response:
a = await response.json()
In the main block, the event loop is started and main() is run to completion:
# asyncio.run() creates the event loop, runs main() to completion and closes
# the loop again. Calling asyncio.get_event_loop() when no loop is running
# is deprecated.
token_ids = asyncio.run(main())
The full code
import aiohttp
import asyncio
async def get_data(session, offset):
    """Download the asset page at *offset* and return the matching token ids.

    An asset matches when it has no sell orders, no last sale and zero
    sales; its token id is returned as a string.
    """
    page_url = "url_1" + str(offset)
    async with session.get(page_url) as response:
        # The body is only read (and decoded) here.
        payload = await response.json()

    def _is_untouched(asset):
        return (
            str(asset['sell_orders']) == 'None'
            and str(asset['last_sale']) == 'None'
            and str(asset['num_sales']) == '0'
        )

    return [
        str(asset['token_id'])
        for asset in payload["assets"]
        if _is_untouched(asset)
    ]
async def main():
    """Fetch every page concurrently and return all matching token ids as one flat list."""
    supply = 3000
    step = 50
    # One session is created for all connections and handed to get_data().
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(
            *(get_data(session, offset) for offset in range(0, supply, step))
        )
    # Flatten the per-page lists, keeping page order.
    return [token_id for page in pages for token_id in page]
if __name__ == "__main__":
    # The asynchronous part starts and ends with this call: asyncio.run()
    # creates the event loop, runs main() to completion and closes the loop.
    # Calling asyncio.get_event_loop() when no loop is running is deprecated.
    token_ids = asyncio.run(main())
    print(token_ids)
New to asyncio, using it to try to make a very large number of API requests more quickly and store the data returned from each request in a dict. I think I've got the syntax of using asyncio and aiohttp figured out mostly, because I'm getting the data returned but I'm having a hard time taking that data and storing it in a dict.
# Ids to query; one request is sent per id.
search_ids = [1,2,3,4,5,6,7,8,9,10]
# Accumulator filled by main(); only "Display Name" is written in this snippet.
stats = {"Date":[],"Instance ID":[],"Result":[],"Display Name":[]}
async def main():
    """Request the data for every id in ``search_ids`` concurrently and collect display names into ``stats``."""
    async with aiohttp.ClientSession() as session:
        tasks = []
        for search_id in search_ids:
            # ensure_future() wraps the coroutine in a task.
            task = asyncio.ensure_future(get_data(session, search_id))
            tasks.append(task)
        # gather() returns a list with one result per task, in task order.
        responses = await asyncio.gather(*tasks)
        # NOTE(review): ``responses`` is a list, so indexing it with the
        # string 'entries' raises
        # "TypeError: list indices must be integers or slices, not str".
        for y in responses['entries']:
            stats['Display Name'].append(y['player']['UserInfo']['displayName'])
async def get_data(session, search_id):
    """Request the record for *search_id* and return the ``'Response'`` part of the JSON reply."""
    endpoint = 'https://www.myapi.com/{}'.format(search_id)
    async with session.get(endpoint, headers=HEADERS, ssl=False) as response:
        payload = await response.json()
        return payload['Response']
# Script entry point: run main() to completion.
asyncio.run(main())
So when I run this, I get an error: TypeError: list indices must be integers or slices, not str
Which makes it seem to me as if the data that has been returned isn't iterable. However, I've looked at what's being returned and it's exactly what I'm expecting it to be. So much that if I change the code to look like this instead, it works fine:
# Ids to query; one request is sent per id.
search_ids = [1,2,3,4,5,6,7,8,9,10]
# Accumulator filled by main(); only "Display Name" is written in this snippet.
stats = {"Date":[],"Instance ID":[],"Result":[],"Display Name":[]}
async def main():
    """Request the data for every id in ``search_ids`` concurrently and collect display names into ``stats``."""
    async with aiohttp.ClientSession() as session:
        tasks = []
        for search_id in search_ids:
            task = asyncio.ensure_future(get_data(session, search_id))
            tasks.append(task)
        # gather() returns a list with one result per task, in task order.
        responses = await asyncio.gather(*tasks)
        # Here the list itself is iterated, so each ``y`` is one response.
        for y in responses:
            # NOTE(review): exactly entries 0..5 of every response are read;
            # any further entries are ignored, and presumably a response with
            # fewer than six entries fails here - confirm against the API.
            stats['Display Name'].append(y['entries'][0]['player']['UserInfo']['displayName'])
            stats['Display Name'].append(y['entries'][1]['player']['UserInfo']['displayName'])
            stats['Display Name'].append(y['entries'][2]['player']['UserInfo']['displayName'])
            stats['Display Name'].append(y['entries'][3]['player']['UserInfo']['displayName'])
            stats['Display Name'].append(y['entries'][4]['player']['UserInfo']['displayName'])
            stats['Display Name'].append(y['entries'][5]['player']['UserInfo']['displayName'])
async def get_data(session, search_id):
    """Request the record for *search_id* and return the ``'Response'`` part of the JSON reply."""
    endpoint = 'https://www.myapi.com/{}'.format(search_id)
    async with session.get(endpoint, headers=HEADERS, ssl=False) as response:
        payload = await response.json()
        return payload['Response']
# Script entry point: run main() to completion.
asyncio.run(main())
Am I not basically doing the same thing manually here that I'm trying to do with a For loop on the top snippet? I would just go with this workaround except that I plan on pulling out much more data from each of these responses and it's not practical to manually do this over and over.
Plus obviously this makes me question if I'm understanding async correctly or not if this is giving me such a simple error.
Appreciate any help.
You’re iterating over two different things. In the first snippet you iterate over responses["entries"]; in the second you iterate over responses. responses is a list (of dictionaries), not a dictionary, so it can only be accessed by index, not by key.
When you ran your code synchronously, all you had to do was iterate over the entries in the response. Now that you're working with multiple responses in a list, you need to iterate over both the responses and the entries in each. To do this, you need to use two separate for loops.
# One coroutine per id; gather() returns their results as a list.
responses = await asyncio.gather(
    *[get_data(session, search_id) for search_id in search_ids]
)
# Outer loop: one response per requested id.
for response in responses:
    # Inner loop: every entry contained in that response.
    for entry in response["entries"]:
        stats["Display Name"].append(
            entry["player"]["UserInfo"]["displayName"]
        )
This might help
# ``responses`` is a list with one dict per request, so walk the list first
# and then the 'entries' of each response.
for y in responses:
    for entry in y['entries']:
        stats['Display Name'].append(entry['player']['UserInfo']['displayName'])
The error is caused by using a string key on a list. Please try this code, based on your second snippet:
# Ids to query; one request is sent per id.
search_ids = [1,2,3,4,5,6,7,8,9,10]
# Accumulator filled by main(); only "Display Name" is written in this snippet.
stats = {"Date":[],"Instance ID":[],"Result":[],"Display Name":[]}


async def main():
    """Request the data for every id in ``search_ids`` concurrently.

    The display name of every entry of every response is appended to
    ``stats["Display Name"]``.
    """
    async with aiohttp.ClientSession() as session:
        # Each request coroutine is created exactly once and handed straight
        # to gather(). Additionally scheduling ensure_future() tasks that
        # are never awaited would send every request a second time.
        responses = await asyncio.gather(
            *[get_data(session, search_id) for search_id in search_ids]
        )
    # ``responses`` is a list (one dict per request), so iterate it first
    # and then the entries of each response.
    for response in responses:
        for entry in response["entries"]:
            stats["Display Name"].append(
                entry["player"]["UserInfo"]["displayName"]
            )
async def get_data(session, carnage_id):
    """Request the record for *carnage_id* and return the ``'Response'`` part of the JSON reply.

    Args:
        session: Open aiohttp session used to send the request.
        carnage_id: Id appended to the API base URL.
    """
    # Build the URL from this function's own parameter; no ``search_id`` name
    # exists in this scope.
    url = f'https://www.myapi.com/{carnage_id}'
    async with session.get(url, headers=HEADERS, ssl=False) as response:
        results = await response.json()
        return results['Response']
# Script entry point: run main() to completion.
asyncio.run(main())
I am using the following code to make requests with aiohttp client. The server that I am trying to send request has a 30k request limit per hour per IP. So I am getting 429 too many request error. I want to put the job on sleep whenever it hits the limit.
I can extract X-RateLimit-Reset from the response headers, so I thought I could use it to put the job to sleep, but I observed very strange behavior. Sometimes the computed sleep time becomes negative, and sometimes the job gets stuck in sleeping mode.
For example, the last time that I ran the job, it first slept for 2000 seconds and then after the time passed, it again tried to sleep for another 2500 seconds and got stuck in sleeping mode. I think maybe the other parallel processes caused the issue so was wondering how to deal with too many request error msg when using Asyncio.
#backoff.on_exception(backoff.expo, (asyncio.TimeoutError, aiohttp.client_exceptions.ServerDisconnectedError,TooManyRequests),
max_time=300)
async def fetch(self, url, session, params):
    """GET *url* with *params* and return the JSON-decoded body.

    When the decoded body carries ``status == 429`` the call sleeps until the
    reset time announced by the server and then raises ``TooManyRequests``.
    For the exception types listed at the bottom the error is printed and
    ``None`` is returned implicitly.
    """
    try:
        async with session.get(url, params=params) as response:
            # Request time in whole seconds; compared with the reset header below.
            now = int(time.time())
            print(response)
            output = await response.read()
            output = json.loads(output)
            if 'X-RateLimit-Remaining' in response.headers:
                # NOTE(review): ``rate`` is assigned but never used afterwards.
                rate = response.headers['X-RateLimit-Remaining']
            if 'status' in output and output['status'] == 429:
                # presumably an epoch timestamp in seconds - TODO confirm
                # against the API documentation
                x_rateLimit_reset = int(response.headers['X-RateLimit-Reset'])
                print("sleep mode")
                seconds = x_rateLimit_reset - now
                LOGGER.info("The job will sleep for {} seconds".format(seconds))
                # NOTE(review): time.sleep() is a blocking call made inside a
                # coroutine, so the event loop and every other pending
                # request stall for the whole sleep.
                time.sleep(max(seconds,0))
                raise TooManyRequests()
            return output
    except (asyncio.TimeoutError, TypeError, json.decoder.JSONDecodeError,
            aiohttp.client_exceptions.ServerDisconnectedError) as e:
        # These errors are only printed; the caller receives None.
        print(str(e))
async def bound_fetch(self, sem, url, session, params):
    """Run ``self.fetch`` while holding *sem* and pair its result with the URL."""
    # Getter function guarded by the semaphore.
    await sem.acquire()
    try:
        result = await self.fetch(url, session, params)
    finally:
        sem.release()
    return {"url": url, "output": result}
Edited:
This is how I initiate bound_fetch and define the URLs:
def get_responses(self, urls, office_token, params=None):
    """Synchronously run ``self.run`` for *urls* and return its responses.

    Args:
        urls: URLs to request.
        office_token: Token forwarded to ``self.run``.
        params: Optional query parameters forwarded to ``self.run``.

    Returns:
        Whatever ``self.run(office_token, urls, params)`` returns.
    """
    # asyncio.run() creates the event loop, runs the coroutine to completion
    # and closes the loop. Calling get_event_loop()/ensure_future() when no
    # loop is running is deprecated.
    return asyncio.run(self.run(office_token, urls, params))
async def run(self, office_token, urls, params):
    """Request every URL through one authenticated session and return the gathered results."""
    # The semaphore is handed to every request together with the session.
    sem = asyncio.BoundedSemaphore(200)
    timeout = ClientTimeout(total=1000)
    auth = BasicAuth(office_token, password=' ')
    connector = TCPConnector(ssl=False)
    async with ClientSession(auth=auth, timeout=timeout,
                             connector=connector) as session:
        tasks = [
            asyncio.ensure_future(self.bound_fetch(sem, url, session, params))
            for url in urls
        ]
        responses = await asyncio.gather(*tasks)
    return responses
# Page numbers of this batch: batch * chunk_size + 1 up to and including
# (batch + 1) * chunk_size.
first_page = batch * chunk_size + 1
last_page = chunk_size * (1 + batch)
urls = []
for page_number in range(first_page, last_page + 1):
    resource = "{}?page={}&api_key={}".format(object_name, page_number, self.api_keys)
    urls.append("{}/{}".format(self.base_url, resource))
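The main reason is that you are using time.sleep() instead of await asyncio.sleep(): time.sleep() blocks the whole event loop, so no other coroutine can run while it sleeps.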
UPDATE
Here is minimal working solution and some comment how it works.
Please use it to adopt your solution.
Take a look on asyncio-throttle
import aiohttp
import asyncio
from datetime import datetime
async def fetch(session, task):  # fetching urls and mark result of execution
    """Fetch ``task['url']`` and record the outcome in the *task* dict.

    On return ``task['status']`` is always ``'done'``. ``task['result']``
    holds the HTTP status code, or the exception instance when the request
    itself failed.

    Args:
        session: Open aiohttp session used for the request.
        task: Dict with at least the key ``'url'``; it is updated in place.
    """
    try:
        async with session.get(task['url']) as response:
            # Non-200 answers (e.g. 429) are not retried or specially
            # handled here; their status code is simply recorded.
            task['result'] = response.status
            await response.text()  # just to be sure we acquire data
            print(f"{str(datetime.now())}: Got result of {task['url']}")  # logging
    except Exception as exc:
        # This coroutine runs as a fire-and-forget task, so nobody would
        # ever see the exception; keep it in the task record instead.
        task['result'] = exc
    finally:
        # fetch_all() polls for 'done'. Without this a failed request would
        # stay in state 'fetch' forever and the scheduler would never stop.
        task['status'] = 'done'
async def fetch_all(session, urls, persecond):
    """Fetch *urls* with a cap on the number of simultaneously running fetches.

    Args:
        session: Open aiohttp session handed to every ``fetch`` call.
        urls: Iterable of URLs to request.
        persecond: Maximum number of tasks allowed in state ``'fetch'`` at
            once. Despite the name it limits concurrency; it is not a true
            per-second rate.

    Returns:
        A list with one dict per URL (keys ``'url'``, ``'result'``,
        ``'status'``), in input order, once every task is marked ``'done'``.
    """
    # convert to list of dicts
    url_tasks = [{'url': i, 'result': None, 'status': 'new'} for i in urls]
    # The event loop only keeps weak references to tasks, so strong
    # references are held here until each task has finished.
    background = set()
    n = 0  # index of the next task to schedule
    while True:
        # how many tasks are fetching right now
        running_tasks = sum(1 for t in url_tasks if t['status'] == 'fetch')
        # how many tasks still have to finish
        is_tasks_to_wait = sum(1 for t in url_tasks if t['status'] != 'done')
        # schedule one more task if any is left and there is room for it
        if n < len(url_tasks) and running_tasks < persecond:
            url_tasks[n]['status'] = 'fetch'
            # A task scheduled from inside the running loop starts
            # executing as soon as this coroutine reaches an await.
            job = asyncio.create_task(fetch(session, url_tasks[n]))
            background.add(job)
            job.add_done_callback(background.discard)
            n += 1
        print(f'Schedule tasks {n}. '
              f'Running {running_tasks} '
              f'Remain {is_tasks_to_wait}')
        # concurrency cap reached: back off for a second
        if running_tasks >= persecond:
            print('Throttling')
            await asyncio.sleep(1)
        # Yield to the loop so the fetch tasks can make progress, then
        # re-check; stop once every task is done.
        if is_tasks_to_wait != 0:
            await asyncio.sleep(0.1)  # wait all tasks done
        else:
            # All tasks done
            break
    return url_tasks
async def main():
    """Fetch a small demo URL list with a concurrency cap of three and print the task records."""
    urls = 3 * ['http://google.com/?1',
                'http://google.com/?2',
                'http://google.com/?3']
    async with aiohttp.ClientSession() as session:
        print(await fetch_all(session, urls, 3))
if __name__ == '__main__':
    # Run main() to completion.
    asyncio.run(main())
    # On the way out asyncio.run():
    #   - cancels all tasks that are still pending (there are none here,
    #     because fetch_all() waits until every task is done),
    #   - awaits the cancellation of those tasks,
    #   - stops the loop,
    # and then the program exits.
I followed this tutorial: https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html and everything works fine when I make around 50,000 requests. But I need to make 1 million API calls, and then I have a problem with this code:
url = "http://some_url.com/?id={}"
tasks = set()
sem = asyncio.Semaphore(MAX_SIM_CONNS)
for i in range(1, LAST_ID + 1):
task = asyncio.ensure_future(bound_fetch(sem, url.format(i)))
tasks.add(task)
responses = asyncio.gather(*tasks)
return await responses
Because Python needs to create 1 million tasks, it basically just lags and then prints a Killed message in the terminal. Is there any way to use a generator instead of a pre-made set (or list) of URLs? Thanks.
Schedule all 1 million tasks at once
This is the code you are talking about. It takes up to 3 GB RAM so it is easily possible that it will be terminated by the operating system if you have low free memory.
import asyncio
from aiohttp import ClientSession
# Initial value of the semaphore that guards the requests.
MAX_SIM_CONNS = 50
# Highest id requested; ids run from 1 to LAST_ID inclusive.
LAST_ID = 10**6
async def fetch(url, session):
    """GET *url* through *session* and return the raw response body."""
    async with session.get(url) as response:
        body = await response.read()
    return body
async def bound_fetch(sem, url, session):
    """Run ``fetch`` for *url* while holding the semaphore *sem*; the body is discarded."""
    await sem.acquire()
    try:
        await fetch(url, session)
    finally:
        sem.release()
async def fetch_all():
    """Create one task per id up front and wait for all of them."""
    url = "http://localhost:8080/?id={}"
    async with ClientSession() as session:
        sem = asyncio.Semaphore(MAX_SIM_CONNS)
        # All LAST_ID tasks exist at the same time; the semaphore is passed
        # to each of them.
        tasks = {
            asyncio.create_task(bound_fetch(sem, url.format(i), session))
            for i in range(1, LAST_ID + 1)
        }
        return await asyncio.gather(*tasks)
if __name__ == '__main__':
    # Script entry point: run fetch_all() to completion.
    asyncio.run(fetch_all())
Use queue to streamline the work
This is my suggestion how to use asyncio.Queue to pass URLs to worker tasks. The queue is filled as-needed, there is no pre-made list of URLs.
It takes only 30 MB RAM :)
import asyncio
from aiohttp import ClientSession
# Number of worker tasks started by fetch_all().
MAX_SIM_CONNS = 50
# Highest id requested; ids run from 1 to LAST_ID inclusive.
LAST_ID = 10**6
async def fetch(url, session):
    """GET *url* through *session* and return the raw response body."""
    async with session.get(url) as response:
        body = await response.read()
    return body
async def fetch_worker(url_queue):
    """Take URLs from *url_queue* and fetch them until a ``None`` sentinel arrives."""
    async with ClientSession() as session:
        while True:
            url = await url_queue.get()
            finished = url is None
            try:
                if not finished:
                    response = await fetch(url, session)
                    # ...do something with the response
            finally:
                # task_done() has to be called for every item taken from the
                # queue, otherwise url_queue.join() does not work correctly.
                url_queue.task_done()
            if finished:
                # all work is done
                return
async def fetch_all():
    """Feed all URLs through a bounded queue to a fixed pool of worker tasks."""
    url = "http://localhost:8080/?id={}"
    url_queue = asyncio.Queue(maxsize=100)
    worker_tasks = [
        asyncio.create_task(fetch_worker(url_queue))
        for _ in range(MAX_SIM_CONNS)
    ]
    # The queue holds at most 100 items, so put() waits until a worker has
    # taken one.
    for next_id in range(1, LAST_ID + 1):
        await url_queue.put(url.format(next_id))
    # One None sentinel per worker tells it that the work is done.
    for _ in range(MAX_SIM_CONNS):
        await url_queue.put(None)
    await url_queue.join()
    await asyncio.gather(*worker_tasks)
if __name__ == '__main__':
    # Script entry point: run fetch_all() to completion.
    asyncio.run(fetch_all())
asyncio is memory bound (like any other program): you cannot spawn more tasks than memory can hold. My guess is that you hit a memory limit. Check dmesg for more information.
1 million RPS doesn't mean there are 1M tasks. A task can do several requests in the same second.