Using asyncio.gather with ensure_future

I have a list of URLs which I need to fetch concurrently. To do that, I'm using asyncio and aiohttp in this way:
async def fetch(session: aiohttp.ClientSession, res):
    async with session.get(f"/foo/bar/{res}") as resp:
        return await resp.json()

async def get_resources():
    async with aiohttp.ClientSession(base_url=BASE_URL, headers=HEADERS) as session:
        return await asyncio.gather(*[
            fetch(session, res)
            for res in [resource1, resource2, ...]
        ])

def main():
    return asyncio.get_event_loop().run_until_complete(
        get_resources()
    )
This works as I expected: all resources fire concurrently. But I noticed that I can also use asyncio.ensure_future:
async def get_resources():
    async with aiohttp.ClientSession(base_url=BASE_URL) as session:
        return await asyncio.gather(*[
            asyncio.ensure_future(fetch(session, res))
            for res in [resource1, resource2, ...]
        ])
So I'm wondering: what's the benefit of the extra call before gather, asyncio.ensure_future in this case? Why not pass the coroutines directly to gather, as I did in the first example?
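For what it's worth, gather accepts both bare coroutines and futures, and it schedules bare coroutines as tasks itself, so the two versions behave the same. A minimal sketch (using asyncio.sleep in place of the aiohttp calls, so none of the names above are needed) shows both forms finishing in about one second rather than three:
import asyncio
import time

async def fake_fetch(res):
    # stand-in for the real aiohttp request: just sleep one second
    await asyncio.sleep(1)
    return res

async def with_coroutines():
    return await asyncio.gather(*[fake_fetch(n) for n in range(3)])

async def with_ensure_future():
    return await asyncio.gather(*[asyncio.ensure_future(fake_fetch(n)) for n in range(3)])

async def demo():
    start = time.perf_counter()
    print(await with_coroutines())     # [0, 1, 2] after ~1s, not ~3s
    print(await with_ensure_future())  # same results, same timing
    print(f"elapsed: {time.perf_counter() - start:.1f}s")

asyncio.run(demo())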

Related

Why does aiohttp behave synchronously?

I wrote this script to process a few hundred thousand lines with a localhost API, but the code behaves synchronously.
import sys
import asyncio
import aiohttp

async def process_line_async(session, line):
    async with session.put('http://localhost:8887/load', data=line) as response:
        r = await response.json()
        sys.stdout.write(
            f"some info about the response\n")

async def looper(lines):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for line in lines:
            task = asyncio.ensure_future(process_line_async(session, line))
            tasks.append(task)
        await asyncio.gather(*tasks)

asyncio.run(looper(lines))
When I test it, or when I modify the process_line_async() function to leave out the request part and use a time.sleep() statement plus some arbitrary print instead, it works, but it still behaves synchronously.
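One thing worth noting about that test: time.sleep() is a blocking call, so it stalls the event loop and will always look synchronous; asyncio.sleep() is the non-blocking equivalent. A minimal sketch (independent of the localhost API) contrasting the two:
import asyncio
import time

async def blocking_worker(i):
    time.sleep(1)           # blocks the whole event loop: total time ~N seconds
    return i

async def non_blocking_worker(i):
    await asyncio.sleep(1)  # yields to the loop: total time ~1 second
    return i

async def demo(worker):
    start = time.perf_counter()
    await asyncio.gather(*[worker(i) for i in range(5)])
    print(f"{worker.__name__}: {time.perf_counter() - start:.1f}s")

asyncio.run(demo(blocking_worker))      # ~5.0s
asyncio.run(demo(non_blocking_worker))  # ~1.0s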

Getting "ValueError: too many file descriptors in select()" despite of using asyncio.Semaphore

I am doing some requests to Azure Maps. I have a subscription key (subscriptionKey) and a list of addresses I want to look for (addresses):
query_template = 'https://atlas.microsoft.com/search/address/json?&subscription-key={}&api-version=1.0&language=en-US&query={}'
queries = [query_template.format(subscriptionKey, address) for address in addresses]
I come from this question (it's not necessary to read it to understand the following), and everything worked fine with my sample of 1k queries. However, when I tried 10k queries I got ValueError: too many file descriptors in select(). I incorporated some of the answers from here, and now my code looks like this:
import asyncio
from aiohttp import ClientSession
from ssl import SSLContext
from sys import platform
import nest_asyncio
nest_asyncio.apply()

# Function to get a JSON from the result of a query
async def fetch(url, session):
    async with session.get(url, ssl=SSLContext()) as response:
        return await response.json()

# Function to run 'fetch()' with a Semaphore and check that the result is a dictionary (JSON)
async def fetch_sem(sem, attempts, url, session):
    semaphore = asyncio.Semaphore(sem)
    async with semaphore:
        for _ in range(attempts):
            result = await fetch(url, session)
            if isinstance(result, dict):
                break
    return result

# Function to search for all queries
async def fetch_all(sem, attempts, urls):
    async with ClientSession() as session:
        return await asyncio.gather(*[fetch_sem(sem, attempts, url, session) for url in urls], return_exceptions=True)

# Making the queries
if __name__ == '__main__':
    if platform == 'win32':
        loop = asyncio.ProactorEventLoop()
        asyncio.set_event_loop(loop)
    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(fetch_all(1000, 3, queries))
Note that I have included both asyncio.Semaphore and asyncio.ProactorEventLoop(). But despite these additions, I still get ValueError: too many file descriptors in select().
Could I get some help with this issue? Thank you!
The purpose of the semaphore is to count how many fetch operations are currently running and to enforce an upper limit. For that to work you need a single shared semaphore, not a new one per call. You can create it in fetch_all and pass it to fetch_sem:
async def fetch_sem(semaphore, attempts, url, session):
    async with semaphore:
        ...
        return result

async def fetch_all(limit, attempts, urls):
    semaphore = asyncio.Semaphore(limit)
    async with ClientSession() as session:
        return await asyncio.gather(*[fetch_sem(semaphore, attempts, url, session) for url in urls], return_exceptions=True)

....
results = loop.run_until_complete(fetch_all(1000, 3, queries))
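As a complementary measure (not part of the original answer), aiohttp can also cap the number of simultaneously open connections through its connector, which keeps the number of sockets, and therefore file descriptors, bounded. A rough sketch, reusing the fetch coroutine from the question:
import asyncio
from aiohttp import ClientSession, TCPConnector

async def fetch_all_limited(urls):
    # TCPConnector(limit=...) caps how many connections are open at once
    async with ClientSession(connector=TCPConnector(limit=1000)) as session:
        return await asyncio.gather(*[fetch(url, session) for url in urls],
                                    return_exceptions=True)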

Python asyncio.gather returns None

I'm using Python asyncio to implement a fast HTTP client.
As you can see in the comment inside the worker function, I get the responses as soon as they are finished. I would like to get the responses in order, and that's why I'm using asyncio.gather.
Why is it returning None? Can anybody help?
Thank you so much!
import time
import aiohttp
import asyncio

MAXREQ = 100
MAXTHREAD = 500
URL = 'https://google.com'
g_thread_limit = asyncio.Semaphore(MAXTHREAD)

async def worker(session):
    async with session.get(URL) as response:
        await response.read()  # If I print this line I get the responses correctly

async def run(worker, *argv):
    async with g_thread_limit:
        await worker(*argv)

async def main():
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*[run(worker, session) for _ in range(MAXREQ)])

if __name__ == '__main__':
    totaltime = time.time()
    print(asyncio.get_event_loop().run_until_complete(main()))  # I'm getting a None here
    print(time.time() - totaltime)
Your functions run and worker don't return anything explicitly, so they return None implicitly. Add return statements and you'll get results:
async def worker(session):
    async with session.get(URL) as response:
        return await response.read()

async def run(worker, *argv):
    async with g_thread_limit:
        return await worker(*argv)
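Note that main also needs to return the gathered results, otherwise the final print still shows None. A minimal sketch of the call site (same names as in the question); gather returns the results in the same order as its arguments, which is the ordering the question asks for:
async def main():
    async with aiohttp.ClientSession() as session:
        # gather returns the results in the same order as its arguments
        return await asyncio.gather(*[run(worker, session) for _ in range(MAXREQ)])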

asyncio tasks using aiohttp.ClientSession

I'm using Python 3.7 and trying to make a crawler that can crawl multiple domains asynchronously. I'm using asyncio and aiohttp for this, but I'm experiencing problems with aiohttp.ClientSession. This is my reduced code:
import aiohttp
import asyncio

async def fetch(session, url):
    async with session.get(url) as response:
        print(await response.text())

async def main():
    loop = asyncio.get_event_loop()
    async with aiohttp.ClientSession(loop=loop) as session:
        cwlist = [loop.create_task(fetch(session, url)) for url in ['http://python.org', 'http://google.com']]
        asyncio.gather(*cwlist)

if __name__ == "__main__":
    asyncio.run(main())
The thrown exception is this:
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=RuntimeError('Session is closed')>
What am I doing wrong here?
You forgot to await the asyncio.gather result:
async with aiohttp.ClientSession(loop=loop) as session:
    cwlist = [loop.create_task(fetch(session, url)) for url in ['http://python.org', 'http://google.com']]
    await asyncio.gather(*cwlist)
If you ever have an async with containing no await expressions you should be fairly suspicious.
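As a side note (not part of the original answer), asyncio.run() already provides a running loop on Python 3.7+, so the explicit loop handling and the loop= argument to ClientSession can be dropped. A minimal sketch:
async def main():
    async with aiohttp.ClientSession() as session:
        cwlist = [asyncio.create_task(fetch(session, url))
                  for url in ['http://python.org', 'http://google.com']]
        await asyncio.gather(*cwlist)

if __name__ == "__main__":
    asyncio.run(main())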

How to run DB requests asynchronously?

When I run this, it lists the websites in the database one by one with the response code, and it takes about 10 seconds to run through a very small list. It should be much faster and isn't running asynchronously, but I'm not sure why.
import dblogin
import aiohttp
import asyncio
import async_timeout

dbconn = dblogin.connect()
dbcursor = dbconn.cursor(buffered=True)
dbcursor.execute("SELECT thistable FROM adatabase")
website_list = dbcursor.fetchall()

async def fetch(session, url):
    with async_timeout.timeout(30):
        async with session.get(url, ssl=False) as response:
            await response.read()
            return response.status, url

async def main():
    async with aiohttp.ClientSession() as session:
        for all_urls in website_list:
            url = all_urls[0]
            resp = await fetch(session, url)
            print(resp, url)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    loop.close()
    dbcursor.close()
    dbconn.close()
This article explains the details. What you need to do is wrap each fetch call in a task (Future) object, and then pass a list of those to either asyncio.wait or asyncio.gather, depending on your needs.
Your code would look something like this:
async def fetch(session, url):
    with async_timeout.timeout(30):
        async with session.get(url, ssl=False) as response:
            await response.read()
            return response.status, url

async def main():
    tasks = []
    async with aiohttp.ClientSession() as session:
        for all_urls in website_list:
            url = all_urls[0]
            task = asyncio.create_task(fetch(session, url))
            tasks.append(task)
        responses = await asyncio.gather(*tasks)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
Also, are you sure that loop.close() call is needed? The docs mention that
The loop must not be running when this function is called. Any pending callbacks will be discarded.
This method clears all queues and shuts down the executor, but does not wait for the executor to finish.
As mentioned in the docs and in the link that #user4815162342 posted, it is better to use the create_task method instead of the ensure_future method when we know that the argument is a coroutine. Note that this was added in Python 3.7, so previous versions should continue using ensure_future instead.
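For completeness, here is a rough sketch of the asyncio.wait variant mentioned above, assuming the same fetch and website_list as in the question. Unlike gather, wait hands back sets of done and pending tasks rather than an ordered result list:
async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(fetch(session, row[0])) for row in website_list]
        # wait returns (done, pending) sets instead of an ordered result list
        done, pending = await asyncio.wait(tasks)
        for task in done:
            status, url = task.result()
            print(status, url)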
