limit number of concurrent requests aiohttp - python

I'm downloading images using aiohttp, and was wondering if there is a way to limit the number of open requests that haven't finished. This is the code I currently have:
import asyncio
import aiohttp

async def get_images(url, session):
    chunk_size = 100
    # Print statement to show when a request is being made.
    print(f'Making request to {url}')
    async with session.get(url=url) as r:
        with open('path/name.png', 'wb') as file:
            while True:
                chunk = await r.content.read(chunk_size)
                if not chunk:
                    break
                file.write(chunk)

# List of urls to get images from
urls = [...]

conn = aiohttp.TCPConnector(limit=3)
loop = asyncio.get_event_loop()
session = aiohttp.ClientSession(connector=conn, loop=loop)
loop.run_until_complete(asyncio.gather(*(get_images(url, session=session) for url in urls)))
The problem is, I added a print statement to show me when each request is being made, and it makes almost 21 requests at once instead of the 3 I want to limit it to (i.e., once an image is done downloading, it should move on to the next url in the list). I'm just wondering what I am doing wrong here.

Your limit setting works correctly. You made a mistake while debugging.
As Mikhail Gerasimov pointed out in the comment, you put your print() call in the wrong place: it must be inside the session.get() context.
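For illustration (not the poster's exact code), here is a minimal sketch of where the print would have to go so that it only fires once a connection slot has actually been acquired; the file path and chunk size are the placeholders from the question:

async def get_images(url, session):
    chunk_size = 100
    async with session.get(url=url) as r:
        # We only get here once the TCPConnector has granted a connection,
        # so this print reflects the real number of in-flight requests.
        print(f'Making request to {url}')
        with open('path/name.png', 'wb') as file:
            while True:
                chunk = await r.content.read(chunk_size)
                if not chunk:
                    break
                file.write(chunk)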
To be confident that the limit is respected, I tested your code against a simple logging server, and the test shows that the server receives exactly the number of connections that you set in TCPConnector. Here is the test:
import asyncio
import aiohttp

loop = asyncio.get_event_loop()


class SilentServer(asyncio.Protocol):
    def connection_made(self, transport):
        # We will know when the connection is actually made:
        print('SERVER |', transport.get_extra_info('peername'))


async def get_images(url, session):
    chunk_size = 100
    # This log doesn't guarantee that we will connect,
    # session.get() will freeze if you reach TCPConnector limit
    print(f'CLIENT | Making request to {url}')
    async with session.get(url=url) as r:
        while True:
            chunk = await r.content.read(chunk_size)
            if not chunk:
                break


urls = [f'http://127.0.0.1:1337/{x}' for x in range(20)]

conn = aiohttp.TCPConnector(limit=3)
session = aiohttp.ClientSession(connector=conn, loop=loop)


async def test():
    await loop.create_server(SilentServer, '127.0.0.1', 1337)
    await asyncio.gather(*(get_images(url, session=session) for url in urls))


loop.run_until_complete(test())

asyncio.Semaphore solves exactly this issue.
In your case it'll be something like this:
semaphore = asyncio.Semaphore(3)

async def get_images(url, session):
    async with semaphore:
        print(f'Making request to {url}')
        # ...
You may also be interested in taking a look at this ready-to-run code example that demonstrates how a semaphore works.
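Putting the two pieces together, here is a minimal sketch (not the original answer's code) of the downloader from the question limited by a semaphore. The 'path/name.png' file path and the urls list are placeholders from the question, and the semaphore is created inside main so it is bound to the running loop:

import asyncio
import aiohttp

async def get_images(url, session, semaphore):
    async with semaphore:                     # at most 3 downloads at a time
        print(f'Making request to {url}')
        async with session.get(url=url) as r:
            with open('path/name.png', 'wb') as file:
                while True:
                    chunk = await r.content.read(100)
                    if not chunk:
                        break
                    file.write(chunk)

async def main(urls):
    semaphore = asyncio.Semaphore(3)
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(get_images(url, session, semaphore) for url in urls))

# urls = [...]  # list of image urls
# asyncio.run(main(urls))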

Related

python for each run async function without await and parallel

I have 10 links in my CSV which I'm trying to run all at the same time in a loop from the getTasks function. However, the way it works now, it sends a request to link 1, waits for it to complete, then link 2, and so on. I want all 10 links that I have to run whenever startTask is called, leading to 10 requests a second.
Anyone know how to code that using the code below? Thanks in advance.
import requests
from bs4 import BeautifulSoup
import asyncio


def getTasks(tasks):
    for task in tasks:
        asyncio.run(startTask(task))


async def startTask(task):
    success = await getProduct(task)
    if success is None:
        return startTask(task)
    success = await addToCart(task)
    if success is None:
        return startTask(task)
    ...
    ...
    ...


getTasks(tasks)
First of all, to make your requests run concurrently you should use aiohttp instead of the requests package, which blocks on I/O. And use asyncio's semaphore to limit the number of concurrent requests running at the same time.
import asyncio
import aiohttp

# read links from CSV
links = [
    ...
]

semaphore = asyncio.BoundedSemaphore(10)
# 10 is the max count of concurrent tasks
# that can be processed at the same time.
# In this case, tasks are requests.


async def async_request(url):
    async with aiohttp.ClientSession() as session:
        async with semaphore, session.get(url) as response:
            return await response.text()


async def main():
    result = await asyncio.gather(*[
        async_request(link) for link in links
    ])
    print(result)  # [response1, response2, ...]


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    loop.close()

Getting "ValueError: too many file descriptors in select()" despite of using asyncio.Semaphore

I am doing some requests to Azure Maps. I have a subscription key (subscriptionKey) and a list of addresses I want to look for (addresses):
query_template = 'https://atlas.microsoft.com/search/address/json?&subscription-key={}&api-version=1.0&language=en-US&query={}'
queries = [query_template.format(subscriptionKey, address) for address in addresses]
I come from this question (not necessary to read it to understand the following) and everything worked fine in my sample of 1k queries. However, when I tried 10k queries I got ValueError: too many file descriptors in select(). I added some of the answers from here and now my code looks like this:
import asyncio
from aiohttp import ClientSession
from ssl import SSLContext
from sys import platform
import nest_asyncio
nest_asyncio.apply()


# Function to get a JSON from the result of a query
async def fetch(url, session):
    async with session.get(url, ssl=SSLContext()) as response:
        return await response.json()


# Function to run 'fetch()' with a Semaphore and check that the result is a dictionary (JSON)
async def fetch_sem(sem, attempts, url, session):
    semaphore = asyncio.Semaphore(sem)
    async with semaphore:
        for _ in range(attempts):
            result = await fetch(url, session)
            if isinstance(result, dict):
                break
        return result


# Function to search for all queries
async def fetch_all(sem, attempts, urls):
    async with ClientSession() as session:
        return await asyncio.gather(*[fetch_sem(sem, attempts, url, session) for url in urls], return_exceptions=True)


# Making the queries
if __name__ == '__main__':
    if platform == 'win32':
        loop = asyncio.ProactorEventLoop()
        asyncio.set_event_loop(loop)
    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(fetch_all(1000, 3, queries))
Note that I have included both asyncio.Semaphore and asyncio.ProactorEventLoop(). But despite these additions, I still get ValueError: too many file descriptors in select().
Could I get some help with this issue? Thank you!
The purpose of the semaphore is to count how many fetch operations are currently running and enforce an upper limit. For that to work you need exactly one semaphore shared by all tasks; in your code each call to fetch_sem creates its own semaphore, so nothing is actually limited.
You can create it in fetch_all and pass it to fetch_sem:
async def fetch_sem(semaphore, attempts, url, session):
    async with semaphore:
        ...
        return result


async def fetch_all(limit, attempts, urls):
    semaphore = asyncio.Semaphore(limit)
    async with ClientSession() as session:
        return await asyncio.gather(*[fetch_sem(semaphore, attempts, url, session) for url in urls], return_exceptions=True)

....

results = loop.run_until_complete(fetch_all(1000, 3, queries))
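As a sanity check (not part of the original answer), here is a small self-contained sketch showing that a single shared semaphore caps the peak number of concurrent workers at the limit; the 0.1 s sleep stands in for the HTTP request:

import asyncio

limit = 3
running = 0
peak = 0

async def worker(semaphore):
    global running, peak
    async with semaphore:
        running += 1
        peak = max(peak, running)
        await asyncio.sleep(0.1)   # stands in for the HTTP request
        running -= 1

async def main():
    semaphore = asyncio.Semaphore(limit)   # one semaphore, shared by all workers
    await asyncio.gather(*(worker(semaphore) for _ in range(20)))
    print(f'peak concurrency: {peak}')     # prints 3, not 20

asyncio.run(main())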

aiohttp + uvloop parallel HTTP requests are slower than without uvloop

I'm writing a script to make millions of API calls in parallel.
I'm using Python 3.6 with aiohttp for this purpose.
I was expecting that uvloop would make it faster, but it seems to have made it slower. Am I doing something wrong?
with uvloop: 22 seconds
without uvloop: 15 seconds
import asyncio
import aiohttp
import uvloop
import time
import logging

from aiohttp import ClientSession, TCPConnector

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()

urls = ["http://www.yahoo.com","http://www.bbcnews.com","http://www.cnn.com","http://www.buzzfeed.com","http://www.walmart.com","http://www.emirates.com","http://www.kayak.com","http://www.expedia.com","http://www.apple.com","http://www.youtube.com"]
bigurls = 10 * urls


def run(enable_uvloop):
    try:
        if enable_uvloop:
            loop = uvloop.new_event_loop()
        else:
            loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        start = time.time()
        conn = TCPConnector(limit=5000, use_dns_cache=True, loop=loop, verify_ssl=False)
        with ClientSession(connector=conn) as session:
            tasks = asyncio.gather(*[asyncio.ensure_future(do_request(url, session)) for url in bigurls])  # tasks to do
            results = loop.run_until_complete(tasks)  # loop until done
            end = time.time()
            logger.debug('total time:')
            logger.debug(end - start)
            return results
        loop.close()
    except Exception as e:
        logger.error(e, exc_info=True)


async def do_request(url, session):
    """
    """
    try:
        async with session.get(url) as response:
            resp = await response.text()
            return resp
    except Exception as e:
        logger.error(e, exc_info=True)


run(True)
#run(False)
aiohttp recommends using aiodns.
Also, as I remember, this with ClientSession(connector=conn) as session: should be async.
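For illustration, a minimal sketch of that correction, reusing do_request, bigurls and the connector settings from the question; run_requests is a hypothetical helper name:

async def run_requests(loop):
    conn = TCPConnector(limit=5000, use_dns_cache=True, loop=loop, verify_ssl=False)
    # ClientSession should be entered with "async with", inside a coroutine:
    async with ClientSession(connector=conn) as session:
        return await asyncio.gather(*(do_request(url, session) for url in bigurls))

# results = loop.run_until_complete(run_requests(loop))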
You're not alone; I actually just got similar results (which led me to google my findings and brought me here).
My experiment involves running 500 concurrent GET requests to Google.com using aiohttp.
Here is the code for reference:
import asyncio, aiohttp, concurrent.futures
from datetime import datetime
import uvloop


class UVloopTester():
    def __init__(self):
        self.timeout = 20
        self.threads = 500
        self.totalTime = 0
        self.totalRequests = 0

    @staticmethod
    def timestamp():
        return f'[{datetime.now().strftime("%H:%M:%S")}]'

    async def getCheck(self):
        async with aiohttp.ClientSession() as session:
            response = await session.get('https://www.google.com', timeout=self.timeout)
            response.close()
            await session.close()
            return True

    async def testRun(self, id):
        now = datetime.now()
        try:
            if await self.getCheck():
                elapsed = (datetime.now() - now).total_seconds()
                print(f'{self.timestamp()} Request {id} TTC: {elapsed}')
                self.totalTime += elapsed
                self.totalRequests += 1
        except concurrent.futures._base.TimeoutError:
            print(f'{self.timestamp()} Request {id} timed out')

    async def main(self):
        await asyncio.gather(*[asyncio.ensure_future(self.testRun(x)) for x in range(self.threads)])

    def start(self):
        # comment these lines to toggle
        uvloop.install()
        asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

        loop = asyncio.get_event_loop()
        now = datetime.now()
        loop.run_until_complete(self.main())
        elapsed = (datetime.now() - now).total_seconds()
        print(f'{self.timestamp()} Main TTC: {elapsed}')
        print()
        print(f'{self.timestamp()} Average TTC per Request: {self.totalTime / self.totalRequests}')
        if len(asyncio.Task.all_tasks()) > 0:
            for task in asyncio.Task.all_tasks():
                task.cancel()
            try:
                loop.run_until_complete(asyncio.gather(*asyncio.Task.all_tasks()))
            except asyncio.CancelledError:
                pass
        loop.close()


test = UVloopTester()
test.start()
I haven't planned out and executed any sort of careful experiment where I'm logging my findings and calculating standard deviations and p-values. But I have run this a (tiring) number of times and have come up with the following results.
Running without uvloop:
loop.run_until_complete(main()) takes about 10 seconds.
average time to complete for request takes about 4 seconds.
Running with uvloop:
loop.run_until_complete(main()) takes about 16 seconds.
average time to complete for request takes about 8.5 seconds.
I've shared this code with a friend of mine who is actually the one who suggested I try uvloop (since he gets a speed boost from it). Upon running it several times, his results confirm that he does in fact see an increase in speed from using uvloop (shorter time to complete for both main() and requests on average).
This leads me to believe that the difference in our findings has to do with our setups: I'm using a Debian virtual machine with 8 GB RAM on a mid-tier laptop, while he's using a native Linux desktop with a lot more 'muscle' under the hood.
My answer to your question is: no, I do not believe you are doing anything wrong, because I am experiencing the same results, and it does not appear that I am doing anything wrong either; any constructive criticism is welcome and appreciated.
I wish I could be of more help; I hope my chiming in can be of some use.
I tried a similar experiment and see no real difference between uvloop and asyncio event loops for parallel http GET's:
asyncio event loop: avg=3.6285968542099 s. stdev=0.5583842811362075 s.
uvloop event loop: avg=3.419699764251709 s. stdev=0.13423859428541632 s.
It might be that the noticeable benefits of uvloop come into play when it is used in server code, i.e. for handling many incoming requests (a small sketch of that scenario follows the experiment code below).
Code:
import time
from statistics import mean, stdev

import asyncio
import uvloop
import aiohttp

urls = [
    'https://aws.amazon.com', 'https://google.com', 'https://microsoft.com', 'https://www.oracle.com/index.html',
    'https://www.python.org', 'https://nodejs.org', 'https://angular.io', 'https://www.djangoproject.com',
    'https://reactjs.org', 'https://www.mongodb.com', 'https://reinvent.awsevents.com',
    'https://kafka.apache.org', 'https://github.com', 'https://slack.com', 'https://authy.com',
    'https://cnn.com', 'https://fox.com', 'https://nbc.com', 'https://www.aljazeera.com',
    'https://fly4.emirates.com', 'https://www.klm.com', 'https://www.china-airlines.com',
    'https://en.wikipedia.org/wiki/List_of_Unicode_characters', 'https://en.wikipedia.org/wiki/Windows-1252'
]


def timed(func):
    async def wrapper():
        start = time.time()
        await func()
        return time.time() - start
    return wrapper


@timed
async def main():
    conn = aiohttp.TCPConnector(use_dns_cache=False)
    async with aiohttp.ClientSession(connector=conn) as session:
        coroutines = [fetch(session, url) for url in urls]
        await asyncio.gather(*coroutines)


async def fetch(session, url):
    async with session.get(url) as resp:
        await resp.text()


asycio_results = [asyncio.run(main()) for i in range(10)]
print(f'asyncio event loop: avg={mean(asycio_results)} s. stdev={stdev(asycio_results)} s.')

# Change to uvloop
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

uvloop_results = [asyncio.run(main()) for i in range(10)]
print(f'uvloop event loop: avg={mean(uvloop_results)} s. stdev={stdev(uvloop_results)} s.')
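Purely to illustrate the server-side scenario speculated about above (nothing measured in this answer), a minimal aiohttp server running on uvloop might look like this; the route and port are arbitrary placeholders:

import uvloop
from aiohttp import web

async def handle(request):
    # Trivial handler; the point is that the event loop underneath is uvloop.
    return web.Response(text="hello")

def main():
    uvloop.install()                      # make uvloop the default event loop policy
    app = web.Application()
    app.add_routes([web.get("/", handle)])
    web.run_app(app, port=8080)           # serves many concurrent incoming requests

if __name__ == "__main__":
    main()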

aiohttp: when is the response.status available?

The Getting Started docs for aiohttp give the following client example:
async with aiohttp.ClientSession() as session:
    async with session.get('https://api.github.com/events') as resp:
        print(resp.status)
        print(await resp.text())
I'm having trouble understanding when response.status will be available. My understanding is that the coroutine releases control at the await response.read() line. How can I possibly access status before waiting for the response to come back?
Important distinction: await ... may release control of the context, for example if the awaited data is not available fast enough. The same goes for the async with ... statement. Therefore your code doesn't reach the line print(resp.status) until resp is available.
For example the code:
import aiohttp
import asyncio
import urllib.parse
import datetime
async def get(session, url):
print("[{:%M:%S.%f}] getting {} ...".format(datetime.datetime.now(), urllib.parse.urlsplit(url).hostname))
async with session.get(url) as resp:
print("[{:%M:%S.%f}] {}, status: {}".format(datetime.datetime.now(), urllib.parse.urlsplit(url).hostname, resp.status))
doc = await resp.text()
print("[{:%M:%S.%f}] {}, len: {}".format(datetime.datetime.now(), urllib.parse.urlsplit(url).hostname, len(doc)))
async def main():
session = aiohttp.ClientSession()
url = "http://demo.borland.com/Testsite/stadyn_largepagewithimages.html"
f1 = asyncio.ensure_future(get(session, url))
print("[{:%M:%S.%f}] added {} to event loop".format(datetime.datetime.now(), urllib.parse.urlsplit(url).hostname))
url = "https://stackoverflow.com/questions/46445019/aiohttp-when-is-the-response-status-available"
f2 = asyncio.ensure_future(get(session, url))
print("[{:%M:%S.%f}] added {} to event loop".format(datetime.datetime.now(), urllib.parse.urlsplit(url).hostname))
url = "https://api.github.com/events"
f3 = asyncio.ensure_future(get(session, url))
print("[{:%M:%S.%f}] added {} to event loop".format(datetime.datetime.now(), urllib.parse.urlsplit(url).hostname))
await f1
await f2
await f3
session.close()
if __name__ == '__main__':
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
can produce this result:
[16:42.415481] added demo.borland.com to event loop
[16:42.415481] added stackoverflow.com to event loop
[16:42.415481] added api.github.com to event loop
[16:42.415481] getting demo.borland.com ...
[16:42.422481] getting stackoverflow.com ...
[16:42.682496] getting api.github.com ...
[16:43.002515] demo.borland.com, status: 200
[16:43.510544] stackoverflow.com, status: 200
[16:43.759558] stackoverflow.com, len: 110650
[16:43.883565] demo.borland.com, len: 239012
[16:44.089577] api.github.com, status: 200
[16:44.318590] api.github.com, len: 43055
Clarification (thx @deceze): Here you can see (look at the times between the brackets) all coroutines releasing control after sending a request to retrieve the website, and a second time while awaiting the text of the response. Also, borland (other network characteristics aside) has so much text that, in contrast to stackoverflow, it is only ready to be displayed after the text from stackoverflow was printed, despite being requested earlier.
You first get the HTTP response headers, which include in the first line the status code. If you so choose you can then read the rest of the response body (here with resp.text()). Since the headers are always relatively small and the body may be very large, aiohttp gives you the chance to read both separately.
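As a small illustrative sketch of the pattern that separation enables (the URL reuses the GitHub events endpoint from the question, and fetch_if_ok is a hypothetical helper name): inspect the status from the headers first, and only download the body when it is worth it.

import asyncio
import aiohttp

async def fetch_if_ok(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # Headers (and therefore resp.status) are already parsed here;
            # the body has not necessarily been downloaded yet.
            if resp.status != 200:
                return None
            return await resp.text()   # now read the (potentially large) body

# asyncio.run(fetch_if_ok('https://api.github.com/events'))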
The resp object is available inside the async with block, and therefore resp.status is available too. You can also await some of its methods, like resp.text(), but that doesn't exit the async with block. You can keep working with resp even after the await has completed.

RuntimeError Session is closed when trying to make async requests

First of all, here's the code:
import random
import asyncio
from aiohttp import ClientSession
import csv

headers = []


def extractsites(file):
    sites = []
    readfile = open(file, "r")
    reader = csv.reader(readfile, delimiter=",")
    raw = list(reader)
    for a in raw:
        sites.append((a[1]))
    return sites


async def fetchheaders(url, session):
    async with session.get(url) as response:
        responseheader = await response.headers
        print(responseheader)
        return responseheader


async def bound_fetch(sem, url, session):
    async with sem:
        print("doing request for " + url)
        await fetchheaders(url, session)


async def run():
    urls = extractsites("cisco-umbrella.csv")
    tasks = []
    # create instance of Semaphore
    sem = asyncio.Semaphore(100)
    async with ClientSession() as session:
        for i in urls:
            task = asyncio.ensure_future(bound_fetch(sem, "http://" + i, session))
            tasks.append(task)
        return tasks


def main():
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(run())
    loop.run_until_complete(future)


if __name__ == '__main__':
    main()
Most of this code was taken from this blog post:
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
Here is the problem I'm facing: I am trying to read a million urls from a file and then make an async request for each of them.
But when I try to execute the code above, I get the RuntimeError: Session is closed error.
This is my line of thought:
I am relatively new to async programming, so bear with me.
My thought process was to create a long task list (that only allows 100 parallel requests), which I build in the run function and then pass as a future to the event loop to execute.
I have included a debug print in bound_fetch (which I copied from the blog post), and it looks like it loops over all of the urls I have; as soon as it should start making requests in the fetchheaders function, I get the runtime errors.
How do I fix my code?
A couple things here.
First, in your run function you actually want to gather the tasks there and await them; otherwise run() returns (and the async with ClientSession block exits, closing the session) before the tasks have had a chance to run, which is why you see the session-is-closed error. Like so:
async def run():
    urls = ['google.com', 'amazon.com']
    tasks = []
    # create instance of Semaphore
    sem = asyncio.Semaphore(100)
    async with ClientSession() as session:
        for i in urls:
            task = asyncio.ensure_future(bound_fetch(sem, "http://" + i, session))
            tasks.append(task)
        await asyncio.gather(*tasks)
Second, the aiohttp API is a little odd in dealing with headers in that you can't await them. I worked around this by awaiting the body so that the headers are populated, and then returning the headers:
async def fetchheaders(url, session):
    async with session.get(url) as response:
        data = await response.read()
        responseheader = response.headers
        print(responseheader)
        return responseheader
There is some additional overhead here in pulling the body, however; I couldn't find another way to get the headers populated without doing a body read.
