First of all heres the code:
import random
import asyncio
from aiohttp import ClientSession
import csv
headers =[]
def extractsites(file):
sites = []
readfile = open(file, "r")
reader = csv.reader(readfile, delimiter=",")
raw = list(reader)
for a in raw:
sites.append((a[1]))
return sites
async def fetchheaders(url, session):
async with session.get(url) as response:
responseheader = await response.headers
print(responseheader)
return responseheader
async def bound_fetch(sem, url, session):
async with sem:
print("doing request for "+ url)
await fetchheaders(url, session)
async def run():
urls = extractsites("cisco-umbrella.csv")
tasks = []
# create instance of Semaphore
sem = asyncio.Semaphore(100)
async with ClientSession() as session:
for i in urls:
task = asyncio.ensure_future(bound_fetch(sem, "http://"+i, session))
tasks.append(task)
return tasks
def main():
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
if __name__ == '__main__':
main()
Most of this code was taken from this blog post:
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
Here is my problem that I'm facing: I am trying to read a million urls from a file and then make async request for each of them.
But when I try to execute the code above I get the Session expired error.
This is my line of thought:
I am relatively new to async programming so bear with me.
My though process was to create a long task list (that only allows 100 parallel requests), that I build in the run function, and then pass as a future to the event loop to execute.
I have included a print debug in the bound_fetch (which I copied from the blog post) and it looks like it loops over all urls that I have and as soon as it should start making requests in the fetchheaders function I get the runtime errors.
How do I fix my code ?
A couple things here.
First, in your run function you actually want to gather the tasks there and await them to fix your session issue, like so:
async def run():
urls = ['google.com','amazon.com']
tasks = []
# create instance of Semaphore
sem = asyncio.Semaphore(100)
async with ClientSession() as session:
for i in urls:
task = asyncio.ensure_future(bound_fetch(sem, "http://"+i, session))
tasks.append(task)
await asyncio.gather(*tasks)
Second, the aiohttp API is a little odd in dealing with headers in that you can't await them. I worked around this by awaiting body so that headers are populated and then returning the headers:
async def fetchheaders(url, session):
async with session.get(url) as response:
data = await response.read()
responseheader = response.headers
print(responseheader)
return responseheader
There is some additional overhead here in pulling the body however. I couldn't find another way to load headers though without doing a body read.
Related
I'm working with asyncio and aiohttp to call an API many times. While can print the responses, I want to collate the responses into a combined structure - a list or pandas dataframe etc.
In my example code I'm connecting to 2 urls and printing a chunk of the response. How can I collate the responses and access them all?
import asyncio, aiohttp
async def get_url(session, url, timeout=300):
async with session.get(url, timeout=timeout) as response:
http = await response.text()
print(str(http[:80])+'\n')
return http # becomes a list item when gathered
async def async_payload_wrapper(async_loop):
# test with 2 urls as PoC
urls = ['https://google.com','https://yahoo.com']
async with aiohttp.ClientSession(loop=async_loop) as session:
urls_to_check = [get_url(session, url) for url in urls]
await asyncio.gather(*urls_to_check)
if __name__ == '__main__':
event_loop = asyncio.get_event_loop()
event_loop.run_until_complete(async_payload_wrapper(event_loop))
I've tried printing to a file, and that works but it's slow and I need to read it again for further processing. I've tried appending to a global variable without success. E.g. using a variable inside get_url that is defined outside it generates an error, eg:
NameError: name 'my_list' is not defined or
UnboundLocalError: local variable 'my_list' referenced before assignment
Thanks #python_user that's exactly what I was missing and the returned type is indeed a simple list. I think I'd tried to pick up the responses inside the await part which doesn't work.
My updated PoC code below.
Adapting this for the API, JSON and pandas should now be easy : )
import asyncio, aiohttp
async def get_url(session, url, timeout=300):
async with session.get(url, timeout=timeout) as response:
http = await response.text()
return http[:80] # becomes a list element
async def async_payload_wrapper(async_loop):
# test with 2 urls as PoC
urls = ['https://google.com','https://yahoo.com']
async with aiohttp.ClientSession(loop=async_loop) as session:
urls_to_check = [get_url(session, url) for url in urls]
responses = await asyncio.gather(*urls_to_check)
print(type(responses))
print(responses)
if __name__ == '__main__':
event_loop = asyncio.get_event_loop()
event_loop.run_until_complete(async_payload_wrapper(event_loop))
I have a question regarding asynchronous requests:
How do I save response.json() to a file, on the fly?
I want to make a request and save response to a .json file, without keeping it in memory.
import asyncio
import aiohttp
async def fetch(sem, session, url):
async with sem:
async with session.get(url) as response:
return await response.json() # here
async def fetch_all(urls, loop):
sem = asyncio.Semaphore(4)
async with aiohttp.ClientSession(loop=loop) as session:
results = await asyncio.gather(
*[fetch(sem, session, url) for url in urls]
)
return results
if __name__ == '__main__':
urls = (
"https://public.api.openprocurement.org/api/2.5/tenders/6a0585fcfb05471796bb2b6a1d379f9b",
"https://public.api.openprocurement.org/api/2.5/tenders/d1c74ec8bb9143d5b49e7ef32202f51c",
"https://public.api.openprocurement.org/api/2.5/tenders/a3ec49c5b3e847fca2a1c215a2b69f8d",
"https://public.api.openprocurement.org/api/2.5/tenders/52d8a15c55dd4f2ca9232f40c89bfa82",
"https://public.api.openprocurement.org/api/2.5/tenders/b3af1cc6554440acbfe1d29103fe0c6a",
"https://public.api.openprocurement.org/api/2.5/tenders/1d1c6560baac4a968f2c82c004a35c90",
)
loop = asyncio.get_event_loop()
data = loop.run_until_complete(fetch_all(urls, loop))
print(data)
For now, the script just prints JSON files, and I can save them once they're all scraped:
data = loop.run_until_complete(fetch_all(urls, loop))
for i, resp in enumerate(data):
with open(f"{i}.json", "w") as f:
json.dump(resp, f)
But it doesn't feel right to me as it will definitely fail once I run out of memory for example.
Any suggestions?
Edit
Limited my post to only one question
How do I save response.json() to a file, on the fly?
Don't use response.json() in the first place, use the streaming API instead:
async def fetch(sem, session, url):
async with sem, session.get(url) as response:
with open("some_file_name.json", "wb") as out:
async for chunk in response.content.iter_chunked(4096)
out.write(chunk)
I am using the code below to get account information of one thousand instagram accounts using asycnio. In the initial requests the output is correct but after 10-20 calls, instagram starts returning loading page's HTML code. What could I be doing wrong here ? Below is the python code.
import random
import asyncio
from aiohttp import ClientSession
import urllib.request
import aiohttp
async def fetch(url, session,sem):
print("------")
print(url)
async with session.get(url = url) as response:
print(await response.text())
await response.text()
# exit()
if response.status == 200:
await sem.acquire()
fname = url[22:]
fname = fname.split('/')
fname = fname[0] + '.txt'
f = open(fname , 'w')
f.write(str(await response.text()))
sem.release()
# return (await response.text())
async def run(url_list):
tasks = []
# create instance of Semaphore
sem = asyncio.Semaphore(2)
# Create client session that will ensure we dont open new connection
# per each request.
async with ClientSession() as session:
for url in url_list:
task = asyncio.ensure_future(fetch(url, session,sem))
tasks.append(task)
responses = asyncio.gather(*tasks)
await responses
# making the url list here
url_list = []
file = open('url.txt', 'r')
for url in file:
url_list.append(url)
print(url_list)
import time
old = time.time()
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(url_list))
loop.run_until_complete(future)
print(time.time() - old)
Here are some of the URL's from url.txt file
https://instagram.com/johanna_kre/?__a=1
https://instagram.com/channie_f/?__a=1
https://instagram.com/lilakuh68/?__a=1
https://instagram.com/nataliacallisto/?__a=1
https://instagram.com/edbastian/?__a=1
https://instagram.com/sylvana.h/?__a=1
https://instagram.com/munich_bombon/?__a=1
https://instagram.com/younotus/?__a=1
https://instagram.com/meet.herbert/?__a=1
https://instagram.com/inaaogo/?__a=1
https://instagram.com/dennisaogo/?__a=1
https://instagram.com/mrslight__/?__a=1
https://instagram.com/reneturrek/?__a=1
https://instagram.com/_eeasyyy/?__a=1
https://instagram.com/sentinobln/?__a=1
https://instagram.com/eri.ka_g/?__a=1
Your semaphore is not limiting the requests as you want it to; you should acquire it before making the request, not before processing the content.
With your current implementation you are making 100 concurrent requests (aiohttp's client default limit) but only process the responses two at a time (however at this point from the server's perspective the requests are already processed).
Use:
async def fetch(url, session,sem):
print("------")
print(url)
await sem.acquire()
async with session.get(url = url) as response:
print(await response.text())
await response.text()
...
sem.release()
...
I'm trying to have a small function scraping data from a JSON end point,
the url is like https://xxxxxxxx.com/products.json?&page=" which I can insert a page number,
While I was using requests module I just had a while loop and incrementing the page number and break until I get a empty response (which page is empty)
Is there a possible way to do the same thing with aiohttp?
What I only achieved so far is just pre-genenrate certain number of urls and pass it into tasks
Wondering if I can use a loop as well and stop when see empty response
Thank you very much
'''
import asyncio
import aiohttp
async def download_one(url):
async with aiohttp.ClientSession() as session:
async with session.get(url) as resp:
pprint.pprint(await resp.json(content_type=None))
async def download_all(sites):
tasks = [asyncio.create_task(download_one(site)) for site in sites]
await asyncio.gather(*tasks)
def main():
sites = list(map(lambda x: request_url + str(x), range(1, 50)))
asyncio.run(download_all(sites))
'''
Here is a piece of untested code. Even if it won't work, it will give you an idea how to do the job
import asyncio
import aiohttp
async def download_one(session, url):
async with session.get(url) as resp:
resp = await resp.json()
if not resp:
raise Exception("No data found") # needs to be there for breaking the loop
async def download_all(sites):
async with aiohttp.ClientSession() as session:
futures = [download_one(session, site) for site in sites]
done, pending = await asyncio.wait(
futures, return_when=FIRST_EXCEPTION # will return the result when exception is raised by any future
)
for future in pending:
future.cancel() # it will shut down all redundant jobs
def main():
sites = list(map(lambda x: request_url + str(x), range(1, 50)))
asyncio.run_until_complete(download_all(sites))
When I run this it lists off the websites in the database one by one with the response code and it takes about 10 seconds to run through a very small list. It should be way faster and isn't running asynchronously but I'm not sure why.
import dblogin
import aiohttp
import asyncio
import async_timeout
dbconn = dblogin.connect()
dbcursor = dbconn.cursor(buffered=True)
dbcursor.execute("SELECT thistable FROM adatabase")
website_list = dbcursor.fetchall()
async def fetch(session, url):
with async_timeout.timeout(30):
async with session.get(url, ssl=False) as response:
await response.read()
return response.status, url
async def main():
async with aiohttp.ClientSession() as session:
for all_urls in website_list:
url = all_urls[0]
resp = await fetch(session, url)
print(resp, url)
if __name__ == '__main__':
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()
dbcursor.close()
dbconn.close()
This article explains the details. What you need to do is pass each fetch call in a Future object, and then pass a list of those to either asyncio.wait or asyncio.gather depending on your needs.
Your code would look something like this:
async def fetch(session, url):
with async_timeout.timeout(30):
async with session.get(url, ssl=False) as response:
await response.read()
return response.status, url
async def main():
tasks = []
async with aiohttp.ClientSession() as session:
for all_urls in website_list:
url = all_urls[0]
task = asyncio.create_task(fetch(session, url))
tasks.append(task)
responses = await asyncio.gather(*tasks)
if __name__ == '__main__':
loop = asyncio.get_event_loop()
future = asyncio.create_task(main())
loop.run_until_complete(future)
Also, are you sure that loop.close() call is needed? The docs mention that
The loop must not be running when this function is called. Any pending callbacks will be discarded.
This method clears all queues and shuts down the executor, but does not wait for the executor to finish.
As mentioned in the docs and in the link that #user4815162342 posted, it is better to use the create_task method instead of the ensure_future method when we know that the argument is a coroutine. Note that this was added in Python 3.7, so previous versions should continue using ensure_future instead.