I'm using Asyncio and aiohttp to asynchronously get files from an endpoint. My status codes for the request are successful but when I try to write the files everything is always empty for some reason.
This is what my code looks like right now:
async def download_link(url:str,my_session:ClientSession, filename:Path):
async with my_session.get(url, allow_redirects=True) as response:
with filename.open(mode='wb') as f: #Line 3
await f.write(response.content)
async def download_all(urls:list, filenames:list):
my_conn = aiohttp.TCPConnector(limit=10)
async with aiohttp.ClientSession(connector=my_conn) as session:
tasks = []
for item in zip(urls,file_names):
task = asyncio.ensure_future(download_link(url=item[0],my_session=session, filename=item[1]))
tasks.append(task)
await asyncio.gather(*tasks,return_exceptions=True)
I've also tried putting async in front of the with on line 3, inside the download_link function. I've also tried moving the code that opens the file and writes into it into a separate async function, as such:
async def store_response(response, filename:Path):
async with filename.open(model='wb') as f:
f.write(response.content)
I know the files I'm fetching from do have data, when I use multi-threading I'm able to get data back. Anyone know why this is happening?
I have used this code to download files asynchronously with no problem and good speed.
import asyncio
import aiohttp
import aiofile
async def download_file(url: str):
    """Stream the resource at *url* into a local file.

    The file name is the last path segment of the URL with any query
    string removed. A non-OK status or a timeout is reported with
    ``print`` rather than raised.
    """
    # Drop the query string, then keep whatever follows the final slash.
    without_query = url.partition('?')[0]
    filename = without_query.rsplit('/', 1)[-1]
    chunk_size = 1024 * 512  # bytes requested per chunk

    async with aiohttp.ClientSession() as session, session.get(url) as resp:
        if not resp.ok:
            print(f"Invalid status code: {resp.status}")
            return
        try:
            async with aiofile.async_open(filename, "wb+") as afp:
                # Write chunk by chunk so the whole body is never held in memory.
                async for chunk in resp.content.iter_chunked(chunk_size):
                    await afp.write(chunk)
        except asyncio.TimeoutError:
            print(f"A timeout ocurred while downloading '{filename}'")
asyncio.run(download_file("https://www.python.org/static/img/python-logo.png"))
Related
I have a question regarding asynchronous requests:
How do I save response.json() to a file, on the fly?
I want to make a request and save response to a .json file, without keeping it in memory.
import asyncio
import aiohttp
async def fetch(sem, session, url):
async with sem:
async with session.get(url) as response:
return await response.json() # here
async def fetch_all(urls, loop):
sem = asyncio.Semaphore(4)
async with aiohttp.ClientSession(loop=loop) as session:
results = await asyncio.gather(
*[fetch(sem, session, url) for url in urls]
)
return results
if __name__ == '__main__':
urls = (
"https://public.api.openprocurement.org/api/2.5/tenders/6a0585fcfb05471796bb2b6a1d379f9b",
"https://public.api.openprocurement.org/api/2.5/tenders/d1c74ec8bb9143d5b49e7ef32202f51c",
"https://public.api.openprocurement.org/api/2.5/tenders/a3ec49c5b3e847fca2a1c215a2b69f8d",
"https://public.api.openprocurement.org/api/2.5/tenders/52d8a15c55dd4f2ca9232f40c89bfa82",
"https://public.api.openprocurement.org/api/2.5/tenders/b3af1cc6554440acbfe1d29103fe0c6a",
"https://public.api.openprocurement.org/api/2.5/tenders/1d1c6560baac4a968f2c82c004a35c90",
)
loop = asyncio.get_event_loop()
data = loop.run_until_complete(fetch_all(urls, loop))
print(data)
For now, the script just prints JSON files, and I can save them once they're all scraped:
data = loop.run_until_complete(fetch_all(urls, loop))
for i, resp in enumerate(data):
with open(f"{i}.json", "w") as f:
json.dump(resp, f)
But it doesn't feel right to me as it will definitely fail once I run out of memory for example.
Any suggestions?
Edit
Limited my post to only one question
How do I save response.json() to a file, on the fly?
Don't use response.json() in the first place, use the streaming API instead:
async def fetch(sem, session, url, filename="some_file_name.json"):
    """Stream the body of *url* to disk without buffering it in memory.

    Args:
        sem: semaphore (used as an async context manager) that bounds the
            number of requests in flight.
        session: client session whose ``get`` issues the request.
        url: address to download.
        filename: destination path. Pass a distinct name per URL; calls
            that share the default all write to the same file.
    """
    async with sem, session.get(url) as response:
        with open(filename, "wb") as out:
            # The original loop header was missing its trailing colon.
            async for chunk in response.content.iter_chunked(4096):
                out.write(chunk)
I'm trying to have a small function scraping data from a JSON end point,
the url is like https://xxxxxxxx.com/products.json?&page=" which I can insert a page number,
While I was using the requests module I just had a while loop that incremented the page number and broke out once I got an empty response (i.e. the page was empty).
Is there a possible way to do the same thing with aiohttp?
All I have achieved so far is to pre-generate a certain number of URLs and pass them in as tasks.
Wondering if I can use a loop as well and stop when see empty response
Thank you very much
'''
import asyncio
import aiohttp
async def download_one(url):
async with aiohttp.ClientSession() as session:
async with session.get(url) as resp:
pprint.pprint(await resp.json(content_type=None))
async def download_all(sites):
tasks = [asyncio.create_task(download_one(site)) for site in sites]
await asyncio.gather(*tasks)
def main():
sites = list(map(lambda x: request_url + str(x), range(1, 50)))
asyncio.run(download_all(sites))
'''
Here is a piece of untested code. Even if it won't work, it will give you an idea how to do the job
import asyncio
import aiohttp
async def download_one(session, url):
    """Fetch *url* and return its decoded JSON payload.

    Args:
        session: client session whose ``get`` issues the request.
        url: address of the JSON page to fetch.

    Returns:
        The decoded JSON document (previously parsed and then discarded).

    Raises:
        Exception: when the payload is empty. The caller relies on this
            to detect the first empty page and stop.
    """
    async with session.get(url) as resp:
        # Use a separate name so the response object is not shadowed.
        data = await resp.json()
    if not data:
        raise Exception("No data found")  # needs to be there for breaking the loop
    return data
async def download_all(sites):
    """Download every URL in *sites* concurrently, stopping at the first failure.

    All downloads share one client session. As soon as any download raises
    (for example on an empty page), the downloads still running are
    cancelled.

    Args:
        sites: iterable of URLs to fetch.
    """
    async with aiohttp.ClientSession() as session:
        # asyncio.wait() needs Task objects; bare coroutines are rejected
        # on Python 3.11+.
        tasks = [asyncio.ensure_future(download_one(session, site)) for site in sites]
        if not tasks:
            return  # asyncio.wait() raises ValueError on an empty collection

        done, pending = await asyncio.wait(
            tasks,
            # will return as soon as any task raises an exception
            return_when=asyncio.FIRST_EXCEPTION,
        )
        for task in pending:
            task.cancel()  # it will shut down all redundant jobs
        # Let the cancellations finish before the session is closed.
        await asyncio.gather(*pending, return_exceptions=True)
        # Retrieve the expected exception so asyncio does not log
        # "Task exception was never retrieved".
        for task in done:
            if not task.cancelled():
                task.exception()
def main():
    """Build the URLs for pages 1..49 and download them all.

    ``request_url`` is not defined in this snippet; it is presumably the
    base URL ending in ``page=`` and must be defined by the caller's module.
    """
    sites = [request_url + str(page) for page in range(1, 50)]
    # asyncio has no run_until_complete(); asyncio.run() creates the loop,
    # runs the coroutine and closes the loop.
    asyncio.run(download_all(sites))
I want to go through a huge list of urls and send requests to them asynchronously.
Since the CSV-file with the urls is too big to load it at once, I would like to read the lines row by row, and each time the row is loaded, it should start a request and save the result to a file.
My problem is that if I understood it right when I use asyncio.gather all tasks have to be collected at once.
It would be great if you could tell me how to change my code, to get it to send asynchronous requests for each row of the csv file.
Here is the code I am stuck with:
import asyncio
import aiohttp
import async_timeout
import csv
async def fetch( session, url ):
async with async_timeout.timeout(10):
try:
async with session.get(url) as response:
return response
except Exception as e:
print(str( e ))
return False
async def write_result( result ):
with open( 'results.csv', 'a' ) as csv_file:
writer = csv.writer( csv_file )
writer.writerow( result )
async def validate_page( session, url ):
response = await fetch( session, url )
await write_result( response )
async def main():
async with aiohttp.ClientSession() as session:
with open('urls.csv') as csv_file:
for row in csv.reader( csv_file ):
await validate_page( session, row )
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
To process each line from csv file asynchronously use the following approach.
A bunch of optimizations and restructuring of your current approach:
no need to create a csv.reader for the csv input file if it contains only one url on each line (just iterate over the file object)
no need to wrap with additional async with async_timeout.timeout(10) as aiohttp.ClientSession itself has timeout option
definitely no need to construct a new writer = csv.writer( csv_file ) for each processed url (with consequent result) - create writer object just once (ensuring graceful writing with asyncio.Lock - see below)
import asyncio
import aiohttp
import csv
async def fetch(session, url):
try:
async with session.get(url, timeout=10) as response:
return await response.text()
except Exception as e:
print(url, str(e))
return False
# One lock shared by every write_result() call. A lock created inside the
# function is private to that call and therefore serializes nothing.
_write_lock = asyncio.Lock()


async def write_result(result, writer):
    """Write one row for *result* through the shared csv *writer*.

    Args:
        result: the fetched value to record.
        writer: object with a ``writerow`` method (e.g. ``csv.writer``)
            shared by all concurrent callers.
    """
    async with _write_lock:  # lock for gracefully write to shared file object
        # The original line was a template placeholder, not valid Python.
        res = [result]  # <- adjust a resulting list of strings
        writer.writerow(res)
async def validate_page(session, url, writer):
res = await fetch(session, url)
if res:
await write_result(res, writer)
async def main():
    """Validate every URL listed in ``urls.csv`` and append results to ``results.csv``.

    One client session and one csv writer are shared by all requests,
    which run concurrently.
    """
    async with aiohttp.ClientSession() as session:
        # newline='' is what the csv module requires for files handed to
        # csv.writer; without it Windows output gets a blank line after
        # every row.
        with open('urls.csv') as csv_in, open('results.csv', 'a', newline='') as csv_out:
            writer = csv.writer(csv_out, delimiter=',')
            # Skip blank lines so no request is issued for an empty URL.
            aws = [
                validate_page(session, url, writer)
                for url in map(str.strip, csv_in)
                if url
            ]
            await asyncio.gather(*aws)
            print('!--- finished processing')
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
First of all, here's the code:
import random
import asyncio
from aiohttp import ClientSession
import csv
headers =[]
def extractsites(file):
    """Return the second column of every row in the comma-separated file *file*.

    Args:
        file: path of the CSV file to read.

    Returns:
        A list with the value at index 1 of each row, in file order.

    Raises:
        IndexError: if a row has fewer than two columns.
    """
    # The with-block closes the handle the original left open; newline=""
    # is what the csv module asks for on files passed to csv.reader.
    with open(file, "r", newline="") as readfile:
        return [row[1] for row in csv.reader(readfile, delimiter=",")]
async def fetchheaders(url, session):
async with session.get(url) as response:
responseheader = await response.headers
print(responseheader)
return responseheader
async def bound_fetch(sem, url, session):
async with sem:
print("doing request for "+ url)
await fetchheaders(url, session)
async def run():
urls = extractsites("cisco-umbrella.csv")
tasks = []
# create instance of Semaphore
sem = asyncio.Semaphore(100)
async with ClientSession() as session:
for i in urls:
task = asyncio.ensure_future(bound_fetch(sem, "http://"+i, session))
tasks.append(task)
return tasks
def main():
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
if __name__ == '__main__':
main()
Most of this code was taken from this blog post:
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
Here is my problem that I'm facing: I am trying to read a million urls from a file and then make async request for each of them.
But when I try to execute the code above I get the Session expired error.
This is my line of thought:
I am relatively new to async programming so bear with me.
My thought process was to create a long task list (that only allows 100 parallel requests), which I build in the run function, and then pass it as a future to the event loop to execute.
I have included a print debug in the bound_fetch (which I copied from the blog post) and it looks like it loops over all urls that I have and as soon as it should start making requests in the fetchheaders function I get the runtime errors.
How do I fix my code ?
A couple things here.
First, in your run function you actually want to gather the tasks there and await them to fix your session issue, like so:
async def run():
    """Request headers for a fixed pair of hosts with at most 100 requests in flight."""
    hosts = ['google.com', 'amazon.com']
    # create instance of Semaphore
    limiter = asyncio.Semaphore(100)
    async with ClientSession() as session:
        scheduled = [
            asyncio.ensure_future(bound_fetch(limiter, "http://" + host, session))
            for host in hosts
        ]
        # Await inside the session block so the session stays open until
        # every request has finished.
        await asyncio.gather(*scheduled)
Second, the aiohttp API is a little odd in dealing with headers in that you can't await them. I worked around this by awaiting body so that headers are populated and then returning the headers:
async def fetchheaders(url, session):
    """GET *url*, read the whole body, then print and return the response headers."""
    async with session.get(url) as response:
        # The body is consumed but its bytes are not used.
        # NOTE(review): aiohttp appears to populate response.headers as soon
        # as the response starts, so this read may be unnecessary - confirm.
        await response.read()
        headers = response.headers
        print(headers)
        return headers
There is some additional overhead here in pulling the body however. I couldn't find another way to load headers though without doing a body read.
So I have a discord bot that I'm playing with to learn Python. I have a command that downloads images, edits/merges them, then sends the edited image to chat. I was using requests to do this before, but I was told by one of the library devs for discord.py that I should be using aiohttp instead of requests. I can't find how to download images with aiohttp; I've tried a bunch of stuff, but none of it works.
if message.content.startswith("!async"):
import aiohttp
import random
import time
import shutil
start = time.time()
notr = 0
imagemake = Image.new("RGBA",(2048,2160))
imgsave = "H:\Documents\PyCharmProjects\ChatBot\Images"
imagesend = os.path.join(imgsave,"merged.png")
imgmergedsend =os.path.join(imgsave,"merged2.png")
with aiohttp.ClientSession() as session:
async with session.get("http://schoolido.lu/api/cards/788/") as resp:
data = await resp.json()
cardsave = session.get(data["card_image"])
with open((os.path.join(imgsave, "card.png")),"wb") as out_file:
shutil.copyfileobj(cardsave, out_file)
is what I have right now, but that still doesn't work.
So, is there a way to download images?
You block the event loop when you write the file. You need to use aiofiles.
import aiohttp
import aiofiles
async with aiohttp.ClientSession() as session:
url = "http://host/file.img"
async with session.get(url) as resp:
if resp.status == 200:
f = await aiofiles.open('/some/file.img', mode='wb')
await f.write(await resp.read())
await f.close()
So I figured it out, a while ago:
if message.content.startswith("!async2"):
import aiohttp
with aiohttp.ClientSession() as session:
async with session.get("http://schoolido.lu/api/cards/788/") as resp:
data = await resp.json()
card = data["card_image"]
async with session.get(card) as resp2:
test = await resp2.read()
with open("cardtest2.png", "wb") as f:
f.write(test)
I was getting a response, not an image response
pdf_url = 'https://example.com/file.pdf'
async with aiohttp.ClientSession() as session:
async with session.get(pdf_url) as resp:
if resp.status == 200:
with open('file.pdf', 'wb') as fd:
async for chunk in resp.content.iter_chunked(10):
fd.write(chunk)
While methods read(), json() and text() are very convenient you should
use them carefully. All these methods load the whole response in
memory. For example if you want to download several gigabyte sized
files, these methods will load all the data in memory. Instead you can
use the content attribute. It is an instance of the
aiohttp.StreamReader class. The gzip and deflate transfer-encodings
are automatically decoded for you:
https://docs.aiohttp.org/en/stable/client_quickstart.html#streaming-response-content