I am trying to achieve streaming in Python. I have a requirement to return a huge result set from a cursor through a REST API. I am using Flask's stream_with_context while returning responses, but when I request with the stream=True flag, my request waits for a response until the cursor has processed all the data.
I expect to receive the 10 elements one by one as each is yielded, but my requester in test.py waits until all elements are processed by service_runner.py.
Here are the code samples.
service_runner.py
from flask import Flask, stream_with_context, Response
import time, json

app = Flask(__name__)

@app.route('/')
def hello_world():
    return 'Hello, World!'

@app.route('/StreamData')
def StreamData():
    def stream1():
        for i in range(10):
            print(i)
            time.sleep(1)  # this is to see whether the requester receives the stream or not
            yield json.dumps(i)
    return Response(stream_with_context(stream1()))
test.py
import requests, asyncio, aiohttp

URL = 'http://127.0.0.1:5000/StreamData'

def TestStream():
    req1 = requests.get(URL, stream=True)
    print(req1)
    for r in req1.iter_lines(chunk_size=1):
        print(r)

async def TestWithAsync():
    async with aiohttp.ClientSession() as session:
        async with session.get(URL) as resp:
            print(await resp.content.read())

def main():
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(TestWithAsync())
    event_loop.close()

if __name__ == '__main__':
    TestStream()
    main()
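A note on the sample above (a guess, not a verified fix): the server yields json.dumps(i) with no newline, so requests' iter_lines keeps buffering on the client until the stream ends, and the aiohttp client calls resp.content.read(), which by definition waits for the whole body. Below is a sketch of both clients reworked to consume newline-delimited chunks, assuming service_runner.py is changed to yield json.dumps(i) + '\n':

# Sketch only: assumes the server yields newline-terminated chunks,
# i.e. `yield json.dumps(i) + '\n'` instead of `yield json.dumps(i)`.
import requests, asyncio, aiohttp

URL = 'http://127.0.0.1:5000/StreamData'

def test_stream_lines():
    with requests.get(URL, stream=True) as req1:
        for line in req1.iter_lines():   # splits on the newlines the server emits
            if line:
                print(line)

async def test_with_async_lines():
    async with aiohttp.ClientSession() as session:
        async with session.get(URL) as resp:
            async for line in resp.content:   # iterate the body line by line as it arrives
                print(line)

if __name__ == '__main__':
    test_stream_lines()
    asyncio.run(test_with_async_lines())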
I'm working with asyncio and aiohttp to call an API many times. While I can print the responses, I want to collate them into a combined structure, such as a list or a pandas DataFrame.
In my example code I'm connecting to 2 URLs and printing a chunk of each response. How can I collate the responses and access them all?
import asyncio, aiohttp

async def get_url(session, url, timeout=300):
    async with session.get(url, timeout=timeout) as response:
        http = await response.text()
        print(str(http[:80]) + '\n')
        return http  # becomes a list item when gathered

async def async_payload_wrapper(async_loop):
    # test with 2 urls as PoC
    urls = ['https://google.com', 'https://yahoo.com']
    async with aiohttp.ClientSession(loop=async_loop) as session:
        urls_to_check = [get_url(session, url) for url in urls]
        await asyncio.gather(*urls_to_check)

if __name__ == '__main__':
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(async_payload_wrapper(event_loop))
I've tried printing to a file, and that works, but it's slow and I need to read the file again for further processing. I've tried appending to a global variable without success; for example, using a variable inside get_url that is defined outside it raises an error such as:
NameError: name 'my_list' is not defined or
UnboundLocalError: local variable 'my_list' referenced before assignment
Thanks @python_user, that's exactly what I was missing, and the returned type is indeed a simple list. I think I had tried to pick up the responses inside the await part, which doesn't work.
My updated PoC code is below.
Adapting this for the API, JSON and pandas should now be easy : )
import asyncio, aiohttp

async def get_url(session, url, timeout=300):
    async with session.get(url, timeout=timeout) as response:
        http = await response.text()
        return http[:80]  # becomes a list element

async def async_payload_wrapper(async_loop):
    # test with 2 urls as PoC
    urls = ['https://google.com', 'https://yahoo.com']
    async with aiohttp.ClientSession(loop=async_loop) as session:
        urls_to_check = [get_url(session, url) for url in urls]
        responses = await asyncio.gather(*urls_to_check)
        print(type(responses))
        print(responses)

if __name__ == '__main__':
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(async_payload_wrapper(event_loop))
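For the pandas step mentioned above, here is a minimal sketch of one way to turn the gathered responses into a DataFrame. The URLs and the use of response.json() are placeholders rather than part of the original code, and each endpoint is assumed to return one JSON object per response:

import asyncio, aiohttp
import pandas as pd

async def get_json(session, url, timeout=300):
    async with session.get(url, timeout=timeout) as response:
        return await response.json()  # one parsed payload per URL

async def gather_to_dataframe(urls):
    async with aiohttp.ClientSession() as session:
        coros = [get_json(session, url) for url in urls]
        records = await asyncio.gather(*coros)  # list of parsed JSON payloads, in input order
    return pd.DataFrame(records)                # one row per response

if __name__ == '__main__':
    # placeholder URLs; substitute the real API endpoints here
    urls = ['https://example.com/api/item/1', 'https://example.com/api/item/2']
    df = asyncio.run(gather_to_dataframe(urls))
    print(df.head())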
I'm trying to write a small function that scrapes data from a JSON endpoint.
The URL is like https://xxxxxxxx.com/products.json?&page= , to which I append a page number.
While I was using the requests module, I just had a while loop that incremented the page number and broke when I got an empty response (i.e. the page is empty).
Is there a possible way to do the same thing with aiohttp?
What I have achieved so far is just pre-generating a certain number of URLs and passing them into tasks.
I'm wondering if I can use a loop as well and stop when I see an empty response.
Thank you very much.
import asyncio
import aiohttp
import pprint

async def download_one(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            pprint.pprint(await resp.json(content_type=None))

async def download_all(sites):
    tasks = [asyncio.create_task(download_one(site)) for site in sites]
    await asyncio.gather(*tasks)

def main():
    sites = list(map(lambda x: request_url + str(x), range(1, 50)))  # request_url is the base URL, defined elsewhere
    asyncio.run(download_all(sites))
Here is a piece of untested code. Even if it doesn't work as-is, it should give you an idea of how to do the job.
import asyncio
import aiohttp

async def download_one(session, url):
    async with session.get(url) as resp:
        resp = await resp.json()
        if not resp:
            raise Exception("No data found")  # needs to be there for breaking the loop

async def download_all(sites):
    async with aiohttp.ClientSession() as session:
        futures = [asyncio.create_task(download_one(session, site)) for site in sites]
        done, pending = await asyncio.wait(
            futures, return_when=asyncio.FIRST_EXCEPTION  # returns as soon as any future raises
        )
        for future in pending:
            future.cancel()  # shuts down all redundant jobs

def main():
    sites = list(map(lambda x: request_url + str(x), range(1, 50)))  # request_url is the base URL, defined elsewhere
    asyncio.run(download_all(sites))
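If you would rather keep the simple "increment the page until it comes back empty" loop from your requests version, a sequential sketch of that shape with aiohttp is below; base_url is a placeholder for the real products.json?&page= endpoint, and the emptiness check mirrors the one above. Note that this gives up the concurrency that the FIRST_EXCEPTION approach keeps.

import asyncio
import aiohttp

async def download_pages(base_url):
    # Sequential paging: fetch page 1, 2, 3, ... and stop at the first empty page.
    results = []
    async with aiohttp.ClientSession() as session:
        page = 1
        while True:
            async with session.get(base_url + str(page)) as resp:
                data = await resp.json(content_type=None)
            if not data:
                break  # empty page means we have reached the end
            results.append(data)
            page += 1
    return results

if __name__ == '__main__':
    # base_url is a placeholder, e.g. "https://example.com/products.json?&page="
    pages = asyncio.run(download_pages("https://example.com/products.json?&page="))
    print(len(pages), "pages downloaded")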
When I run this, it lists the websites in the database one by one with the response code, and it takes about 10 seconds to run through a very small list. It should be much faster, and it isn't running asynchronously, but I'm not sure why.
import dblogin
import aiohttp
import asyncio
import async_timeout

dbconn = dblogin.connect()
dbcursor = dbconn.cursor(buffered=True)
dbcursor.execute("SELECT thistable FROM adatabase")
website_list = dbcursor.fetchall()

async def fetch(session, url):
    with async_timeout.timeout(30):
        async with session.get(url, ssl=False) as response:
            await response.read()
            return response.status, url

async def main():
    async with aiohttp.ClientSession() as session:
        for all_urls in website_list:
            url = all_urls[0]
            resp = await fetch(session, url)
            print(resp, url)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    loop.close()
    dbcursor.close()
    dbconn.close()
This article explains the details. What you need to do is wrap each fetch call in a Task, and then pass a list of those to either asyncio.wait or asyncio.gather, depending on your needs.
Your code would look something like this:
async def fetch(session, url):
    with async_timeout.timeout(30):
        async with session.get(url, ssl=False) as response:
            await response.read()
            return response.status, url

async def main():
    tasks = []
    async with aiohttp.ClientSession() as session:
        for all_urls in website_list:
            url = all_urls[0]
            task = asyncio.create_task(fetch(session, url))
            tasks.append(task)
        responses = await asyncio.gather(*tasks)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
Also, are you sure that loop.close() call is needed? The docs mention that
The loop must not be running when this function is called. Any pending callbacks will be discarded.
This method clears all queues and shuts down the executor, but does not wait for the executor to finish.
As mentioned in the docs and in the link that @user4815162342 posted, it is better to use the create_task method instead of the ensure_future method when we know that the argument is a coroutine. Note that create_task was added in Python 3.7, so earlier versions should continue using ensure_future instead.
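A tiny illustration of the two spellings (just a sketch; coro is a placeholder coroutine):

import asyncio

async def coro():
    return 42

async def main():
    task = asyncio.create_task(coro())       # Python 3.7+: schedule inside the running loop
    # task = asyncio.ensure_future(coro())   # equivalent spelling on older versions
    print(await task)

asyncio.run(main())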
First of all, here's the code:
import random
import asyncio
from aiohttp import ClientSession
import csv

headers = []

def extractsites(file):
    sites = []
    readfile = open(file, "r")
    reader = csv.reader(readfile, delimiter=",")
    raw = list(reader)
    for a in raw:
        sites.append((a[1]))
    return sites

async def fetchheaders(url, session):
    async with session.get(url) as response:
        responseheader = await response.headers
        print(responseheader)
        return responseheader

async def bound_fetch(sem, url, session):
    async with sem:
        print("doing request for " + url)
        await fetchheaders(url, session)

async def run():
    urls = extractsites("cisco-umbrella.csv")
    tasks = []
    # create instance of Semaphore
    sem = asyncio.Semaphore(100)
    async with ClientSession() as session:
        for i in urls:
            task = asyncio.ensure_future(bound_fetch(sem, "http://" + i, session))
            tasks.append(task)
        return tasks

def main():
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(run())
    loop.run_until_complete(future)

if __name__ == '__main__':
    main()
Most of this code was taken from this blog post:
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
Here is the problem I'm facing: I am trying to read a million URLs from a file and then make an async request for each of them.
But when I try to execute the code above, I get the Session expired error.
This is my line of thought:
I am relatively new to async programming, so bear with me.
My thought process was to create a long task list (that only allows 100 parallel requests) that I build in the run function, and then pass as a future to the event loop to execute.
I have included a print debug in bound_fetch (which I copied from the blog post), and it looks like it loops over all the URLs I have, but as soon as it should start making requests in the fetchheaders function I get the runtime errors.
How do I fix my code?
A couple of things here.
First, in your run function you actually want to gather the tasks there and await them to fix your session issue, like so:
async def run():
    urls = ['google.com', 'amazon.com']
    tasks = []
    # create instance of Semaphore
    sem = asyncio.Semaphore(100)
    async with ClientSession() as session:
        for i in urls:
            task = asyncio.ensure_future(bound_fetch(sem, "http://" + i, session))
            tasks.append(task)
        await asyncio.gather(*tasks)
Second, the aiohttp API is a little odd in dealing with headers in that you can't await them. I worked around this by awaiting the body so that the headers are populated, and then returning the headers:
async def fetchheaders(url, session):
    async with session.get(url) as response:
        data = await response.read()
        responseheader = response.headers
        print(responseheader)
        return responseheader
There is some additional overhead here in pulling the body, however; I couldn't find another way to load the headers without doing a body read.
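For what it's worth, and not verified against the aiohttp version used above: response.headers appears to be an ordinary property that is already populated once session.get() returns, so a variant without the body read may also work:

async def fetchheaders(url, session):
    async with session.get(url) as response:
        # headers are parsed when the response arrives; no explicit body read should be required
        responseheader = response.headers
        print(responseheader)
        return responseheader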
from aiohttp import web
from aiohttp import ClientSession

# this would go in a different file but keep it simple for now
class Generate:
    # Get a person object from my website
    async def get_person(self):
        async with ClientSession() as session:
            async with session.get('http://surveycodebot.com/person/generate') as response:
                resp = await response.json()
                # this prints the person
                print(resp)
                return resp

    # loops `get_person` to get more than 1 person
    async def get_people(self):
        # array for gathering all responses
        for _ in range(0, 10):
            resp = await self.get_person()
        return resp

# class to handle '/'
class HomePage(web.View):
    async def get(self):
        # initiate the Generate class and call get_people
        await Generate().get_people()
        return web.Response(text="Hello, world")

if __name__ == "__main__":
    app = web.Application()
    app.router.add_get('/', HomePage)
    web.run_app(app)
The code works and everything is fine. I was wondering why HomePage takes a while to load. I think I should be using yield on line 28, but it barfs when I do that. Thanks.
You can optimize by sharing the session between several client requests via the aiohttp on_startup signal.
Something like the following will do:
import asyncio
from aiohttp import web
from aiohttp import ClientSession

class Generate:
    def __init__(self, session):
        self.session = session

    # Get a person object from my website
    async def get_person(self):
        async with self.session.get('http://surveycodebot.com/person/generate') as response:
            resp = await response.json()
            # this prints the person
            print(resp)
            return resp

    # loops `get_person` to get more than 1 person
    async def get_people(self):
        # array for gathering all responses
        for _ in range(0, 10):
            resp = await self.get_person()
        return resp

# class to handle '/'
class HomePage(web.View):
    async def get(self):
        # call get_people on the shared Generate instance created at startup
        await app['generate'].get_people()
        return web.Response(text="Hello, world")

async def on_startup(app):
    session = ClientSession()
    app['generate'] = Generate(session)

if __name__ == "__main__":
    app = web.Application()
    app.router.add_get('/', HomePage)
    app.on_startup.append(on_startup)
    web.run_app(app)
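One small follow-up, which is my own addition rather than part of the answer above: the shared ClientSession is never closed when the app shuts down. aiohttp's on_cleanup signal can take care of that:

async def on_cleanup(app):
    # close the shared session when the application shuts down
    await app['generate'].session.close()

# registered the same way as on_startup:
# app.on_cleanup.append(on_cleanup)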