from aiohttp import web
from aiohttp import ClientSession
# this would go in a different file but keep it simple for now
class Generate:
    # Get a person object from my website
    async def get_person(self):
        async with ClientSession() as session:
            async with session.get('http://surveycodebot.com/person/generate') as response:
                resp = await response.json()
                # this prints the person
                print(resp)
                return resp

    # loops `get_person` to get more than 1 person
    async def get_people(self):
        # calls `get_person` ten times; only the last response is kept
        for _ in range(0, 10):
            resp = await self.get_person()
        return resp
# class to handle '/'
class HomePage(web.View):
    async def get(self):
        # initiate the Generate class and call get_people
        await Generate().get_people()
        return web.Response(text="Hello, world")

if __name__ == "__main__":
    app = web.Application()
    app.router.add_get('/', HomePage)
    web.run_app(app)
The code works and everything is fine. I was wondering why the HomePage takes a while to load. I think I should be using yield on the last line (web.run_app(app)), but it barfs when I do that. Thanks.
You can optimize by sharing the session between several client requests via the aiohttp on_startup signal.
Something like the following will do:
import asyncio
from aiohttp import web
from aiohttp import ClientSession

class Generate:
    def __init__(self, session):
        self.session = session

    # Get a person object from my website
    async def get_person(self):
        async with self.session.get('http://surveycodebot.com/person/generate') as response:
            resp = await response.json()
            # this prints the person
            print(resp)
            return resp

    # loops `get_person` to get more than 1 person
    async def get_people(self):
        # calls `get_person` ten times; only the last response is kept
        for _ in range(0, 10):
            resp = await self.get_person()
        return resp

# class to handle '/'
class HomePage(web.View):
    async def get(self):
        # look up the shared Generate instance and call get_people
        await self.request.app['generate'].get_people()
        return web.Response(text="Hello, world")

async def on_startup(app):
    session = ClientSession()
    app['generate'] = Generate(session)

if __name__ == "__main__":
    app = web.Application()
    app.router.add_get('/', HomePage)
    app.on_startup.append(on_startup)
    web.run_app(app)
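Note also that the ten calls in get_people still run one after another, which is the main reason the page feels slow. As a further sketch (not part of the answer above, just an illustration of asyncio.gather), the requests could be issued concurrently:

class Generate:
    # (__init__ and get_person unchanged from the code above)

    # sketch: issue the ten requests concurrently instead of one by one
    async def get_people(self):
        tasks = [self.get_person() for _ in range(10)]
        return await asyncio.gather(*tasks)  # list of the ten person dicts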
I have the following code:
from fastapi import FastAPI, WebSocket, BackgroundTasks
import uvicorn
import time

app = FastAPI()

def run_model():
    ...
    ## code of the model
    answer = [1, 2, 3]
    ...
    results = {"message": "the model has been executed successfully!", "results": answer}
    return results

@app.post("/execute-model")
async def ping(background_tasks: BackgroundTasks):
    background_tasks.add_task(run_model)
    return {"message": "the model is executing"}

@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
    while True:
        ## Here I want the results of run_model
        await websocket.send_text(1)

if __name__ == "__main__":
    uvicorn.run(app, host="localhost", port=8001)
I need to make a POST fetch to /execute-model. This endpoint will execute the run_model function as a background task. I need to return the answer to the frontend when run_model() finishes, and I thought about using websockets, but I don't know how to do it. Help please.
I had something similar. Here is how I did it (not saying it's the best or even a good solution, but it's working so far):
The route endpoint:
# client makes a post request, gets the saved model immediately, while a background task is started to process the image
@app.post("/analyse", response_model=schemas.ImageAnalysis, tags=["Image Analysis"])
async def create_image_analysis(
    img: schemas.ImageAnalysisCreate,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
):
    saved = crud.create_analysis(db=db, img=img)
    background_tasks.add_task(analyse_image, db=db, img=img)
    # model includes a ws_token (some random string) that the client can connect to right away
    return saved
The websocket endpoint:
#app.websocket("/ws/{ws_token}")
async def websocket_endpoint(websocket: WebSocket, ws_token: str):
#add the websocket to the connections dict (by ws_token)
await socket_connections.connect(websocket,ws_token=ws_token)
try:
while True:
print(socket_connections)
await websocket.receive_text() #not really necessary
except WebSocketDisconnect:
socket_connections.disconnect(websocket,ws_token=ws_token)
The analyse_image function:
# notice - the function is not async, as it does not work with background tasks otherwise!!
def analyse_image(db: Session, img: ImageAnalysis):
    print('analyse_image started')
    for index, round in enumerate(img.rounds):
        # some heavy workload etc

        # send update to user
        socket_connections.send_message({
            "status": EstimationStatus.RUNNING,
            "current_step": index + 1,
            "total_steps": len(img.rounds)
        }, ws_token=img.ws_token)
    print("analysis finished")
The connection Manager:
import asyncio
from typing import Dict, List
from fastapi import WebSocket

# notice: active_connections is changed to a dict (key = ws_token), so we know which user listens to which model
class ConnectionManager:
    def __init__(self):
        self.active_connections: Dict[str, List[WebSocket]] = {}

    async def connect(self, websocket: WebSocket, ws_token: str):
        await websocket.accept()
        if ws_token in self.active_connections:
            self.active_connections.get(ws_token).append(websocket)
        else:
            self.active_connections.update({ws_token: [websocket]})

    def disconnect(self, websocket: WebSocket, ws_token: str):
        self.active_connections.get(ws_token).remove(websocket)
        if len(self.active_connections.get(ws_token)) == 0:
            self.active_connections.pop(ws_token)

    # notice: changed from async to sync, as background tasks mess up with async functions
    def send_message(self, data: dict, ws_token: str):
        sockets = self.active_connections.get(ws_token)
        if sockets:
            # notice: socket send is originally async; we run it from synchronous code on a fresh event loop
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            for socket in sockets:
                loop.run_until_complete(socket.send_json(data))

socket_connections = ConnectionManager()
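For completeness, here is a rough client-side sketch of how a frontend could consume this flow. It is simulated in Python with the third-party websockets package (an assumption on my part, as are the localhost:8000 address and the empty payload): post to /analyse, read the ws_token from the returned model, then listen on the token-scoped websocket for progress messages.

import asyncio
import json

import requests
import websockets  # assumption: pip install websockets

async def listen(ws_token: str):
    # connect to the token-scoped endpoint and print progress updates as they arrive
    async with websockets.connect(f"ws://localhost:8000/ws/{ws_token}") as ws:
        while True:
            print("update:", json.loads(await ws.recv()))

payload = {}  # fill with whatever schemas.ImageAnalysisCreate expects in your project
saved = requests.post("http://localhost:8000/analyse", json=payload).json()
asyncio.run(listen(saved["ws_token"]))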
I am trying to achieve streaming in Python. I have a requirement to return a huge result from a cursor through a REST API. I am using Flask's stream_with_context when returning responses, but when I request with the stream=True flag, my request waits for the response until the cursor has produced all the data.
I expect to receive the 1 to 10 elements one by one as they are yielded in the response, but my requestor in test.py waits until all elements are processed by service_runner.py.
Here are the code samples.
service_runner.py
from flask import Flask, stream_with_context, Response
import time, json

app = Flask(__name__)

@app.route('/')
def hello_world():
    return 'Hello, World!'

@app.route('/StreamData')
def StreamData():
    def stream1():
        for i in range(10):
            print(i)
            time.sleep(1)  # this is to see whether the requestor receives the stream or not
            yield json.dumps(i)
    return Response(stream_with_context(stream1()))
test.py
import requests, asyncio, aiohttp

URL = 'http://127.0.0.1:5000/StreamData'

def TestStream():
    req1 = requests.get(URL, stream=True)
    print(req1)
    for r in req1.iter_lines(chunk_size=1):
        print(r)

async def TestWithAsync():
    async with aiohttp.ClientSession() as session:
        async with session.get(URL) as resp:
            print(await resp.content.read())

def main():
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(TestWithAsync())
    event_loop.close()

if __name__ == '__main__':
    TestStream()
    main()
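One thing worth checking (a guess on my side, not a verified fix): requests.iter_lines() only hands data over once it sees a newline, and json.dumps(i) yields no newline, so the client keeps buffering; likewise aiohttp's resp.content.read() always reads the full body. A minimal variant of the generator that emits newline-delimited chunks would look like this (the extra route name is hypothetical, for illustration only):

@app.route('/StreamDataLines')  # hypothetical extra route
def StreamDataLines():
    def stream1():
        for i in range(10):
            time.sleep(1)
            yield json.dumps(i) + "\n"  # the newline lets iter_lines() release each chunk immediately
    return Response(stream_with_context(stream1()), mimetype="text/plain")

On the aiohttp side, iterating resp.content (e.g. async for chunk in resp.content) instead of calling read() would surface chunks as they arrive.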
I am trying to scrape some data from https://www.officialcharts.com/ by parallelising web requests using asyncio/aiohttp. I implemented the code given at the link here.
I followed two different procedures. The first one goes like this.
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
import time
import pandas as pd
import numpy as np
import re
import json
import requests
from datetime import date, timedelta
from IPython.display import clear_output
import memory_profiler
import spotipy
import spotipy.util as util
from more_itertools import unique_everseen

weeks = []
d = date(1970, 1, 1)
d += timedelta(days = 6 - d.weekday())
for i in range(2500):
    weeks.append(d.strftime('%Y%m%d'))
    d += timedelta(days = 7)
import asyncio
from aiohttp import ClientSession
import nest_asyncio
nest_asyncio.apply()

result = []

async def fetch(url, session):
    async with session.get(url) as response:
        return await response.read()

async def run(r):
    tasks = []
    # Fetch all responses within one Client session,
    # keep connection alive for all requests.
    async with ClientSession() as session:
        for i in range(r):
            url = 'https://www.officialcharts.com/charts/singles-chart/' + weeks[i] + '/'
            task = asyncio.ensure_future(fetch(url, session))
            tasks.append(task)
        responses = await asyncio.gather(*tasks)
        result.append(responses)

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(5))
loop.run_until_complete(future)
print('Done')
print(result[0][0] == None)
The problem with the above code is that it fails when I make more than 1000 simultaneous requests.
The author of the post implemented a different procedure to address this issue, and he claims we can make as many as 10K requests. I followed his second procedure, and here is my code for that.
import random
import asyncio
from aiohttp import ClientSession
import nest_asyncio
nest_asyncio.apply()

result = []

async def fetch(url, session):
    async with session.get(url) as response:
        delay = response.headers.get("DELAY")
        date = response.headers.get("DATE")
        print("{}:{} with delay {}".format(date, response.url, delay))
        return await response.read()

async def bound_fetch(sem, url, session):
    # Getter function with semaphore.
    async with sem:
        await fetch(url, session)

async def run(r):
    tasks = []
    # create instance of Semaphore
    sem = asyncio.Semaphore(1000)
    # Create client session that will ensure we dont open new connection
    # per each request.
    async with ClientSession() as session:
        for i in range(r):
            url = 'https://www.officialcharts.com/charts/singles-chart/' + weeks[i] + '/'
            task = asyncio.ensure_future(bound_fetch(sem, url, session))
            tasks.append(task)
        responses = await asyncio.gather(*tasks)
        result.append(responses)

number = 5
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(number))
loop.run_until_complete(future)
print('Done')
print(result[0][0] == None)
For some reason, this doesn't return any responses.
PS: I am not from a CS background and just program for fun. I have no clue what's going on inside the asyncio code.
Try to use the up-to-date asyncio syntax (Python 3.7+, where asyncio.run is available).
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from aiohttp import ClientSession, client_exceptions
from asyncio import Semaphore, ensure_future, gather, run
from json import dumps, loads

limit = 10
http_ok = [200]

async def scrape(url_list):
    tasks = list()
    sem = Semaphore(limit)
    async with ClientSession() as session:
        for url in url_list:
            task = ensure_future(scrape_bounded(url, sem, session))
            tasks.append(task)
        result = await gather(*tasks)
    return result

async def scrape_bounded(url, sem, session):
    async with sem:
        return await scrape_one(url, session)

async def scrape_one(url, session):
    try:
        async with session.get(url) as response:
            content = await response.read()
    except client_exceptions.ClientConnectorError:
        print('Scraping %s failed due to the connection problem' % url)
        return False

    if response.status not in http_ok:
        print('Scraping %s failed due to the return code %s' % (url, response.status))
        return False

    content = loads(content.decode('UTF-8'))
    return content

if __name__ == '__main__':
    urls = ['http://demin.co/echo1/', 'http://demin.co/echo2/']
    res = run(scrape(urls))
    print(dumps(res, indent=4))
This is a template from a real project that works as expected.
You can find the source code here
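As a side note, the reason the second snippet above prints None is most likely that bound_fetch never returns the value it awaits from fetch, so gather collects a list of Nones. Also, this template decodes the body as JSON; the officialcharts.com pages return HTML, so scrape_one would presumably return the raw text instead, something along these lines (a sketch, not tested against that site):

async def scrape_one(url, session):
    # hedged adaptation of the template above: the chart pages are HTML, so skip the JSON decoding
    try:
        async with session.get(url) as response:
            content = await response.text()
    except client_exceptions.ClientConnectorError:
        print('Scraping %s failed due to the connection problem' % url)
        return False

    if response.status not in http_ok:
        print('Scraping %s failed due to the return code %s' % (url, response.status))
        return False

    return content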
I've written a script in Python using asyncio in association with the aiohttp library to asynchronously parse the names out of the pop-up boxes that appear upon clicking the contact info buttons of the different agencies listed in a table on this website. The webpage displays the tabular content across 513 pages.
I encountered the error too many file descriptors in select() when I tried with asyncio.get_event_loop(), but when I came across this thread I saw a suggestion to use asyncio.ProactorEventLoop() to avoid such an error, so I used the latter. However, even after complying with the suggestion, the script collects the names from only a few pages before it throws the following error. How can I fix this?
raise client_error(req.connection_key, exc) from exc
aiohttp.client_exceptions.ClientConnectorError: Cannot connect to host www.tursab.org.tr:443 ssl:None [The semaphore timeout period has expired]
This is my try so far:
import asyncio
import aiohttp
from bs4 import BeautifulSoup

links = ["https://www.tursab.org.tr/en/travel-agencies/search-travel-agency?sayfa={}".format(page) for page in range(1,514)]
lead_link = "https://www.tursab.org.tr/en/displayAcenta?AID={}"

async def get_links(url):
    async with asyncio.Semaphore(10):
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                text = await response.text()
                result = await process_docs(text)
            return result

async def process_docs(html):
    coros = []
    soup = BeautifulSoup(html,"lxml")
    items = [itemnum.get("data-id") for itemnum in soup.select("#acentaTbl tr[data-id]")]
    for item in items:
        coros.append(fetch_again(lead_link.format(item)))
    await asyncio.gather(*coros)

async def fetch_again(link):
    async with asyncio.Semaphore(10):
        async with aiohttp.ClientSession() as session:
            async with session.get(link) as response:
                text = await response.text()
                sauce = BeautifulSoup(text,"lxml")
                try:
                    name = sauce.select_one("p > b").text
                except Exception:
                    name = ""
                print(name)

if __name__ == '__main__':
    loop = asyncio.ProactorEventLoop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(asyncio.gather(*(get_links(link) for link in links)))
In short, what the process_docs() function does is collect the data-id numbers from each page and reuse them as the AID value in the https://www.tursab.org.tr/en/displayAcenta?AID={} link to collect the names from the pop-up boxes. One such id is 8757, and one such qualified link is therefore https://www.tursab.org.tr/en/displayAcenta?AID=8757.
Btw, if I change the highest number used in the links variable to 20 or 30 or so, it goes smoothly.
async def get_links(url):
    async with asyncio.Semaphore(10):

You can't do this: it means that a new semaphore instance is created on each function call, while you need a single semaphore instance shared by all requests. Change your code this way:
sem = asyncio.Semaphore(10)  # module level

async def get_links(url):
    async with sem:
        # ...

async def fetch_again(link):
    async with sem:
        # ...
You can also return to the default loop once you're using the semaphore correctly:
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(...)
Finally, you should alter both get_links(url) and fetch_again(link) to do the parsing outside of the semaphore, so that it is released as soon as possible, before the semaphore is needed again inside process_docs(text).
Final code:
import asyncio
import aiohttp
from bs4 import BeautifulSoup

links = ["https://www.tursab.org.tr/en/travel-agencies/search-travel-agency?sayfa={}".format(page) for page in range(1,514)]
lead_link = "https://www.tursab.org.tr/en/displayAcenta?AID={}"

sem = asyncio.Semaphore(10)

async def get_links(url):
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                text = await response.text()
    result = await process_docs(text)
    return result

async def process_docs(html):
    coros = []
    soup = BeautifulSoup(html,"lxml")
    items = [itemnum.get("data-id") for itemnum in soup.select("#acentaTbl tr[data-id]")]
    for item in items:
        coros.append(fetch_again(lead_link.format(item)))
    await asyncio.gather(*coros)

async def fetch_again(link):
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(link) as response:
                text = await response.text()
    sauce = BeautifulSoup(text,"lxml")
    try:
        name = sauce.select_one("p > b").text
    except Exception:
        name = ""
    print(name)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*(get_links(link) for link in links)))
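One more thing that might be worth trying (my own suggestion, not part of the answer above): a new ClientSession is opened for every page and every pop-up request, and aiohttp generally recommends reusing a single session. A sketch of that pattern, passing the session down explicitly and reusing the links, lead_link and sem definitions from the code above:

async def get_links(url, session):
    async with sem:
        async with session.get(url) as response:
            text = await response.text()
    return await process_docs(text, session)

async def process_docs(html, session):
    soup = BeautifulSoup(html, "lxml")
    ids = [tr.get("data-id") for tr in soup.select("#acentaTbl tr[data-id]")]
    await asyncio.gather(*(fetch_again(lead_link.format(i), session) for i in ids))

async def fetch_again(link, session):
    async with sem:
        async with session.get(link) as response:
            text = await response.text()
    # select_one may return None, so fall back to an empty name
    name = getattr(BeautifulSoup(text, "lxml").select_one("p > b"), "text", "")
    print(name)

async def main():
    # one shared session for all requests
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(get_links(link, session) for link in links))

if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())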
First of all, here's the code:
import random
import asyncio
from aiohttp import ClientSession
import csv

headers = []

def extractsites(file):
    sites = []
    readfile = open(file, "r")
    reader = csv.reader(readfile, delimiter=",")
    raw = list(reader)
    for a in raw:
        sites.append((a[1]))
    return sites

async def fetchheaders(url, session):
    async with session.get(url) as response:
        responseheader = await response.headers
        print(responseheader)
        return responseheader

async def bound_fetch(sem, url, session):
    async with sem:
        print("doing request for " + url)
        await fetchheaders(url, session)

async def run():
    urls = extractsites("cisco-umbrella.csv")
    tasks = []
    # create instance of Semaphore
    sem = asyncio.Semaphore(100)
    async with ClientSession() as session:
        for i in urls:
            task = asyncio.ensure_future(bound_fetch(sem, "http://" + i, session))
            tasks.append(task)
        return tasks

def main():
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(run())
    loop.run_until_complete(future)

if __name__ == '__main__':
    main()
Most of this code was taken from this blog post:
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
Here is the problem I'm facing: I am trying to read a million URLs from a file and then make an async request for each of them.
But when I try to execute the code above I get the Session expired error.
This is my line of thought:
I am relatively new to async programming so bear with me.
My thought process was to create a long task list (that only allows 100 parallel requests), which I build in the run function, and then pass as a future to the event loop to execute.
I have included a print debug in bound_fetch (which I copied from the blog post), and it looks like it loops over all the URLs I have, and as soon as it should start making requests in the fetchheaders function I get the runtime errors.
How do I fix my code ?
A couple things here.
First, in your run function you actually want to gather the tasks there and await them to fix your session issue, like so:
async def run():
    urls = ['google.com', 'amazon.com']
    tasks = []
    # create instance of Semaphore
    sem = asyncio.Semaphore(100)
    async with ClientSession() as session:
        for i in urls:
            task = asyncio.ensure_future(bound_fetch(sem, "http://" + i, session))
            tasks.append(task)
        await asyncio.gather(*tasks)
Second, the aiohttp API is a little odd in dealing with headers, in that you can't await them. I worked around this by awaiting the body so that the headers are populated, and then returning the headers:
async def fetchheaders(url, session):
    async with session.get(url) as response:
        data = await response.read()
        responseheader = response.headers
        print(responseheader)
        return responseheader
There is some additional overhead here in pulling the body, however; I couldn't find another way to load the headers without doing a body read.
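Putting both fixes together, a minimal end-to-end sketch might look like the following (keeping the cisco-umbrella.csv input and the 100-task semaphore from the original; treat it as an illustration rather than a drop-in script):

import asyncio
import csv
from aiohttp import ClientSession

def extractsites(path):
    # the site name sits in the second CSV column, as in the original extractsites
    with open(path, newline="") as f:
        return [row[1] for row in csv.reader(f)]

async def fetchheaders(url, session, sem):
    async with sem:
        async with session.get(url) as response:
            await response.read()        # read the body so the headers workaround above applies
            return response.headers

async def run(path):
    sem = asyncio.Semaphore(100)         # at most 100 requests in flight
    async with ClientSession() as session:
        tasks = [fetchheaders("http://" + site, session, sem) for site in extractsites(path)]
        # return_exceptions=True keeps one bad host from cancelling the whole batch
        return await asyncio.gather(*tasks, return_exceptions=True)

if __name__ == "__main__":
    all_headers = asyncio.get_event_loop().run_until_complete(run("cisco-umbrella.csv"))
    print(len(all_headers))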