Timeout and count control with asyncio - python

I am trying to use an asynchronous script for parsing with asyncio. I found a similar question and took this answer as a pattern for my task. I added latency between requests (the first part of the linked answer) and also tried to add a counter of active requests (the second part). But this code launches just 5 requests and then simply waits.
I haven't found a good explanation of how asyncio.Event works, so I would like to ask you to help me improve my code. Thank you in advance.
import json
from bs4 import BeautifulSoup
import asyncio
import aiohttp

active_calls = 0
MAX_CALLS = 5

def write_to_txt_file(text, name):
    f = open(f'{PATH}{name}.txt', 'w')
    f.write(text)
    f.close()

async def getData(item, session, next_delay, event):
    global active_calls, next_delay
    await event.wait()
    if active_calls > MAX_CALLS - 1:
        event.clear()
    next_delay = 0.1
    print('start', active_calls)
    active_calls += 1
    next_delay += DELAY
    await asyncio.sleep(next_delay)
    try:
        async with session.get(url=item['Link']) as response:
            soup = BeautifulSoup(await response.text(), 'html.parser')
            name = str(item["ID"]) + '. ' + item["Title"][:100]
            text = soup.find(id="begin").get_text()
            write_to_txt_file(text, name)
    finally:
        active_calls -= 1
        if active_calls == 0:
            event.set()

async def parseFromJson():
    with open('./data2/csvjson.json') as data_file:  # take links from JSON
        data = json.load(data_file)
    async with aiohttp.ClientSession() as session:
        tasks = []
        event = asyncio.Event()
        event.set()
        next_delay = 0.1
        DELAY = 0.3
        for item in data:
            task = asyncio.create_task(getData(item, session, next_delay, event))
            next_delay += DELAY
            tasks.append(task)
        await asyncio.gather(*tasks)

def main():
    asyncio.run(parseFromJson())

if __name__ == "__main__":
    main()
UPD: As far as I can tell, this code can only stop requests but never starts them again?
UPD: I have changed my code and now it works like this:
1. All getData() functions are launched at the same time.
2. The script starts 5 requests and stops the rest of them on the 'await event.wait()' line.
3. All requests finish and I call 'event.set()'.
4. After this, all functions continue working and the rest of the requests start (without the limitation of 5 requests).
How can I fix it?

So I just added a loop so that every function re-checks the wait condition each time. It doesn't seem correct, but it helped me:
while active_calls > MAX_CALLS:
    print(1)
    await event.wait()
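For reference, the idiomatic way to cap the number of in-flight requests is asyncio.Semaphore rather than an asyncio.Event plus a counter. A minimal sketch of that approach (the item['Link'] field comes from the question; the rest is illustrative and not the original code):

import asyncio
import aiohttp

MAX_CALLS = 5  # at most 5 requests in flight at any moment

async def get_data(item, session, semaphore):
    # The semaphore suspends the coroutine here once MAX_CALLS
    # coroutines are already inside the block.
    async with semaphore:
        async with session.get(item['Link']) as response:
            return await response.text()

async def parse_all(data):
    semaphore = asyncio.Semaphore(MAX_CALLS)
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(get_data(item, session, semaphore)) for item in data]
        return await asyncio.gather(*tasks)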

Related

How to run 2 APIs simultaneously and one is dependent on the other

I am trying to run two loops simultaneously. The second loop depends on the first one's output and needs to fetch its input from the ids list, so it should not have to wait for the first one to finish. I tried to do it with multiple libraries and methods but failed to find the optimal structure for that.
import time
import pandas as pd
import requests
import json
from matplotlib import pyplot
import seaborn as sns
import numpy as np

API_KEY = ''
df = pd.read_csv('lat_long file')

# get name and information of each place
id = df['id']
lat = df['latitude']
lon = df['longitude']

ids = []
loc = []
unit = []

print('First API now running')

def get_details(lat, lon):
    try:
        url = "https://maps.googleapis.com/maps/api/geocode/json?latlng=" + str(lat) + ',' + str(lon) + '&key=' + API_KEY
        response = requests.get(url)
        data = json.loads(response.text)
        ids.append(data['results'][0]['place_id'])
    except Exception as e:
        print('This code could NOT be run because of', e)
    return data

def get_deta(ids):
    url1 = "https://maps.googleapis.com/maps/api/place/details/json?language=en-US&placeid=" + str(ids) + "&key=" + API_KEY
    responsedata = requests.get(url1)
    data2 = json.loads(responsedata.text)
    if 'business_status' in data2['result'].keys():
        loc.append((data2['result']['business_status']))
    else:
        loc.append('0')
    flag = False
    if data2['result']:
        for level in data2['result']['address_components']:
            # if len(level['types']) > 1:
            if level['types'][0] == 'premise':
                flag = True
                unit.append(level['long_name'][4:])
    else:
        print(data2)
    if not flag:
        unit.append('0')
    return data2

def loop1():
    for i in range(len(id)):
        get_details(lat[i], lon[i])
    return

print('Seconed API now running')

def loop2():
    # printing and appending addresses to use them with the next API
    for i in range(50):
        get_deta(ids[i])
    return

loop1()
loop2()
It is not very clear what you are trying to achieve here. How exactly does the second API depend on the first?
To achieve concurrency you could use the asyncio library, which is designed to perform concurrent network requests efficiently. However, the requests library you are using is synchronous, so you must switch to an asynchronous one such as aiohttp.
Given that, you can communicate between two concurrent tasks using asyncio.Queue. Here is a draft of what your program could look like:
import asyncio
import aiohttp
import json

API_KEY = ''  # your API key, as in the question

async def get_details(lat, lon, session: aiohttp.ClientSession, id_queue: asyncio.Queue):
    url: str = f"https://maps.googleapis.com/maps/api/geocode/json?latlng={lat},{lon}&key={API_KEY}"
    async with session.get(url) as response:
        data = json.loads(await response.text())
        await id_queue.put(data['results'][0]['place_id'])

async def get_data(id, session: aiohttp.ClientSession, loc_queue: asyncio.Queue):
    # Network request and JSON decoding for the place-details API go here;
    # `data` below stands for the decoded response.
    ...
    await loc_queue.put(data['result']['business_status'])

async def loop_1(coords, session: aiohttp.ClientSession, id_queue: asyncio.Queue):
    await asyncio.gather(
        *[get_details(lat, lon, session, id_queue) for lat, lon in coords]
    )

async def loop_2(session: aiohttp.ClientSession, id_queue: asyncio.Queue, loc_queue: asyncio.Queue):
    while True:
        id = await id_queue.get()
        await get_data(id, session, loc_queue)

async def main():
    id_queue = asyncio.Queue(maxsize=100)
    loc_queue = asyncio.Queue(maxsize=100)
    coords = []  # (lat, lon) pairs read from your CSV
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            loop_1(coords, session, id_queue),
            loop_2(session, id_queue, loc_queue),
        )

if __name__ == "__main__":
    asyncio.run(main())
I simplified your code for the purpose of the example. If you take a look at the main() function, the two loops are executed concurrently with asyncio.gather(). The first loop gets the details of all places concurrently (again with asyncio.gather) and feeds a shared queue, id_queue. The second loop waits for new ids to come up in the queue and processes them with the second API as soon as they are available. It then enqueues the results in a final queue, loc_queue.
You could extend this program by adding a third API plugged into this last queue and continuing the processing.
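One caveat with this draft: loop_2 runs in a while True loop, so asyncio.gather() in main() never returns on its own. A common pattern (a sketch under that assumption, not part of the original answer) is to push a sentinel value such as None into the queue once the producer has finished:

import asyncio
import aiohttp

# get_data and loop_1 as defined in the draft above.

async def loop_2(session, id_queue, loc_queue):
    while True:
        place_id = await id_queue.get()
        if place_id is None:      # sentinel: the producer is done
            break
        await get_data(place_id, session, loc_queue)

async def main():
    id_queue = asyncio.Queue(maxsize=100)
    loc_queue = asyncio.Queue(maxsize=100)
    coords = []  # (lat, lon) pairs read from your CSV
    async with aiohttp.ClientSession() as session:
        consumer = asyncio.create_task(loop_2(session, id_queue, loc_queue))
        await loop_1(coords, session, id_queue)  # produce all place ids
        await id_queue.put(None)                 # signal completion
        await consumer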

(Python) How can I apply asyncio in while loop with accumulator?

I have a block of code that works well for fetching data from a specific site's API. The issue is that the site only gives me 50 objects per call, so I have to make multiple calls. As a result, it takes me too long to finish the fetching (sometimes I have to wait nearly 20 minutes). Here is my code:
import concurrent.futures
import requests

supply = 3000
offset = 0
token_ids = []

while offset < supply:
    url = "url_1" + str(offset)
    response = requests.request("GET", url)
    a = response.json()
    assets = a["assets"]

    def get_token_ids(an):
        if str(an['sell_orders']) == 'None' and str(an['last_sale']) == 'None' and str(an['num_sales']) == '0':
            token_ids.append(str(an['token_id']))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = [executor.submit(get_token_ids, asset) for asset in assets]

    offset += 50

print(token_ids)
The problem is that the code runs through and waits for all actions to finish before making another request. I am thinking of an improvement: when a request is sent, the offset value gets incremented and the loop proceeds to the next request, so I don't have to wait. I don't know how to do it; I studied asyncio, but it is still a challenge for me. Can anyone help me with this?
The problem is that Requests is not asynchronous code, so each of its network calls blocks the loop until its completion.
https://docs.python-requests.org/en/latest/user/advanced/#blocking-or-non-blocking
Therefore, it is better to try asynchronous libraries, for example, aiohttp:
https://github.com/aio-libs/aiohttp
Example
Create a session for all connections:
async with aiohttp.ClientSession() as session:
and run all desired requests:
results = await asyncio.gather(
    *[get_data(session, offset) for offset in range(0, supply, step)]
)
Here the requests are executed concurrently: session.get(url) retrieves only the response headers, and the body is fetched with await response.json():
async with session.get(url) as response:
    a = await response.json()
And in the main block the event loop is started:
loop = asyncio.get_event_loop()
token_ids = loop.run_until_complete(main())
loop.close()
The full code
import aiohttp
import asyncio

async def get_data(session, offset):
    token_ids = []
    url = "url_1" + str(offset)
    async with session.get(url) as response:
        # For tests:
        # print("Status:", response.status)
        # print("Content-type:", response.headers['content-type'])
        a = await response.json()
        assets = a["assets"]
        for asset in assets:
            if str(asset['sell_orders']) == 'None' and str(asset['last_sale']) == 'None' and str(asset['num_sales']) == '0':
                token_ids.append(str(asset['token_id']))
    return token_ids

async def main():
    supply = 3000
    step = 50
    token_ids = []
    # Create session for all connections and pass it to "get_data" function
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(
            *[get_data(session, offset) for offset in range(0, supply, step)]
        )
    for ids in results:
        token_ids.extend(ids)
    return token_ids

if __name__ == "__main__":
    # asynchronous code starts here
    loop = asyncio.get_event_loop()
    token_ids = loop.run_until_complete(main())
    loop.close()
    # asynchronous code ends here
    print(token_ids)
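As a side note, on Python 3.7+ the manual loop management at the end can be replaced with asyncio.run(), which creates and closes the event loop for you:

if __name__ == "__main__":
    token_ids = asyncio.run(main())
    print(token_ids)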

How to concurrently run asynchronous launchers/concurrently run asyncio functions in a big manager

I am trying to make my code run faster for finding Roblox account names. I tried using larger and larger event loops (they basically took the previous event manager and used that to make a larger event manager), but that resulted in the same, if not worse, performance compared to using just a single small event loop.
This code was supplied in another question of mine (with modifications from me here). It works great, but it can still take a good few minutes to handle larger quantities of accounts. Usually I wouldn't care, but I am trying to get to 100,000 accounts, so I need performance. Is this just how fast it can go, or can it be pushed even further? Is the answer just more CPU/memory? Better internet? Do I need network programming at all, or is there a faster, request-free way?
Code:
import asyncio
import aiohttp

async def find_account(url, session, id):
    try:
        async with session.get(url) as response:
            if response.status == 200:
                r = await response.read()
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(r, 'html.parser')
                h2 = []
                for i in soup.find_all('h2'):
                    h2.append(i)
                print('Done')
                return str(list(list(h2)[0])[0]) + ' ' + str(url)
            else:
                return 'This account does not exist ID: {}'.format(id)
    except aiohttp.ServerDisconnectedError:
        print('Done')
        return find_account(url, session, id)

async def main(min_id, max_id):
    tasks = []
    async with aiohttp.ClientSession() as session:
        for id in range(min_id, max_id):
            url = f'https://web.roblox.com/users/{str(id)}/profile'
            tasks.append(asyncio.create_task(find_account(url=url, session=session, id=id)))
        return await asyncio.gather(*tasks)

from time import time

loop = asyncio.get_event_loop()
starting = int(input("Type Your Starting Id Number>> "))
ending = int(input("Type Your Ending Id Number>> "))
timer = time()
users = loop.run_until_complete(main(starting, ending))
users = [i for i in users if i != '1']
print(users)
print(time() - timer)
You could run BeautifulSoup in multiple processes to speed it up. For example, you can extract the part of find_account that does the parsing and pass that to a process pool executor:
import asyncio
import concurrent.futures

import aiohttp

_pool = concurrent.futures.ProcessPoolExecutor()

def parse(html):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    h2 = []
    for i in soup.find_all('h2'):
        h2.append(i)
    return str(list(list(h2)[0])[0])

async def find_account(url, session, id):
    while True:
        try:
            async with session.get(url) as response:
                if response.status == 200:
                    r = await response.read()
                    loop = asyncio.get_event_loop()
                    extracted = await loop.run_in_executor(_pool, parse, r)
                    print('Done')
                    return extracted + ' ' + str(url)
                else:
                    return 'This account does not exist ID: {}'.format(id)
        except aiohttp.ServerDisconnectedError:
            print('Done')
            # keep looping
On an unrelated note, your recursive call to find_account() was incorrect because it was missing an await. The above code fixes that and switches to a loop instead, which makes it a bit more explicit that the code is in fact looping.
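If you do push this toward 100,000 accounts, also keep in mind that aiohttp's TCPConnector limits a session to 100 simultaneous connections by default; you can tune that explicitly. A rough sketch reusing find_account from above (the limit value here is illustrative, not a recommendation):

import asyncio
import aiohttp

async def main(min_id, max_id):
    # Cap simultaneous connections; aiohttp's default limit is 100.
    connector = aiohttp.TCPConnector(limit=100)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [asyncio.create_task(
                     find_account(f'https://web.roblox.com/users/{id}/profile', session, id))
                 for id in range(min_id, max_id)]
        return await asyncio.gather(*tasks)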

How to avoid error 429 (Too Many Requests) python with Asyncio

I am using the following code to make requests with the aiohttp client. The server that I am sending requests to has a limit of 30k requests per hour per IP, so I am getting a 429 Too Many Requests error. I want to put the job to sleep whenever it hits the limit.
I can extract x_rateLimit_reset from the header, so I thought I could use it to put the job to sleep, but I observed very strange behavior. Sometimes the sleep time becomes negative and sometimes the job gets stuck in sleeping mode.
For example, the last time I ran the job, it first slept for 2000 seconds and then, after that time had passed, it tried to sleep for another 2500 seconds and got stuck in sleeping mode. I think maybe the other parallel processes caused the issue, so I was wondering how to deal with the Too Many Requests error when using asyncio.
@backoff.on_exception(backoff.expo,
                      (asyncio.TimeoutError, aiohttp.client_exceptions.ServerDisconnectedError, TooManyRequests),
                      max_time=300)
async def fetch(self, url, session, params):
    try:
        async with session.get(url, params=params) as response:
            now = int(time.time())
            print(response)
            output = await response.read()
            output = json.loads(output)
            if 'X-RateLimit-Remaining' in response.headers:
                rate = response.headers['X-RateLimit-Remaining']
            if 'status' in output and output['status'] == 429:
                x_rateLimit_reset = int(response.headers['X-RateLimit-Reset'])
                print("sleep mode")
                seconds = x_rateLimit_reset - now
                LOGGER.info("The job will sleep for {} seconds".format(seconds))
                time.sleep(max(seconds, 0))
                raise TooManyRequests()
            return output
    except (asyncio.TimeoutError, TypeError, json.decoder.JSONDecodeError,
            aiohttp.client_exceptions.ServerDisconnectedError) as e:
        print(str(e))

async def bound_fetch(self, sem, url, session, params):
    # Getter function with semaphore.
    async with sem:
        output = await self.fetch(url, session, params)
        return {"url": url, "output": output}
Edited:
This is how I initiate bound_fetch and define the URLs:
def get_responses(self, urls, office_token, params=None):
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(self.run(office_token, urls, params))
    responses = loop.run_until_complete(future)
    return responses

async def run(self, office_token, urls, params):
    tasks = []
    # create instance of Semaphore
    sem = asyncio.BoundedSemaphore(200)
    timeout = ClientTimeout(total=1000)
    async with ClientSession(auth=BasicAuth(office_token, password=' '), timeout=timeout,
                             connector=TCPConnector(ssl=False)) as session:
        for url in urls:
            # pass Semaphore and session to every GET request
            task = asyncio.ensure_future(self.bound_fetch(sem, url, session, params))
            tasks.append(task)
        responses = await asyncio.gather(*tasks)
    return responses

urls = [
    "{}/{}".format(self.base_url, "{}?page={}&api_key={}".format(object_name, page_number, self.api_keys))
    for page_number in range(batch * chunk_size + 1, chunk_size * (1 + batch) + 1)
]
The main reason is that you are using time.sleep() instead of await asyncio.sleep(). time.sleep() blocks the whole event loop, so every other task freezes with it.
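The minimal change in fetch() is to replace the blocking call with the awaitable one (the names below are taken from the question's code):

# Inside fetch(): pause only this coroutine, not the whole event loop.
if 'status' in output and output['status'] == 429:
    x_rateLimit_reset = int(response.headers['X-RateLimit-Reset'])
    seconds = max(x_rateLimit_reset - now, 0)
    LOGGER.info("The job will sleep for {} seconds".format(seconds))
    await asyncio.sleep(seconds)  # was: time.sleep(max(seconds, 0))
    raise TooManyRequests()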
UPDATE
Here is a minimal working solution and some comments on how it works.
Please use it to adapt your solution.
Also take a look at asyncio-throttle.
import aiohttp
import asyncio
from datetime import datetime

async def fetch(session, task):  # fetch the url and mark the result of execution
    async with session.get(task['url']) as response:
        if response.status != 200:
            # response.raise_for_status()
            # Here you need to somehow handle the 429 code if it occurs.
            # In my example I just skip it.
            task['result'] = response.status
            task['status'] = 'done'
        await response.text()  # just to be sure we acquire data
        print(f"{str(datetime.now())}: Got result of {task['url']}")  # logging
        task['result'] = response.status
        task['status'] = 'done'

async def fetch_all(session, urls, persecond):
    # convert to a list of dicts
    url_tasks = [{'url': i, 'result': None, 'status': 'new'} for i in urls]
    n = 0  # counter
    while True:
        # calc how many tasks are fetching right now
        running_tasks = len([i for i in url_tasks if i['status'] in ['fetch']])
        # calc how many tasks still need to be executed
        is_tasks_to_wait = len([i for i in url_tasks if i['status'] != 'done'])
        # check we are not at the end of the list (n < len())
        # check we have room for one more task
        if n < len(url_tasks) and running_tasks < persecond:
            url_tasks[n]['status'] = 'fetch'
            #
            # Here is the main trick:
            # if you schedule a task inside a running loop,
            # it will start to execute sync code until it finds some await
            #
            asyncio.create_task(fetch(session, url_tasks[n]))
            n += 1
            print(f'Schedule tasks {n}. '
                  f'Running {running_tasks} '
                  f'Remain {is_tasks_to_wait}')
        # Check the per-second constraint and wait a sec (or period)
        if running_tasks >= persecond:
            print('Throttling')
            await asyncio.sleep(1)
        #
        # Here is another main trick:
        # to keep asyncio.run (or loop.run_until_complete) executing,
        # we need to wait a little, then check that all tasks are done,
        # and wait again, and so on
        if is_tasks_to_wait != 0:
            await asyncio.sleep(0.1)  # wait until all tasks are done
        else:
            # All tasks done
            break
    return url_tasks

async def main():
    urls = ['http://google.com/?1',
            'http://google.com/?2',
            'http://google.com/?3'] * 3
    async with aiohttp.ClientSession() as session:
        res = await fetch_all(session, urls, 3)
        print(res)

if __name__ == '__main__':
    asyncio.run(main())
    # asyncio.run cancels all pending tasks (we do not have any,
    # because we checked that all tasks are done)
    # asyncio.run awaits the cancellation of all tasks
    # asyncio.run stops the loop
    # exit program
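The asyncio-throttle package mentioned above wraps this kind of bookkeeping in a small context manager. A minimal sketch assuming its Throttler API (check the project's README for the exact signature):

import asyncio
import aiohttp
from asyncio_throttle import Throttler  # pip install asyncio-throttle

async def main():
    urls = ['http://google.com/?1', 'http://google.com/?2', 'http://google.com/?3'] * 3
    throttler = Throttler(rate_limit=3, period=1.0)  # assumed: at most 3 requests per second
    async with aiohttp.ClientSession() as session:

        async def fetch(url):
            async with throttler:  # waits until a slot is free
                async with session.get(url) as response:
                    await response.text()
                    return response.status

        print(await asyncio.gather(*[fetch(u) for u in urls]))

if __name__ == '__main__':
    asyncio.run(main())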

Asyncio not running things in parallel

I am running this asyncio example
import asyncio
from aiohttp import ClientSession

async def fetch(url, session, index):
    async with session.get(url) as response:
        print("Before " + str(index))
        buffer = await response.read()
        print("After " + str(index))

async def run(r):
    url = "http://google.com"
    tasks = []
    # Fetch all responses within one Client session,
    # keep connection alive for all requests.
    async with ClientSession() as session:
        for i in range(r):
            task = asyncio.ensure_future(fetch(url, session, i))
            tasks.append(task)
        responses = await asyncio.gather(*tasks)
        # you now have all response bodies in this variable
        print(responses)

def print_responses(result):
    print(result)

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(4))
loop.run_until_complete(future)
The results are something like this
Before 1
After 1
Before 3
After 3
Before 2
After 2
Before 0
After 0
This looks like things are running serially. The fetch part itself takes some milliseconds. I ran it several times, but the results are never intermingled. Any suggestions?
As mentioned in the comments, it's likely that by the time response.read() is called, there is nothing left to download, so the coroutine isn't suspended and print("After " + str(index)) is called immediately.
To see more interleaving, you can choose a slower URL. For example, for me
url = "http://speedtest.tele2.net/100MB.zip"
outputs
Before 0
Before 1
Before 3
Before 2
After 0
After 1
After 2
After 3
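If you want to see the interleaving without relying on a slow external server, you can stand in for the download with asyncio.sleep (purely illustrative, not the original code):

import asyncio
import random

async def fetch(index):
    print("Before " + str(index))
    await asyncio.sleep(random.uniform(0.1, 0.5))  # stand-in for a slow response.read()
    print("After " + str(index))

async def run(r):
    await asyncio.gather(*(fetch(i) for i in range(r)))

asyncio.run(run(4))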
