Unable to asynchronously make requests uing aiohttp and asyncio - python

I have written a script where draft orders are created in shopify and then used the id of the first request for the url of the second request which involves completing the order. Below is just a portion of the script that I wrote. However, there are delays that are happening when response is being fetched. And , because of this the draft order only sometimes get completed:
url=f"https://{shop_url}/admin/api/{api_version}/draft_orders.json"
headers = {"X-Shopify-Access-Token": private_app_password}
counter = count(start=1)
for _ in range(number_of_orders):
order = get_order(
line_items_list, locale="en_US", country="United States"
)
response = requests.post(url, json=order, headers=headers)
data = response.json()
# complete order
url2 = f"https://{shop_url}/admin/api/{api_version}/draft_orders/{data['draft_order']['id']}/complete.json"
requests.put(url2,headers=headers)
The problem seems to have to do with the delay that happens when the first response is fetched. Hence why I tried to wrap my API calls in asycn to fetch but the same thing is still occuring. The portion of the script is given below:
async def fetch(session, url,order,headers):
async with session.post(url,headers=headers,json=order) as response:
return await response.json()
async def get_draft_order(url,order,headers):
async with aiohttp.ClientSession() as session:
data = await fetch(session,url,order,headers)
url2 = f"https://{shop_url}/admin/api/{api_version}/draft_orders/{data['draft_order']['id']}/complete.json"
await session.put(url2,headers=headers,json=data)
def create_orders():
# POST request
url = f"https://{shop_url}/admin/api/{api_version}/draft_orders.json"
headers = {"X-Shopify-Access-Token": private_app_password}
counter = count(start=1)
for _ in range(number_of_orders):
order = get_order(
line_items_list, locale="en_US", country="United States"
)
asyncio.run(get_draft_order(url,order,headers))
Could someone help me understand what is wrong with the way I have implemented it. The second request depends on the id of the first request.

Related

Asyncio Sleep Does not seem to Satisfy Rate Limits for certain status codes

I am working with an API that limits to 4 requests/s. I am using asyncio and aiohttp to make asynchronous http requests. I am using Windows.
When working with the API I receive 3 status codes most commonly, 200, 400 & 429.
The 4/s issue works entirely fine when seeing many 400s but soon as I receive a 200 and try to resolve the json using response.json I begin to receive 429s for too many requests. I am trying to understand why something like this would occur.
EDIT: After doing some logging, I am seeing that the 429s appear to creep up after I have a response that takes longer than a second to complete (in the case of a 200, a large JSON response might take a bit of time to resolve). It appears that after a > 1s request elapsed time request occurs, followed by a fast one (which takes a few ms) the requests sort of "jump" ahead too quickly and overwhelm the API with more than 4 requests resolving in a second.
I am utilizing a semaphore of size 3 (trying 4 hits 429 way more often). The workflow is generally:
1. Create event loop
2. Gather tasks
3. Create http session and begin async requests with our Semaphore.
4. _fetch() is handling the specific asynchronous requests.
I am trying to understand why that when I receive 200s (which requires some JSON serialization and likely adds some latency). If I am always awaiting a sleep call of 1.5 seconds per call, why am I still able to hit rate limits? Is this fault of the API I am hitting or is there something intrinsically wrong with my async-await calls.
Below is my code:
import asyncio
import aiohttp
import time
class Request:
def __init__(self, url: str, method: str="get", payload: str=None):
self.url: str = url
self.method: str = method
self.payload: str or dict = payload or dict()
class Response:
def __init__(self, url: str, status: int, payload: dict=None, error: bool=False, text: str=None):
self.url: str = url
self.status: int = status
self.payload: dict = payload or dict()
self.error: bool = error
self.text: str = text or ''
def make_requests(headers: dict, requests: list[Request]) -> asyncio.AbstractEventLoop:
"""
requests is a list with data necessary to make requests
"""
loop: asyncio.AbstractEventLoop = asyncio.get_event_loop()
responses: asyncio.AbstractEventLoop = loop.run_until_complete(_run(headers, requests))
return responses
async def _run(headers: dict, requests: list[Request]) -> "list[Response]":
# Create a semaphore to limit how many concurrent thread processes we can run (MAXIMUM) at a time.
semaphore: asyncio.Semaphore = asyncio.Semaphore(3)
time.sleep(10) # wait 10 seconds before beginning our async requests
async with aiohttp.ClientSession(headers=headers) as session:
tasks: list[asyncio.Task] = [asyncio.create_task(_iterate(semaphore, session, request)) for request in requests]
responses: list[Response] = await asyncio.gather(*tasks)
return responses
async def _iterate(semaphore: asyncio.Semaphore, session: aiohttp.ClientSession, request: Request) -> Response:
async with semaphore:
return await _fetch(session, request)
async def _fetch(session: aiohttp.ClientSession, request: Request) -> Response:
try:
async with session.request(request.method, request.url, params=request.payload) as response:
print(f"NOW: {time.time()}")
print(f"Response Status: {response.status}.")
content: dict = await response.json()
response.raise_for_status()
await asyncio.sleep(1.5)
return Response(request.url, response.status, payload=content, error=False)
except aiohttp.ClientResponseError:
if response.status == 429:
await asyncio.sleep(12) # Back off before proceeding with more requests
return await _fetch(session, request)
else:
await asyncio.sleep(1.5)
return Response(request.url, response.status, error=True)
The 4/s issue works entirely fine when seeing many 400s, but soon as I
receive a 200 and try to resolve the JSON using response.json, I begin
to receive 429s for too many requests. I am trying to understand why
something like this would occur.
The response status does not depend on how often you call the .json method on responses. The cause can be the security of the server API is running on. At the debugging time, I had to optimize the make_requests to make it more readable.
import asyncio
import aiohttp
class Request:
def __init__(self, url: str, method: str = "get", payload: str = None):
self.url: str = url
self.method: str = method
self.payload: str or dict = payload or dict()
class Response:
def __init__(self, url: str, status: int, payload: dict = None, error: bool = False, text: str = None):
self.url: str = url
self.status: int = status
self.payload: dict = payload or dict()
self.error: bool = error
self.text: str = text or ''
async def make_requests(headers: dict, requests: "list[Request]"):
"""
This function makes concurrent requests with a semaphore.
:param headers: Main HTTP headers to use in the session.
:param requests: A list of Request objects.
:return: List of responses converted to Response objects.
"""
async def make_request(request: Request) -> Response:
"""
This closure makes limited requests at the time.
:param request: An instance of Request that describes HTTP request.
:return: A processed response.
"""
async with semaphore:
try:
response = await session.request(request.method, request.url, params=request.payload)
content = await response.json()
response.raise_for_status()
return Response(request.url, response.status, payload=content, error=False)
except (aiohttp.ClientResponseError, aiohttp.ContentTypeError, aiohttp.ClientError):
if response.status == 429:
return await make_request(request)
return Response(request.url, response.status, error=True)
semaphore = asyncio.Semaphore(3)
curr_loop = asyncio.get_running_loop()
async with aiohttp.ClientSession(headers=headers) as session:
return await asyncio.gather(*[curr_loop.create_task(make_request(request)) for request in requests])
if __name__ == "__main__":
HEADERS = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:101.0) Gecko/20100101 Firefox/101.0"
}
REQUESTS = [
Request("https://www.google.com/search?q=query1"),
Request("https://www.google.com/search?q=query2"),
Request("https://www.google.com/search?q=query3"),
Request("https://www.google.com/search?q=query4"),
Request("https://www.google.com/search?q=query5"),
]
loop = asyncio.get_event_loop()
responses = loop.run_until_complete(make_requests(HEADERS, REQUESTS))
print(responses) # [<__main__.Response object at 0x7f4f73e5da30>, <__main__.Response object at 0x7f4f73e734f0>, <__main__.Response object at 0x7f4f73e73790>, <__main__.Response object at 0x7f4f73e5d9d0>, <__main__.Response object at 0x7f4f73e73490>]
loop.close()
If you get 400s after some count of requests, you need to check what headers are sent by the browser that is missed in your request.
I am trying to understand why when I receive 200s (which requires some
JSON serialization and it likely adds some latency). If I am always
awaiting a sleep call of 1.5 seconds per call, why am I still able to
hit rate limits? Is this fault of the API I am hitting, or is there
something intrinsically wrong with my async-await calls?
I'm not sure what you meant by saying "to be able to hit rate limits", but asyncio.sleep should work properly. The script makes the first limited count of concurrent requests (in this case, semaphore allows three concurrent tasks) almost at the same time. After a request is received, it waits for 1.5 sec concurrently and returns the result of the task. The key is concurrency. If you wait with asyncio.sleep for 1.5 sec in 3 different tasks, it will wait 1.5 sec but not 4.5. If you wanted to set delays between requests, you could wait before or after calling the create_task.

Why I can't fetch responses from aiohttp in cycle? Exception: ValueError: I/O operation on closed file + aiohttp.payload.LookupError

When I tried to get response from each api_key in cycle I'm always confused with two errors - ValueError: I/O operation on closed file AND aiohttp.payload.LookupError...
async def get_api_response(photo_bytes, api_key, language):
async with aiohttp.ClientSession() as session:
async with session.post(URL_API,
data={
"picture.png": photo_bytes,
"apikey": api_key,
"language": language
}) as response:
return await response.text()
It`s async function for async requests to API
for api_key in API_KEYS:
response = json.loads(await get_api_response(photo_bytes=photo_bytes, api_key=api_key, language=photo_lang))
print(response)
And this is cycle with results ↑
But when I tried to implement cycle in the function - gives the same result. But when I call function without cycle - it's work!
How can this be fixed with minimal changes?
UPD: DURING RESEARCH I'M NOTICED THAT IT'S OCCURES WHEN AFTER FIRST CYCLE PHOTO_BYTES CLOSE UP.
Here is init photo_bytes:
photo = cv2.imread(photo_path)
unused_var, compressed_image = cv2.imencode('.png', photo, [1, 90])
photo_bytes = io.BytesIO(compressed_image)
Like something like this
with io.BytesIO(compressed_image) as f:
photo_bytes = f.read()
I had similar issue in which I pass one BytesIO object to multiple request.
The problem seem to be that aiohttp closes the BytesIO object when it finished using it, so we can't reuse it with another request.
The solution is to create BytesIO object every time I pass it to request method.

Concurrent HTTP and SQL requests using async Python 3

first time trying asyncio and aiohttp.
I have the following code that gets urls from the MySQL database for GET requests. Gets the responses and pushes them to MySQL database.
if __name__ == "__main__":
database_name = 'db_name'
company_name = 'company_name'
my_db = Db(database=database_name) # wrapper class for mysql.connector
urls_dict = my_db.get_rest_api_urls_for_specific_company(company_name=company_name)
update_id = my_db.get_updateid()
my_db.get_connection(dictionary=True)
for url in urls_dict:
url_id = url['id']
url = url['url']
table_name = my_db.make_sql_table_name_by_url(url)
insert_query = my_db.get_sql_for_insert(table_name)
r = requests.get(url=url).json() # make the request
args = [json.dumps(r), update_id, url_id]
my_db.db_execute_one(insert_query, args, close_conn=False)
my_db.close_conn()
This works fine but to speed it up How can I run it asynchronously?
I have looked here, here and here but can't seem to get my head around it.
Here is what I have tried based on #Raphael Medaer's answer.
async def fetch(url):
async with ClientSession() as session:
async with session.request(method='GET', url=url) as response:
json = await response.json()
return json
async def process(url, update_id):
table_name = await db.make_sql_table_name_by_url(url)
result = await fetch(url)
print(url, result)
if __name__ == "__main__":
"""Get urls from DB"""
db = Db(database="fuse_src")
urls = db.get_rest_api_urls() # This returns list of dictionary
update_id = db.get_updateid()
url_list = []
for url in urls:
url_list.append(url['url'])
print(update_id)
asyncio.get_event_loop().run_until_complete(
asyncio.gather(*[process(url, update_id) for url in url_list]))
I get an error in the process method:
TypeError: object str can't be used in 'await' expression
Not sure whats the problem?
Any code example specific to this would be highly appreciated.
Make this code asynchronous will not speed it up at all. Except if you consider to run a part of your code in "parallel". For instance you can run multiple (SQL or HTTP) queries in "same time". By doing asynchronous programming you will not execute code in "same time". Although you will get benefit of long IO tasks to execute other part of your code while you're waiting for IOs.
First of all, you'll have to use asynchronous libraries (instead of synchronous one).
mysql.connector could be replaced by aiomysql from aio-libs.
requests could be replaced by aiohttp
To execute multiple asynchronous tasks in "parallel" (for instance to replace your loop for url in urls_dict:), you have to read carefully about asyncio tasks and function gather.
I will not (re)write your code in an asynchronous way, however here are a few lines of pseudo code which could help you:
async def process(url):
result = await fetch(url)
await db.commit(result)
if __name__ == "__main__":
db = MyDbConnection()
urls = await db.fetch_all_urls()
asyncio.get_event_loop().run_until_complete(
asyncio.gather(*[process(url) for url in urls]))

Make Python Async Requests Faster

I am writing a get method that gets an array of ids and then makes a request for each id. The array of ids can be potentially 500+ and right now the requests are taking 20+ minutes. I have tried several different async methods like aiohttp and async and neither of them have worked to make the requests faster. Here is my code:
async def get(self):
self.set_header("Access-Control-Allow-Origin", "*")
story_list = []
duplicates = []
loop = asyncio.get_event_loop()
ids = loop.run_in_executor(None, requests.get, 'https://hacker-news.firebaseio.com/v0/newstories.json?print=pretty')
response = await ids
response_data = response.json()
print(response.text)
for url in response_data:
if url not in duplicates:
duplicates.append(url)
stories = loop.run_in_executor(None, requests.get, "https://hacker-news.firebaseio.com/v0/item/{}.json?print=pretty".format(
url))
data = await stories
if data.status_code == 200 and len(data.text) > 5:
print(data.status_code)
print(data.text)
story_list.append(data.json())
Is there a way I can use multithreading to make the requests faster?
The main issue here is that the code isn't really async.
After getting your list of URL's you are then fetching them one at a time and then awaiting the response.
A better idea would be to filter out the duplicates (use a set) before queuing all of the URLs in the executor and awaiting all of them to finish eg:
async def get(self):
self.set_header("Access-Control-Allow-Origin", "*")
stories = []
loop = asyncio.get_event_loop()
# Single executor to share resources
executor = ThreadPoolExecutor()
# Get the initial set of ids
response = await loop.run_in_executor(executor, requests.get, 'https://hacker-news.firebaseio.com/v0/newstories.json?print=pretty')
response_data = response.json()
print(response.text)
# Putting them in a set will remove duplicates
urls = set(response_data)
# Build the set of futures (returned by run_in_executor) and wait for them all to complete
responses = await asyncio.gather(*[
loop.run_in_executor(
executor, requests.get,
"https://hacker-news.firebaseio.com/v0/item/{}.json?print=pretty".format(url)
) for url in urls
])
# Process the responses
for response in responses:
if response.status_code == 200 and len(response.text) > 5:
print(response.status_code)
print(response.text)
stories.append(response.json())
return stories

How paginate through api response asynchronously with asyncio and aiohttp

I'm trying to make api calls with python asynchronously. I have multiple endpoints in a list and each endpoint will return paginated results. I'm able to set up going though the multiple endpoints asynchronously, however am not able to return the paginated results of each endpoint.
From debugging, I found that the fetch_more() function runs the while loop, but doesn't actually get past the async with session.get(). So basically. The function fetch_more() is intended to get remaining results from the api call for each endpoint, however I find that the same number of results are returned with or without the fetch_more() function. I've tried looking for examples of pagination with asyncio but have not have much luck.
From my understanding, I should not be doing a request inside a while loop, however, I'm not sure a way around that in order to get paginated results.
if __name__ == 'main':
starter_fun(url, header, endpoints):
starter_func(url, header, endpoints):
loop = asyncio.get_event_loop() #event loop
future = asyncio.ensure_future(fetch_all(url, header, endpoints))
loop.run_until_complete(future) #loop until done
async def fetch_all(url, header, endpoints):
async with ClientSession() as session:
for endpoint in endpoints:
task = asyncio.ensure_future(fetch(url, header, endpoint))
tasks.append(task)
res = await asyncio.gather(*tasks) # gather task responses
return res
async def fetch(url, header, endpoint):
total_tasks = []
async with session.get(url, headers=header, params=params, ssl=False) as response:
response_json = await response.json()
data = response_json['key']
tasks = asyncio.ensure_future(fetch_more(response_json, data, params, header, url, endpoint, session)) //this is where I am getting stuck
total_tasks.append(tasks)
return data
//function to get paginated results of api endpoint
async def fetch_more(response_json, data, params, header, url, endpoint, session): //this is where I am getting stuck
while len(response_json['key']) >= params['limit']:
params['offset'] = response_json['offset'] + len(response_json['key'])
async with session.get(url, headers=header, params=params, ssl=False) as response_continued:
resp_continued_json = await response_continued.json()
data.extend(resp_continued_json[kebab_to_camel(endpoint)])
return data
Currently I am getting 1000 results with or without the fetch_more function, however it should be a lot more with the fetch_more. Any idea as to how to approach asynchronously paginating?
from aiohttp import web
async def fetch(self size: int = 10):
data = "some code to fetch data here"
def paginate(_data, _size):
import itertools
while True:
i1, i2 = itertools.tee(_data)
_data, page = (itertools.islice(i1, _size, None),
list(itertools.islice(i2, _size)))
if len(page) == 0:
break
yield page
return web.json_response(list(paginate(_data=data, _size=size)))

Categories

Resources