Python asyncio retry for 200 response with specific result

I'm in a situation where I need to retry an async request even when the request returns a 200 response. For some specific cases, I need to check whether a particular key is present in the output; if it is, we need to retry. The following sample code (which can be executed in a Jupyter notebook) handles retries whenever the request fails (non-200). How can I tweak it to cater to this particular need?
P.S. Ideally, the response should have been non-200, but this is the option I'm left with.
# load required libraries
import json
import asyncio
import aiohttp
from async_retrying import retry

base_url = "http://localhost:1050/hello?rid="

# async ginger call
@retry(attempts=3)
async def async_ginger_call():
    connector = aiohttp.TCPConnector(limit=3)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.post(url, raise_for_status=True, timeout=300) as response:
            result = await response.text()
            # condition here; if key in result then retry
            return json.loads(result)

reqs = 2
tasks = []
connector = aiohttp.TCPConnector(limit=reqs)
async with aiohttp.ClientSession(connector=connector) as session:
    for i in range(reqs):
        url = base_url + str(i)
        # encode sentence
        tasks.append(async_ginger_call())
    results = await asyncio.gather(*tasks, return_exceptions=True)
Sample Flask server code:
# sample api
import time
import json
import datetime
from flask import Flask, request
from flask import Response

app = Flask(__name__)

@app.route('/hello', methods=['GET', 'POST'])
def welcome():
    rid = request.args.get('rid', default=3, type=int)
    valid_response = json.dumps({
        "Result": {
            "Warnings": [
                {
                    "Code": 1100,
                    "Message": "A technical error occurred during execution."
                }
            ]
        }
    })
    # testing for failure
    if rid == 2:
        # pass
        # return valid_response
        return Response("{'Result': ''}", status=500, mimetype='application/json')
    return valid_response

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=1050)
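For what it's worth, one way to approach this (a minimal sketch, not from the original post) is to drop the decorator and write the retry loop by hand, raising a custom exception whenever the unwanted key shows up in an otherwise successful response. The key name "Warnings", the attempt count and the delay are assumptions for illustration:
# minimal sketch: retry on HTTP errors *and* on a key in a 200 response
# (the key name "Warnings", max_attempts and delay are assumptions for illustration)
import asyncio
import json
import aiohttp

class RetryableResult(Exception):
    """Raised when a 200 response still contains a result we want to retry."""

async def call_with_retry(url, max_attempts=3, delay=1.0):
    for attempt in range(1, max_attempts + 1):
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(url, raise_for_status=True, timeout=300) as response:
                    payload = json.loads(await response.text())
            if "Warnings" in payload.get("Result", {}):  # hypothetical retry condition
                raise RetryableResult(f"attempt {attempt}: warnings present in 200 response")
            return payload
        except (aiohttp.ClientError, asyncio.TimeoutError, RetryableResult):
            if attempt == max_attempts:
                raise
            await asyncio.sleep(delay)
The calls can then be gathered exactly as before, e.g. results = await asyncio.gather(*(call_with_retry(base_url + str(i)) for i in range(reqs)), return_exceptions=True).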

Related

Unit Testing Replace remote API Server with predefined response

So, I have a server running FastAPI which will make an API call to a remote API upon request.
I am developing unit tests for this application, but here comes the question:
Can I, for the purpose of the test, replace a legitimate remote API server response with a predefined response?
Example of the tests run:
from fastapi.testclient import TestClient
from web_api import app

client = TestClient(app)

def test_get_root():
    response = client.get('/')
    assert response.status_code == 200
    assert response.json() == {"running": True}
And my server:
from fastapi import FastAPI

app = FastAPI()

@app.get("/")
def home():
    return {"running": True}
This is a simple example, but other endpoints of my API would call an external remote API:
def call_api(self, endpoint: str, params: dict):
    url = self.BASEURL + urllib.parse.quote(endpoint)
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
    except requests.exceptions.HTTPError as error:
        print(error)
    return response
Because I want to test the response of MY API, I would like to replace the remote API with a predefined response.
Also, one user request can end up in multiple background API requests with transformed pieces of data.
Edit
Here are some more details on the structure of the application:
#app.get("/stuff/.......",
# lots of params
)
def get_stuff_from_things(stuff:list, params):
api = API(api_key=...)
# Do some stuff with the params
things = generate_things_list(params)
api.search_things(params)
# Check the result
# do some other stuff
return some_response
class API:
BASE_URL = 'https://api.example.com/'
def search_things(self, params):
# Do some stuff
# like putting stuff in the params
for s in stuff:
s.update(self.get_thing(params)) # -> get_thing()
# Do some more stuff
return stuff
# get_thing <- search_things
def get_thing(self, params...):
# Some stuff
results = self.call_api('something', params) # -> call_api()
json = results.json()
# Some more stuff
things = []
for thing in json['things']:
t = Thing(thing)
things.append(t)
return things
# call_api <- get_thing
def call_api(self, endpoint:str, params:dict):
url = self.BASEURL + urllib.parse.quote(endpoint)
try:
response = requests.get(url, params=params)
response.raise_for_status()
except requests.exceptions.HTTPError as error:
print(error)
self.last_response = response
return response
N.B. That is pseudo-code; I simplified the functions by removing the parameters, etc.
I hope it is clear, thanks for your help.
A complex API method might look like this (please pay attention to the depends mechanism - it is crucial):
import urllib.parse

import requests
from fastapi import FastAPI, Depends

app = FastAPI()

# this can be in a different file
class RemoteCallWrapper:
    def call_api(self, baseurl: str, endpoint: str, params: dict):
        url = baseurl + urllib.parse.quote(endpoint)
        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
        except requests.exceptions.HTTPError as error:
            print(error)
        return response

@app.get("/complex_api")
def calls_other_api(remote_call_wrapper=Depends(RemoteCallWrapper)):
    response = remote_call_wrapper.call_api("https://jsonplaceholder.typicode.com",
                                            "/todos/1", None)
    return {"result": response.json()}
Now, we wish to replace the remote call class. I wrote a helper library that simplifies the replacement for tests - pytest-fastapi-deps:
from fastapi.testclient import TestClient
from mock.mock import Mock
from requests import Response
from web_api import app, RemoteCallWrapper

client = TestClient(app)

class MyRemoteCallWrapper:
    def call_api(self, baseurl: str, endpoint: str, params: dict):
        the_response = Mock(spec=Response)
        the_response.json.return_value = {"my": "response"}
        return the_response

def test_get_root(fastapi_dep):
    with fastapi_dep(app).override({RemoteCallWrapper: MyRemoteCallWrapper}):
        response = client.get('/complex_api')
        assert response.status_code == 200
        assert response.json() == {"result": {"my": "response"}}
You override the RemoteCallWrapper with your MyRemoteCallWrapper implementation for the test, which has the same spec.
As asserted, the response changed to our predefined response.
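If you prefer not to pull in a helper library, FastAPI's built-in dependency_overrides dict does the same job. A minimal sketch, assuming the same app, RemoteCallWrapper and MyRemoteCallWrapper as in the snippets above:
# minimal sketch using FastAPI's built-in override mechanism
# (assumes the app, RemoteCallWrapper and MyRemoteCallWrapper defined above)
from fastapi.testclient import TestClient
from web_api import app, RemoteCallWrapper

client = TestClient(app)

def test_complex_api_with_override():
    app.dependency_overrides[RemoteCallWrapper] = MyRemoteCallWrapper
    try:
        response = client.get('/complex_api')
        assert response.json() == {"result": {"my": "response"}}
    finally:
        app.dependency_overrides.clear()  # undo the override so other tests see the real dependency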
It sounds like you'd want to mock your call_api() function.
With a small modification to call_api() (returning the result of .json()), you can easily mock the whole function while calling the endpoint in your tests.
I'll use two files, app.py and test_app.py, to demonstrate how I would do this:
# app.py
import urllib.parse

import requests
from fastapi import FastAPI

app = FastAPI()

def call_api(self, endpoint: str, params: dict):
    url = self.BASEURL + urllib.parse.quote(endpoint)
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
    except requests.exceptions.HTTPError as error:
        print(error)
    return response.json()  # <-- This is the only change. Makes it easier to test things.

@app.get("/")
def home():
    return {"running": True}

@app.get("/call-api")
def make_call_to_external_api():
    # `endpoint` and `params` could be anything here and could be different
    # depending on the query parameters when calling this endpoint.
    response = call_api(endpoint="something", params={})
    # Do something with the response...
    result = response["some_parameter"]
    return result
# test_app.py
from unittest import mock

from fastapi import status
from fastapi.testclient import TestClient

import app as app_module
from app import app

def test_call_api_endpoint():
    test_response = {
        "some_parameter": "some_value",
        "another_parameter": "another_value",
    }
    # The line below will "replace" the result of `call_api()` with whatever
    # is given in `return_value`. The original function is never executed.
    with mock.patch.object(app_module, "call_api", return_value=test_response) as mock_call:
        with TestClient(app) as client:
            res = client.get("/call-api")
            assert res.status_code == status.HTTP_200_OK
            assert res.json() == "some_value"
            # Make sure the function has been called with the right parameters.
            # This could be dynamic based on how the endpoint has been called.
            mock_call.assert_called_once_with(endpoint="something", params={})
If app.py and test_app.py are in the same directory you can run the tests simply by running pytest inside that directory.

Memory Utilization keeps on increasing while scraping data using asyncio

I'm scraping data using asyncio and storing the data in a Redis database. My scraper is running fine, but memory utilization on the Linux server keeps increasing until it reaches 100%, at which point the server freezes. I have to manually reboot the server and restart the script. I'm using 2 credentials to hit an API endpoint to get data as fast as possible.
Here is the sample code:
from asyncio import tasks
from datetime import datetime, timedelta
from multiprocessing import Semaphore
from socket import timeout
import time
import asyncio
from aiohttp import ClientSession
from requests.exceptions import HTTPError
import config
import json
import pandas as pd
from loguru import logger
import pytz
import aioredis
from redis import Redis

RESULTS = []
result_dict = {}

redis = Redis(
    host="host",
    port=6379,
    decode_responses=True,
    # ssl=True,
    username="default",
    password="password",
)

async def get(url, session):
    try:
        response = await session.request(method="GET", url=url, timeout=1)
    except Exception as err:
        response = await session.request(method="GET", url=url, timeout=3)
    pokemon = await response.json()
    return pokemon["name"]

async def run_program(url, session, semaphore):
    async with semaphore:
        try:
            pokemon_name = await get(url, session)
            await publish(pokemon_name)
        except:
            pass

async def main():
    header_dict = {
        "header1": {
            # Request headers
            # "API-Key-1": config.PRIMARY_API_KEY,
            "Cache-Control": "no-cache",
        },
        "header2": {
            # "API-Key-2": config.SECONDARY_API_KEY,
            "Cache-Control": "no-cache",
        },
    }
    semaphore = asyncio.BoundedSemaphore(20)
    tasks = []
    for key, value in header_dict.items():
        # logger.info(value)
        async with ClientSession(headers=value) as session:
            for i in range(0, 5):
                URLS = f"https://pokeapi.co/api/v2/pokemon/{i}"
                tasks.append(
                    asyncio.ensure_future(run_program(URLS, session, semaphore))
                )
            await asyncio.gather(*tasks)

async def publish(data):
    if not data.empty:
        try:
            keyName = "channelName"
            value = data
            redis.set(keyName, value)
            print("inserting")
        except:
            pass
    else:
        pass

while True:
    try:
        asyncio.run(main(), debug=True)
    except Exception as e:
        time.sleep(1)
        asyncio.run(main(), debug=True)
I want to know why memory consumption is increasing and how to stop it.
Here is the image of memory utilization in percent over time. There is no other script running on the same Linux server except this one.
There are several possible causes of the memory leak here:
1. You're connecting to Redis and never closing the connection.
2. Setting timeout=1 will quite often raise exceptions, which can be another source of the leak (see: Python not catching MemoryError).
3. The session is created on every iteration over the headers. In the example there are only two, but the real headers list may be larger.
4. tasks is never emptied after gather is called.
I tried to optimize the code, and here is what I got:
import asyncio
import time

from aiohttp import ClientSession
from redis import DataError
from redis import Redis

async def publish(data, redis):
    if not data.empty:
        try:
            redis.set("channelName", data)
        except (DataError, Exception):
            pass

async def run_program(url, session, headers, semaphore, redis):
    async with semaphore:
        try:
            response = await session.request(method="GET", url=url, headers=headers)
            pokemon = await response.json()
            pokemon_name = pokemon.get("name")
            await publish(pokemon_name, redis)
        except:
            pass

async def main():
    header_dict = {
        "header1": {
            # Request headers
            "Cache-Control": "no-cache",
        },
        "header2": {
            "Cache-Control": "no-cache",
        },
    }
    semaphore = asyncio.BoundedSemaphore(20)
    async with ClientSession() as session:
        for headers in header_dict.values():
            with Redis(host="host", port=6379, decode_responses=True, username="default", password="password") as redis:
                await asyncio.gather(*[
                    asyncio.ensure_future(
                        run_program(f"https://pokeapi.co/api/v2/pokemon/{i}", session, headers, semaphore, redis)
                    ) for i in range(5)
                ])

while True:
    try:
        asyncio.run(main(), debug=True)
    except Exception as e:
        time.sleep(1)
        asyncio.run(main(), debug=True)
All these changes should optimize memory usage.
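If usage still climbs after these changes, it helps to confirm where the memory is actually going before touching more code. A minimal sketch (my addition, using the standard-library tracemalloc module; the 60-second interval and top-10 limit are arbitrary choices):
# minimal sketch: periodically log the top memory allocations with tracemalloc
# (the 60-second interval and the top-10 limit are arbitrary choices)
import asyncio
import tracemalloc

async def report_memory(interval=60):
    tracemalloc.start()
    while True:
        await asyncio.sleep(interval)
        snapshot = tracemalloc.take_snapshot()
        for stat in snapshot.statistics("lineno")[:10]:
            print(stat)  # file:line plus how much memory that line currently holds
Starting this as a background task inside main() (for example asyncio.create_task(report_memory())) prints the lines of code holding the most memory, which usually points straight at the leak.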

Parallelize checking of dead URLs

The question is quite simple: is it possible to test a list of URLs and store only the dead URLs (response code > 400) in a list using an asynchronous function?
I previously used the requests library to do it and it works great, but I have a big list of URLs to test and doing it sequentially takes more than an hour.
I saw a lot of articles on how to make parallel requests using asyncio and aiohttp, but I didn't see much about how to test URLs with these libraries.
Is it possible to do it?
Using multithreading you could do it like this:
import requests
from concurrent.futures import ThreadPoolExecutor

results = dict()

# test the given url
# add url and status code to the results dictionary if GET succeeds but status code >= 400
# also add url to results dictionary if an exception arises with full exception details
def test_url(url):
    try:
        r = requests.get(url)
        if r.status_code >= 400:
            results[url] = f'{r.status_code=}'
    except requests.exceptions.RequestException as e:
        results[url] = str(e)

# return a list of URLs to be checked. probably get these from a file in reality
def get_list_of_urls():
    return ['https://facebook.com', 'https://google.com', 'http://google.com/nonsense', 'http://goooglyeyes.org']

def main():
    with ThreadPoolExecutor() as executor:
        executor.map(test_url, get_list_of_urls())
    print(results)

if __name__ == '__main__':
    main()
You could do something like this using aiohttp and asyncio.
It could be done more pythonically, I guess, but this should work.
import aiohttp
import asyncio

urls = ['url1', 'url2']

async def test_url(session, url):
    async with session.get(url) as resp:
        if resp.status > 400:
            return url

async def main():
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in urls:
            tasks.append(asyncio.ensure_future(test_url(session, url)))
        dead_urls = await asyncio.gather(*tasks)
        print(dead_urls)

asyncio.run(main())
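Note that gather returns None for every URL that turned out to be alive, so in practice you would probably filter those out, for example by replacing the dead_urls line inside main() with:
# keep only the actual dead URLs, dropping the None results for live ones
dead_urls = [url for url in await asyncio.gather(*tasks) if url is not None]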
Very basic example, but this is how I would solve it:
from aiohttp import ClientSession
from asyncio import create_task, gather, run

async def TestUrl(url, session):
    async with session.get(url) as response:
        if response.status >= 400:
            r = await response.text()
            print(f"Site: {url} is dead, response code: {str(response.status)} response text: {r}")

async def TestUrls(urls):
    resultsList: list = []
    async with ClientSession() as session:
        # Maybe some rate limiting?
        partitionTasks: list = [
            create_task(TestUrl(url, session))
            for url in urls]
        resultsList.append(await gather(*partitionTasks, return_exceptions=False))
    # do stuff with the results or return?
    return resultsList

async def main():
    urls = []
    test = await TestUrls(urls)

if __name__ == "__main__":
    run(main())
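For the rate limiting hinted at in the comment, an asyncio.Semaphore is the usual tool. A minimal sketch building on the TestUrl coroutine above (the limit of 50 concurrent requests is an arbitrary choice):
# minimal sketch: cap concurrency with an asyncio.Semaphore
# (builds on the TestUrl coroutine above; the limit of 50 is an arbitrary choice)
from asyncio import Semaphore

async def TestUrlLimited(url, session, semaphore):
    async with semaphore:  # at most 50 requests in flight at once
        await TestUrl(url, session)
Inside TestUrls you would then create semaphore = Semaphore(50) once and build the tasks with create_task(TestUrlLimited(url, session, semaphore)).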
Try using a ThreadPoolExecutor
from concurrent.futures import ThreadPoolExecutor
import requests

url_list = [
    "https://www.google.com",
    "https://www.adsadasdad.com",
    "https://www.14fsdfsff.com",
    "https://www.ggr723tg.com",
    "https://www.yyyyyyyyyyyyyyy.com",
    "https://www.78sdf8sf5sf45sf.com",
    "https://www.wikipedia.com",
    "https://www.464dfgdfg235345.com",
    "https://www.tttllldjfh.com",
    "https://www.qqqqqqqqqq456.com"
]

def check(url):
    r = requests.get(url)
    if r.status_code < 400:
        print(f"{url} is ALIVE")

with ThreadPoolExecutor(max_workers=5) as e:
    for url in url_list:
        e.submit(check, url)
Multiprocessing could be the better option for your problem.
from multiprocessing import Process
from multiprocessing import Manager
import requests

def checkURLStatus(url, url_status):
    res = requests.get(url)
    if res.status_code >= 400:
        url_status[url] = "Inactive"
    else:
        url_status[url] = "Active"

if __name__ == "__main__":
    urls = [
        "https://www.google.com"
    ]
    manager = Manager()
    # to store the results for later usage
    url_status = manager.dict()
    procs = []
    for url in urls:
        proc = Process(target=checkURLStatus, args=(url, url_status))
        procs.append(proc)
        proc.start()
    for proc in procs:
        proc.join()
    print(url_status.values())
url_status is a shared variable that stores the results from the separate processes. Refer to the multiprocessing documentation for more info.
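As a side note (my sketch, not part of the answer above): if the URL list is long, a multiprocessing.Pool keeps the number of worker processes bounded instead of spawning one Process per URL:
# minimal sketch: the same check with a bounded multiprocessing.Pool
# (the pool size of 5 is an arbitrary choice)
from multiprocessing import Pool
import requests

def is_dead(url):
    try:
        return url if requests.get(url).status_code >= 400 else None
    except requests.exceptions.RequestException:
        return url  # treat connection errors as dead as well

if __name__ == "__main__":
    urls = ["https://www.google.com"]
    with Pool(5) as pool:
        dead = [u for u in pool.map(is_dead, urls) if u is not None]
    print(dead)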

How to get a part of the url using aiohttp_jinja2?

So I am trying to get a part of the URL, like the 123456789 in http://example/home?code=123456789. It changes every time since it is OAuth, so I am trying to extract it.
This is the .py file:
from aiohttp import web, web_urldispatcher
import discord
from discord.ext import commands
import aiohttp_jinja2
import jinja2
from pathlib import Path
from oauth import Ouath

@aiohttp_jinja2.template('home.html')
async def start(request):
    raise web.HTTPSeeOther(location=Ouath.discord_login_url)

@aiohttp_jinja2.template('home.html')
async def login(request):
    return

app = web.Application(loop=self.client.loop)
aiohttp_jinja2.setup(app, loader=jinja2.FileSystemLoader(str(here)))
app.router.add_get('/', start)
app.router.add_get('/home', login)
runner = web.AppRunner(app)
await runner.setup()
self.site = web.TCPSite(runner, '127.0.0.1', 5000)
await self.client.wait_until_ready()
await self.site.start()
I want to print it in the HTML file, but I don't know how to get that part.
Note: I edited the code box.
Since you're using web from aiohttp, you can add a route that accepts a parameter:
routes = web.RouteTableDef()

@routes.get('/guild/{guild}')
async def guild(request):
    gid = request.match_info['guild']
The URL would be http://localhost:PORT/guild/123456.
Once you've fetched the required details, you're free to render a template or return a response.
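For completeness, a minimal sketch of handing that value to the template (the template name home.html and the context key guild are assumptions):
# minimal sketch: pass the extracted value into the jinja2 template
# (the template name 'home.html' and the context key 'guild' are assumptions)
import aiohttp_jinja2
from aiohttp import web

routes = web.RouteTableDef()

@routes.get('/guild/{guild}')
@aiohttp_jinja2.template('home.html')
async def guild(request):
    gid = request.match_info['guild']
    return {'guild': gid}  # available in the template as {{ guild }}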
After digging in the source code of aiohttp_jinja2 and aiohttp, it seems you can get it with request.query.get('code'):
@aiohttp_jinja2.template('home.html')
async def login(request):
    #print('code:', request.query.get('code'))
    return {'code': request.query.get('code')}
If there is no ?code=... in the URL then it gives None, but you can set a different default value using request.query.get('code', some_default_value).
Doc aiohttp: web.BaseRequest.query
If you have the URL as a string, then you can use
URL = 'http://example/home?code=123456789'
code = URL.split('?code=')[-1]
or, if the number is always at the end and always has the same length,
URL = 'http://example/home?code=123456789'
code = URL[-9:]
But there is also urllib.parse, and
URL = 'http://example/home?code=123456789'
data = urllib.parse.parse_qs(urllib.parse.urlsplit(URL).query)
gives a dictionary
{'code': ['123456789']}
and you can do
code = data.get('code')
which gives a list containing the expected code, or None if there was no ?code=... in the URL.
EDIT: You probably have to use request.url:
@aiohttp_jinja2.template('home.html')
async def login(request):
    data = urllib.parse.parse_qs(urllib.parse.urlsplit(str(request.url)).query)
    code = data.get('code')
    return {'code': code}
Because data is a dictionary with "code", you could simply return data:
@aiohttp_jinja2.template('home.html')
async def login(request):
    data = urllib.parse.parse_qs(urllib.parse.urlsplit(str(request.url)).query)
    return data
@aiohttp_jinja2.template('home.html')
async def login(request):
    code = urllib.parse.parse_qs(urllib.parse.urlsplit(request).query)
    return {'code': code}
Like this?

How can I fix coroutine was never awaited?

I have a RESTful Flask API that I am serving with gunicorn, and I'm trying to continue running parse_request() after sending a response to whoever made a POST request, so they're not left waiting for it to finish.
I'm not too sure if this will even achieve what I want, but this is the code I have so far.
from threading import Thread
import subprocess
from flask import Flask, request, jsonify
import asyncio

application = Flask(__name__)

async def parse_request(data):
    try:
        command = './webscraper.py -us "{user}" -p "{password}" -url "{url}"'.format(**data)
        output = subprocess.check_output(['bash', '-c', command])
    except Exception as e:
        print(e)

@application.route('/scraper/run', methods=['POST'])
def init_scrape():
    try:
        thread = Thread(target=parse_request, kwargs={'data': request.json})
        thread.start()
        return jsonify({'Scraping this site: ': request.json["url"]}), 201
    except Exception as e:
        print(e)

if __name__ == '__main__':
    try:
        application.run(host="0.0.0.0", port="8080")
    except Exception as e:
        print(e)
I am sending a POST request similar to this.
localhost:8080/scraper/run
data = {
    "user": "username",
    "password": "password",
    "url": "www.mysite.com"
}
The error I get when sending a POST request is this.
/usr/lib/python3.6/threading.py:864: RuntimeWarning: coroutine 'parse_request' was never awaited
self._target(*self._args, **self._kwargs)
So first things first: why are you calling webscraper.py with subprocess? This is completely pointless. Because webscraper.py is a Python script, you should be importing the needed functions/classes from webscraper.py and using them directly. Calling it this way totally defeats what you are trying to do.
Next, to your actual question: you have got async and threading mixed up. I suggest you learn more about the difference, but essentially you want something like the following using Quart, which is an async version of Flask and would suit your situation well.
from quart import Quart, request, jsonify
import asyncio
from webscraper import <Class>, <webscraper_func>  # Import what you need or
import webscraper                                   # whatever suits your needs

app = Quart(__name__)

def parse_request(user, password, url):
    # blocking scraping work; it runs in a worker thread via run_in_executor below
    webscraper_func(user, password, url)
    return 'Success'

@app.route('/scraper/run', methods=['POST'])
async def init_scrape():
    user = request.args.get('user')
    password = request.args.get('password')
    url = request.args.get('url')
    asyncio.get_running_loop().run_in_executor(
        None, parse_request, user, password, url
    )
    return 'Success'

if __name__ == '__main__':
    app.run(host='0.0.0.0', port='8080')
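Alternatively, if you want to stay on Flask, the warning itself disappears as soon as parse_request is a plain function rather than a coroutine, because Thread(target=...) never awaits anything. A minimal sketch, assuming webscraper exposes an importable scrape(user, password, url) function (that name is hypothetical):
# minimal sketch: plain Flask + threading, no coroutine involved
# (assumes webscraper exposes a scrape(user, password, url) function; the name is hypothetical)
from threading import Thread
from flask import Flask, request, jsonify
from webscraper import scrape  # hypothetical import

application = Flask(__name__)

def parse_request(data):
    # plain function: the thread calls it directly, nothing is ever awaited
    scrape(data["user"], data["password"], data["url"])

@application.route('/scraper/run', methods=['POST'])
def init_scrape():
    Thread(target=parse_request, kwargs={'data': request.json}, daemon=True).start()
    return jsonify({'Scraping this site: ': request.json["url"]}), 201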
