Python asynchronous call separate functions - python

I have 4 functions as follows; 3 of them call external APIs to fetch data:
def get_external_data(zip_code, info_code):
    data_url = MY_ENDPOINT_1.format(zip_code)
    data_response = requests.request(
        'GET',
        url=data_url,
        headers={
            'content-Type': 'application/json',
            'info-code': info_code
        }
    )
    return data_response

def get_info(info_code):
    info_url = INFO_URL.format(info_code)
    info_response = requests.request(
        'GET',
        url=info_url,
        headers={
            'content-Type': 'application/json',
            'info-code': info_code
        }
    )
    return info_response

def get_zip(zip_code):
    zip_url = zip_URL.format(zip_code)
    zip_response = requests.request(
        'GET',
        url=zip_url,
        headers={
            'content-Type': 'application/json',
            'zip-code': zip_code
        }
    )
    return zip_response

def get_all_data(info_cd, zip_code):
    data_response = get_external_data(zip_code, info_cd)
    info_response = get_info(info_cd)
    zip_response = get_zip(zip_code)
    return data_response, info_response, zip_response
I am trying to make the 3 data function calls asynchronous inside get_all_data() using concurrent.futures. From the docs I understand that we can thread one function over a list of inputs as follows:
resp_list = list()

# Asynchronous
with ThreadPoolExecutor(max_workers=3) as executor:
    thread_responses = executor.map(self.get_info, info_list)
    for x in thread_responses:
        resp_list.append(x)
But how will I achieve 3 threads in this particular situation, with 3 different functions taking different inputs?

You can make each call in its own thread using executor.submit, like this:
with ThreadPoolExecutor(max_workers=3) as executor:
    data_future = executor.submit(get_external_data, zip_code, info_cd)
    info_future = executor.submit(get_info, info_cd)
    zip_future = executor.submit(get_zip, zip_code)

    # and then collect results
    results = [
        f.result()
        for f in (data_future, info_future, zip_future)
    ]
See related docs: https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.Executor.submit
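For example, get_all_data() from the question can become a thin wrapper around the executor. A minimal sketch reusing the question's function names; note that f.result() re-raises any exception that was raised inside the corresponding call:

from concurrent.futures import ThreadPoolExecutor

def get_all_data(info_cd, zip_code):
    # Submit the three independent requests so they run concurrently.
    with ThreadPoolExecutor(max_workers=3) as executor:
        data_future = executor.submit(get_external_data, zip_code, info_cd)
        info_future = executor.submit(get_info, info_cd)
        zip_future = executor.submit(get_zip, zip_code)
    # Leaving the with-block waits for all submitted work to finish.
    return data_future.result(), info_future.result(), zip_future.result()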

Related

running multiple api calls (dependent on one's output) in parallel to speed up output

I am trying to run a series of Wikipedia API calls, and the output of some of the calls is used in another function (for another API call). The individual API calls are fast (under 1 minute), but the function that uses these API calls together takes approx. 1 hour.
first API call:
def img(cat_title) -> list:
    url = "https://co.media.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
    }
    resp = requests.get(url, params)
    response = resp.json()
    files_list = [response['query']['pages'][str(pageid)]['title'] for pageid in list(response['query']['pages'].keys())]
    while response:  # continuation for response length greater than 500
        try:
            params.update(response['continue'])
            resp = requests.get(url, params)
            response = resp.json()
            files_list += [response['query']['pages'][str(pageid)]['title'] for pageid in list(response['query']['pages'].keys())]
        except:
            break
    return files_list

img('Category:...')
second API call:
def user(title) -> str:
    url = "https://co.media.org/w/api.php"
    params = {
        "action": "gpquery",
        "format": "json",
    }
    resp = requests.get(url, params)
    response = resp.json()
    response_pages = response['query']['pages']
    page_id = list(response_pages.keys())[0]  # automates the pageid for each file/a file
    userinfo = response_pages[page_id]['imageinfo'][0]['user']  # retrieves the value of item imageinfo
    return userinfo

user("File:..")
Count function: this uses a list comprehension over the above functions. The output is an integer.
def count(name) -> int:
    files_list = ['Category:.', 'Category:...', 'Category:...', 'Category:...', 'Category:...', 'Category:...', 'Category:..']
    for cat_name in files_list:
        users = [user(files) for files in img(cat_name) if name in user(files)]
        print(len(users))

count('cute')
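A minimal sketch of one way to cut the runtime, assuming the user() lookups for different files are independent: overlap the HTTP round-trips with a ThreadPoolExecutor and call user() only once per file instead of twice (the worker count and the shortened category list are illustrative):

from concurrent.futures import ThreadPoolExecutor

def count(name) -> int:
    categories = ['Category:.', 'Category:...']  # same category list as in the question
    total = 0
    with ThreadPoolExecutor(max_workers=8) as executor:
        for cat_name in categories:
            # Fetch the uploader of every file in the category concurrently,
            # calling user() once per file.
            uploaders = list(executor.map(user, img(cat_name)))
            matches = [u for u in uploaders if name in u]
            print(len(matches))
            total += len(matches)
    return total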

How to pass variable from script 1 to script 2 in python

I need help please.
I have 2 scripts. The first script consumes from RabbitMQ and I need to send the body received to a variable in script 2.
However, the variable remains empty. I think script 1 may be calling script 2 before the value is received from RabbitMQ?
How can I achieve this? Thanks
script 1
import pika
import time
from script2 import strQueue

class ReceiveFromMQ(object):
    def __init__(self):
        credentials = pika.PlainCredentials('xxxx', 'xxxx')
        parameters = pika.ConnectionParameters('xxxx', xxx, 'xxx', credentials)
        self.connection = pika.BlockingConnection(parameters)
        self.channel = self.connection.channel()
        self.channel.basic_qos(prefetch_count=1)
        self.channel.basic_consume(
            queue='queue',
            on_message_callback=self.on_response,
            auto_ack=True)
        self.response = None
        self.channel.start_consuming()

    def on_response(self, ch, method, props, body):
        self.response = body.decode()
        strQueue = body.decode()
        print(" [x] Received %r" % body.decode())
        # getMsg(body.decode())
        time.sleep(body.count(b'.'))
        print(" [x] Done")
        print(' [*] Waiting for messages. To exit press CTRL+C')
        return self.response

    def call(self):
        self.response = None
        self.connection.process_data_events(time_limit=None)
        print(str(self.response))
        return str(self.response)

receive_mq = ReceiveFromMQ()
response = receive_mq.call()
print(response)
script 2
import requests
import json

strQueue = None

# Function: Authenticate
def httpAuthenticate(in_apiusers, in_apipass, in_Tenant, in_URL):
    try:
        print('retrieve token...')
        url = in_URL
        payload = json.dumps({
            "password": str(in_apipass),
            "usernameOrEmailAddress": str(in_apiusers),
            "tenancyName": str(in_Tenant)
        })
        headers = {
            'Content-Type': 'application/json'
        }
        response = requests.request("POST", url, headers=headers, data=payload)
        json_object = json.loads(response.text)
        print('token code: ' + str(response.status_code))
        return str(json_object["result"])
    except Exception as e:
        return 'Fail:'

# Function: Add Queue Item on UiPath Orchestrator
def httpAddQueueItems(in_URL, in_Token, in_QueueName, in_strjson):
    try:
        print('add queue item...')
        url = in_URL
        payload = json.dumps({
            "itemData": {
                "Priority": "Normal",
                "Name": str(in_QueueName),
                "SpecificContent": {
                    "in_pjsorequest": in_strjson
                },
                "Reference": "ggg"
            }
        })
        headers = {
            'X-UIPATH-OrganizationUnitId': '',
            'Content-Type': 'application/json',
            'Authorization': 'Bearer ' + in_Token
        }
        response = requests.request("POST", url, headers=headers, data=payload)
    except Exception as e:
        print(e)
        return 'Fail'

# CONSTANTS
OnPremuser = "xxxx"
OnPrempass = "xxx!"
OnPremtenant = "Default"
OnPremUrlAuth = "xxxx"
OnPremUrlAddQueue = "https://xxxx"
OnPremQueue = "JSON"
OnPremPJSON = strQueue

OnPremtoken = httpAuthenticate(OnPremuser, OnPrempass, OnPremtenant, OnPremUrlAuth)
httpAddQueueItems(OnPremUrlAddQueue, OnPremtoken, OnPremQueue, OnPremPJSON)
What you are trying to achieve is not possible this way, because you are racing on a shared variable: script 2 runs at import time, before script 1 has received anything from RabbitMQ, so strQueue is still None when it is used. Assigning strQueue inside on_response also only rebinds a local name; it does not update the value that script 2 already imported.
Moreover, only one bytecode instruction can be executed at a time, which means only one CPU-bound task can run at a time.
P.S.: It can be achieved by running a consumer for the RabbitMQ producer and then handing the JSON received to script 2, for example by passing it into a function, once it has actually arrived.
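A minimal sketch of that approach, assuming script 2 exposes a function (add_to_queue is an illustrative name) that script 1 calls from its callback once the message body actually exists:

# script2.py: expose a function instead of a module-level variable
def add_to_queue(str_queue):
    # Authenticate and push the received JSON using the existing helpers.
    token = httpAuthenticate(OnPremuser, OnPrempass, OnPremtenant, OnPremUrlAuth)
    return httpAddQueueItems(OnPremUrlAddQueue, token, OnPremQueue, str_queue)

# script1.py: call it from the consumer callback
from script2 import add_to_queue

def on_response(self, ch, method, props, body):   # method of ReceiveFromMQ
    payload = body.decode()
    print(" [x] Received %r" % payload)
    add_to_queue(payload)  # hand the message to script 2 only after it has arrived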

Python - multiprocessing - API query

I am preparing code for querying some endpoints. The code works quite well, but it takes too much time. I would like to use the Python multiprocessing module to speed up the process. My main target is to run 12 API queries in parallel. Once the jobs are processed, I would like to fetch the results and put them into a list of dictionaries, one response per dictionary in the list. The API response is in JSON format. I am new to Python and don't have experience with this kind of case.
The code I want to run in parallel is below.
def api_query_process(cloud_type, api_name, cloud_account, resource_type):
    url = "xxx"
    payload = {
        "limit": 0,
        "query": f'config from cloud.resource where cloud.type = \'{cloud_type}\' AND api.name = \'{api_name}\' AND '
                 f'cloud.account = \'{cloud_account}\'',
        "timeRange": {
            "relativeTimeType": "BACKWARD",
            "type": "relative",
            "value": {
                "amount": 0,
                "unit": "minute"
            }
        },
        "withResourceJson": True
    }
    headers = {
        "content-type": "application/json; charset=UTF-8",
        "x-redlock-auth": api_token_input
    }
    response = requests.request("POST", url, json=payload, headers=headers)
    result = response.json()

    resource_count = len(result["data"]["items"])
    if resource_count:
        dictionary = dictionary_create(cloud_type, cloud_account, resource_type, resource_count)
        property_list_summary.append(dictionary)
    else:
        dictionary = dictionary_create(cloud_type, cloud_account, resource_type, 0)
        property_list_summary.append(dictionary)
Interesting problem, and I think you should consider idempotency: what would happen if you hit the endpoint repeatedly? You can use multiprocessing with or without a lock.
Without Lock:
import multiprocessing

with multiprocessing.Pool(processes=12) as pool:
    jobs = []
    for _ in range(12):
        # Pass the function and its argument tuple separately so the call
        # actually runs in a worker process.
        jobs.append(pool.apply_async(api_query_process, args))
    for job in jobs:
        job.wait()
With Lock:
import multiprocessing

multiprocessing_lock = multiprocessing.Lock()

def locked_api_query_process(cloud_type, api_name, cloud_account, resource_type):
    with multiprocessing_lock:
        api_query_process(cloud_type, api_name, cloud_account, resource_type)

with multiprocessing.Pool(processes=12) as pool:
    jobs = []
    for _ in range(12):
        jobs.append(pool.apply_async(locked_api_query_process, args))
    for job in jobs:
        job.wait()
I can't really do an end-to-end test, but hopefully this general setup helps you get it up and running.
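One note on collecting the output: apply_async returns AsyncResult objects, and appends made inside worker processes do not propagate back to the parent, so it is usually easier to have api_query_process return its dictionary and gather the results in the parent. A sketch, assuming query_args is a list of (cloud_type, api_name, cloud_account, resource_type) tuples:

import multiprocessing

with multiprocessing.Pool(processes=12) as pool:
    jobs = [pool.apply_async(api_query_process, args) for args in query_args]
    # .get() blocks until each job finishes and returns the function's return value
    property_list_summary = [job.get() for job in jobs]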
Since an HTTP request is an I/O-bound operation, you do not need multiprocessing; you can use threads to get better performance. Something like the following would help.
MAX_WORKERS says how many requests you want to send in parallel.
API_INPUTS are all the requests you want to make.
Untested code sample:
from concurrent.futures import ThreadPoolExecutor

import requests

API_TOKEN = "xyzz"
MAX_WORKERS = 4
API_INPUTS = (
    ("cloud_type_one", "api_name_one", "cloud_account_one", "resource_type_one"),
    ("cloud_type_two", "api_name_two", "cloud_account_two", "resource_type_two"),
    ("cloud_type_three", "api_name_three", "cloud_account_three", "resource_type_three"),
)

def make_api_query(api_token_input, cloud_type, api_name, cloud_account):
    url = "xxx"
    payload = {
        "limit": 0,
        "query": f'config from cloud.resource where cloud.type = \'{cloud_type}\' AND api.name = \'{api_name}\' AND '
                 f'cloud.account = \'{cloud_account}\'',
        "timeRange": {
            "relativeTimeType": "BACKWARD",
            "type": "relative",
            "value": {
                "amount": 0,
                "unit": "minute"
            }
        },
        "withResourceJson": True
    }
    headers = {
        "content-type": "application/json; charset=UTF-8",
        "x-redlock-auth": api_token_input
    }
    response = requests.request("POST", url, json=payload, headers=headers)
    return response.json()

def main():
    futures = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        for (cloud_type, api_name, cloud_account, resource_type) in API_INPUTS:
            futures.append(
                pool.submit(make_api_query, API_TOKEN, cloud_type, api_name, cloud_account)
            )

    property_list_summary = []
    for future, api_input in zip(futures, API_INPUTS):
        api_response = future.result()
        cloud_type, api_name, cloud_account, resource_type = api_input
        resource_count = len(api_response["data"]["items"])
        dictionary = dictionary_create(cloud_type, cloud_account, resource_type, resource_count)
        property_list_summary.append(dictionary)
I think using async functions would help a lot in speeding this up.
Your code blocks while it waits for a response from the external API, so using more processes or threads is overkill; you don't need more resources on your end. Instead, you should make your code start the next request rather than idling until the response arrives. This can be done using coroutines.
You could use aiohttp instead of requests, collect the individual tasks and execute them in an event loop.
Here is a small example that runs GET requests and collects the JSON bodies from the responses. It should be easy to adapt to your use case:
from aiohttp import ClientSession
import asyncio

RESULTS = dict()

async def get_url(url, session):
    async with session.get(url) as response:
        print("Status:", response.status)
        print("Content-type:", response.headers['content-type'])
        result = await response.json()
        RESULTS[url] = result

async def get_all_urls(urls):
    async with ClientSession() as session:
        tasks = [get_url(url, session) for url in urls]
        await asyncio.gather(*tasks)

if __name__ == "__main__":
    urls = [
        "https://accounts.google.com/.well-known/openid-configuration",
        "https://www.facebook.com/.well-known/openid-configuration/"
    ]
    asyncio.run(get_all_urls(urls=urls))
    print(RESULTS.keys())
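Adapting that pattern to the question's POST query could look roughly like the sketch below; the URL and headers are the placeholders from the question, and post_query / run_all are illustrative names:

async def post_query(session, cloud_type, api_name, cloud_account):
    payload = {
        "limit": 0,
        "query": f"config from cloud.resource where cloud.type = '{cloud_type}' "
                 f"AND api.name = '{api_name}' AND cloud.account = '{cloud_account}'",
        "withResourceJson": True,
    }
    headers = {
        "content-type": "application/json; charset=UTF-8",
        "x-redlock-auth": api_token_input,
    }
    async with session.post("xxx", json=payload, headers=headers) as response:
        return await response.json()

async def run_all(api_inputs):
    # Fire all the POST requests concurrently and return their JSON bodies in order.
    async with ClientSession() as session:
        tasks = [post_query(session, cloud_type, api_name, cloud_account)
                 for (cloud_type, api_name, cloud_account, resource_type) in api_inputs]
        return await asyncio.gather(*tasks)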

Why does Cloud Tasks give a "Deadline Exceeded" error after 60s?

I'm using google-cloud-tasks==2.2.0 with Flask and Gunicorn. This is how I send a task to a queue:
def send_task(payload, queue, uri, *args):
    url = f'https://www.mywebsite.com/{uri}'
    payload = json.dumps(payload)
    payload = payload.encode()
    parent = client.queue_path(project=project, location=location, queue=queue)
    service_account_email = 'myaccount.com'

    # Construct the request body.
    td = '1800s'
    duration = duration_pb2.Duration()
    time = duration.FromJsonString(td)

    now = datetime.utcnow() + timedelta(seconds=10)
    ts = timestamp_pb2.Timestamp()
    now = ts.FromDatetime(now)

    task = {
        'http_request': {  # Specify the type of request.
            'http_method': tasks_v2beta3.HttpMethod.POST,
            'url': url,
            'body': payload,  # Convert dictionary to string
            'headers': {  # Add custom header
                'Content-Type': 'application/json'
            },
            'oidc_token': {'service_account_email': service_account_email}
        }
    }
    task['schedule_time'] = now
    task['dispatch_deadline'] = time

    response = client.create_task(request={"parent": parent, "task": task}, timeout=30.0)
I use dispatch_deadline, which is supposed to support a 30-minute timeout, per this API reference.
But no matter what I try, my task fails after 60 seconds with a 504 DEADLINE_EXCEEDED error.
Honestly, is there something I'm missing here, or is it a bug?

Get data by pages and merge it into one using Python (pagination)

I'm connecting to an API which has a 500-row limit per call.
This is my code for a single API call (works great):
def getdata(data):
    auth_token = access_token
    hed = {'Authorization': 'Bearer ' + auth_token, 'Accept': 'application/json'}
    urlApi = 'https://..../orders?Offset=0&Limit=499'
    datar = requests.get(urlApi, data=data, headers=hed, verify=True)
    return datar
Now I want to scale it up so it will get me all the records.
This is what I tried to do:
In order to make sure that I have all the rows, I must iterate until there is no more data:
get 1st page
get 2nd page
merge
get 3rd page
merge
etc...
each page is an API call.
This is what I'm trying to do:
def getData(data):
    auth_token = access_token
    value_offset = 0
    hed = {'Authorization': 'Bearer ' + auth_token, 'Accept': 'application/json'}
    datarALL = None
    while True:
        urlApi = 'https://..../orders?Offset=' + str(value_offset) + '&Limit=499'
        responsedata = requests.get(urlApi, data=data, headers=hed, verify=True)
        if responsedata.ok:
            value_offset = value_offset + 499
            # to do: merge the result of the get request
            datarALL = datarALL + responsedata  # (?)
            # to do: check if the response is empty, then break out.
    return datarALL
I couldn't find information on how to merge the results of the API calls, nor how to check when I can break out of the loop.
Edit:
To clarify what I'm after:
I can see the results of a single API call using:
logger.debug('response is : {0}'.format(datar.json()))
What I want to be able to do:
logger.debug('response is : {0}'.format(datarALL.json()))
and it will show the combined results from all the calls. This requires generating API calls until there is no more data to get.
This is a sample of what the API call returns:
"offset": 0,
"limit": 0,
"total": 0,
"results": [
{
"field1": 0,
"field2": "string",
"field3": "string",
"field4": "string"
}
]
}
In this case, your idea is almost correct.
is_valid = True
while is_valid:
    is_valid = False
    ...
    ...
    responsedata = requests.get(urlApi, data=data, headers=hed, verify=True)
    if responsedata.status_code == 200:  # use the status code to check the request status, 200 for a successful call
        responsedata = responsedata.text
        value_offset = value_offset + 499
        # merge the result of the GET request
        jsondata = json.loads(responsedata)
        if "results" in jsondata:
            if jsondata["results"]:
                is_valid = True
        if is_valid:
            # concatenate the lists with the + operator
            datarALL = datarALL + jsondata["results"]
Since I don't know whether "results" still exists when the data runs out, I checked both levels.
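Putting the pieces together, a minimal sketch of the full loop, assuming datarALL starts as an empty list and that an empty "results" array means there is no more data:

import requests

def getData(data):
    hed = {'Authorization': 'Bearer ' + access_token, 'Accept': 'application/json'}
    value_offset = 0
    datarALL = []  # collect all rows here
    while True:
        urlApi = 'https://..../orders?Offset=' + str(value_offset) + '&Limit=499'
        responsedata = requests.get(urlApi, data=data, headers=hed, verify=True)
        if responsedata.status_code != 200:
            break
        jsondata = responsedata.json()
        results = jsondata.get("results", [])
        if not results:
            break  # no more rows to fetch
        datarALL += results  # merge this page into the combined list
        value_offset += 499
    return datarALL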
