Extract complete data from an API using Python into JSON format - python

I want to extract data using an API and store it in JSON format.
Code:
def run():
    responses = asyncio.run(fetch_all())
    return responses

output = run()
count = int(output[0]['result']['stats']['count'])
print(count)

import time

async def fetch(session, url):
    async with session.get(url) as response:
        resp = await response.json()
        return resp

async def fetch_all():
    limit = 100
    async with aiohttp.ClientSession(auth=aiohttp.BasicAuth('***', '***')) as session:
        tasks = []
        for i in range(int(count/limit) + 1):
            tasks.append(
                fetch(
                    session,
                    f"https:XYZ&sysparm_limit={limit}&sysparm_offset={i*limit}",
                )
            )
        responses = await asyncio.gather(*tasks, return_exceptions=True)
        return responses

def run():
    start = time.time()
    responses = asyncio.run(fetch_all())
    end = time.time()
    return {"responses": responses, "time_duration": end - start}

sap_prod = run()

final_output = []
for i in range(len(sap_prod['responses'])):
    final_output = final_output + sap_prod['responses'][i]['result']

json_object = json.dumps(final_output, indent=4)
Error:
TypeError: 'ClientPayloadError' object is not subscriptable
The data is huge, so my IDE is crashing. How can I dump the output into a JSON file using loops?
Also, please suggest any alternate method to get the complete data from APIs.

In fetch you can write the data to a file, producing a line-delimited JSON (.jsonl) file:
async def fetch(session, url):
    async with session.get(url) as response:
        resp = await response.json()
        with open('myfile.jsonl', 'a') as f:
            f.write(json.dumps(resp) + '\n')
That way, after all of the tasks are done, you will have a file with many JSON lines in it. To read it back you can do:
data = []
with open('myfile.jsonl') as f:
    for line in f:
        data.append(json.loads(line))
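As for the TypeError: with return_exceptions=True, asyncio.gather puts any raised exception (here a ClientPayloadError) into the results list instead of raising it, so indexing into that entry fails. A small sketch of skipping the failed entries, using the sap_prod dict from the question:

final_output = []
for item in sap_prod['responses']:
    if isinstance(item, Exception):
        # a failed request ended up in the list; log or retry it instead of indexing into it
        print("request failed:", item)
        continue
    final_output += item['result']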

Related

Python async API requests in batches

I'm trying to make async API calls this way:
func to send request:
async def get_data(client, postdata):
    res = await client.post(url=_url, headers=_headers, data=postdata)
    return res
func to parse JSON:
async def parse_res(client, postdata):
    res = await get_data(client, postdata)
    if bool(json.loads(res.text)['suggestions']):
        _oks = <...grab some JSON fields...>
    else:
        _oks = {}
    return _oks
I wrap these two funcs in MAIN():
async def main(_jobs):
    async with httpx.AsyncClient() as client:
        batch = []
        calls = []
        for job in _jobs:
            _postdata = '{ "query": "' + job + '" }'
            calls.append(asyncio.create_task(parse_res(client, _postdata)))
        batch = await asyncio.gather(*calls)
        return batch
and then just run MAIN()
But the API can only handle about 30-50 fast (nearly simultaneous) requests before it throws a 429 HTTP error.
So I need to send batches of 30 calls and process the 10,000 requests in chunks.
How do I process 10,000 (ten thousand) API calls in batches of 30?
One library that comes in handy here is funcy. It offers various helpers for working with sequences; one of them is chunks, which splits a sequence into chunks of equal size (the last chunk may be smaller if the total size does not divide evenly).
from funcy import chunks

result = []
for job_chunk in chunks(30, _jobs):
    calls = [parse_res(client, '{ "query": "' + job + '" }') for job in job_chunk]
    batch = await asyncio.gather(*calls)
    result.extend(batch)
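If you would rather not add a dependency, plain list slicing does the same chunking. A rough sketch, assuming the parse_res coroutine and the _jobs list from the question:

import asyncio
import httpx

CHUNK_SIZE = 30  # stay under the API's burst limit

async def main(_jobs):
    results = []
    async with httpx.AsyncClient() as client:
        for start in range(0, len(_jobs), CHUNK_SIZE):
            chunk = _jobs[start:start + CHUNK_SIZE]
            calls = [parse_res(client, '{ "query": "' + job + '" }') for job in chunk]
            # each batch finishes before the next 30 requests are sent
            results.extend(await asyncio.gather(*calls))
    return results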
You could use Simon Hawe's answer; however, here's a different approach that doesn't use external libraries.
Use asyncio.Semaphore to limit the number of calls made concurrently; when the semaphore is released, it lets another coroutine run.
import asyncio

sem = asyncio.Semaphore(30)  # no. of simultaneous requests

async def get_data(client, postdata):
    async with sem:
        res = await client.post(url=_url, headers=_headers, data=postdata)
        return res

async def parse_res(client, postdata):
    res = await get_data(client, postdata)
    if bool(json.loads(res.text)['suggestions']):
        _oks = <...grab some JSON fields...>
    else:
        _oks = {}
    return _oks

async def main(_jobs):
    async with httpx.AsyncClient() as client:
        calls = [
            # one task per job; the semaphore caps how many requests are in flight at once
            asyncio.create_task(parse_res(client, '{"query": "' + job + '"}'))
            for job in _jobs
        ]
        return await asyncio.gather(*calls)
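A possible way to drive it, assuming _jobs is the list of 10,000 query strings from the question:

# all tasks are created up front; the semaphore keeps at most 30 requests running at a time
results = asyncio.run(main(_jobs))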

How to parse and save a Multipart/related type=image/jpeg response? (Dicom Wado Response)

I'm making a GET request to a DICOM server, which returns a multipart/related response with type=image/jpeg. I tried using the aiohttp library's multipart feature to parse it, but it didn't work: the saved file is corrupted.
Here is my code.
import asyncio
import aiohttp

'''
async def fetch(url, session, header):
    async with session.get(url, headers=header) as response:
        await response

async def multiHit(urls, header):
    tasks = []
    async with aiohttp.ClientSession() as session:
        for i, url in enumerate(urls):
            tasks.append(fetch(url, session, header))
        result = await asyncio.gather(*tasks)
        return result

loop = asyncio.get_event_loop()
res = loop.run_until_complete(multiHit(["FRAME URL"], {"Accept": "multipart/related;type=image/jpeg"}))
print(res)
'''

async def xyz(loop):
    async with aiohttp.ClientSession(loop=loop).get(url="FRAME URL", headers={"Accept": "multipart/related;type=image/jpeg"}) as response:
        reader = aiohttp.MultipartReader.from_response(response)
        while True:
            part = await reader.next()
            if part is None:
                break
            filedata = await part.read(decode=False)
            import base64
            with open('m.jpeg', 'wb') as outFile:
                outFile.write(part.decode(filedata))
    return 1

loop = asyncio.get_event_loop()
res = loop.run_until_complete(xyz(loop))
How do I parse the multipart/related response and save the images?
I figured out that I was parsing the multipart response properly, but I had to use another library (library name: imagecodecs, method name: jpegsof3_decode) to decompress each individual part into an image. This gives a NumPy array of the image. Here is the updated code:
reader = aiohttp.MultipartReader.from_response(response)
while True:
    part = await reader.next()
    if part is None:
        break
    data = await part.read()
    imageDecompressed = jpegsof3_decode(data)
Further, the NumPy array can be converted into an image using the cv2 library:
success, encoded_image = cv2.imencode('.png', imageDecompressed)
A byte version of the converted image can be obtained this way:
imageInBytes = encoded_image.tobytes()
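To actually write the decoded frame to disk (the question's original goal), either the encoded bytes can be written directly or cv2 can handle both steps. A minimal sketch, assuming imageDecompressed and imageInBytes from the snippets above; the file names are placeholders:

import cv2

# write the already PNG-encoded bytes yourself
with open('frame.png', 'wb') as out_file:
    out_file.write(imageInBytes)

# or let OpenCV encode and write in one step
cv2.imwrite('frame_alt.png', imageDecompressed)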

Run two concurrent task groups asynchronously with asyncio

I am trying to write a program using asyncio and was oriented towards this blog post. What I am trying to do is fetch some JSON data concurrently. For one input data frame, however, I would like to process the requested data further as soon as it becomes available.
So basically there are two groups of tasks:
process data in df1 concurrently and do some calc once JSON returned
process data in df2 concurrently
They are more or less independent of each other, but I want to run the groups of tasks concurrently as well. Once both task groups are finished, I want to process them further.
My question is whether my implementation is properly designed in terms of asyncio patterns, where I just used two gather statements, or whether this is the wrong concept. Here is a sketch:
import asyncio
import aiohttp
from aiohttp import ClientSession

async def fetch_json(url: str, session: ClientSession, data: json.dumps) -> Dict:
    resp = await session.get(url=url, headers={"content-type": "application/json"}, data=data)
    resp.raise_for_status()
    logger.info("Got response [%s] for URL: %s", resp.status, url)
    json = await resp.json()
    return json

async def some_calc(url: str, session: ClientSession, data: json.dumps):
    res = await fetch_json(url=url, session=session, data=data)
    return [float(x) for x in res]

async def process_data(df: Dict, url: str, session: ClientSession):
    async with session:
        tasks = []
        for data in df:
            try:
                if df1:
                    task = some_calc(url=url, session=session, data=data)
                else:
                    task = fetch_json(url=url, session=session, data=data)
            except Exception as e:
                # ...
            tasks.append(
                task
            )
        res = await asyncio.gather(*tasks)
        return res

async def bulk_execute(df1, df2):
    url = "http://some.url/"
    async with ClientSession() as session:
        res = await asyncio.gather(process_data(df1, url, session), process_data(df2, url, session))
        return res

if __name__ == "__main__":
    res = asyncio.run(bulk_execute(df1, df2))

API calls not running asynchronously

I'm working on a Python client that will asynchronously download vinyl cover art. My problem is that I'm new to Python (especially asynchronous Python) and I don't think my code is running asynchronously. I have another client written in Node.js that is able to get approx. 40 images/sec, whereas this Python one is only managing around 1.5/sec.
import aiohttp
import asyncio
from os import path, makedirs

caa_base_url = "https://coverartarchive.org/release"
image_download_dir = path.realpath('images')
# small, large, None = Max
image_size = None

async def getImageUrls(release_mbid, session):
    async with session.get(f'{caa_base_url}/{release_mbid}') as resp:
        if resp.status == 404 or resp.status == 403:
            return
        return [release_mbid, await resp.json()]

async def getImage(url, session):
    try:
        async with session.get(url) as resp:
            return [url, await resp.read()]
    except (aiohttp.ServerDisconnectedError):
        return await getImage(url, session)

async def getMBIDs(mb_page_url):
    async with aiohttp.ClientSession() as session:
        async with session.get(mb_page_url) as resp:
            mb_json = await resp.json()
            tasks = []
            async with aiohttp.ClientSession() as caa_session:
                for release in mb_json["releases"]:
                    task = asyncio.ensure_future(getImageUrls(release["id"], caa_session))
                    tasks.append(task)
                responses = await asyncio.gather(*tasks)
                async with aiohttp.ClientSession() as caa_image_session:
                    for response in responses:
                        if response is not None:
                            caaTasks = []
                            release_mbid = response[0]
                            result = response[1]
                            for image in result["images"]:
                                if image["front"] == True:
                                    caaTask = asyncio.ensure_future(getImage(image["image"], caa_session))
                                    caaTasks.append(caaTask)
                            image_responses = await asyncio.gather(*caaTasks)
                            for image_response in image_responses:
                                image_url = image_response[0]
                                image_binary = image_response[1]
                                new_file_dir = path.join(image_download_dir, release_mbid)
                                if not path.isdir(new_file_dir):
                                    makedirs(new_file_dir)
                                file_name = image_url[image_url.rfind("/")+1:]
                                file_path = path.join(new_file_dir, file_name)
                                new_file = open(file_path, 'wb')
                                new_file.write(image_binary)

mb_base_url = "https://musicbrainz.org/ws/2/release"
num_pages = 100
releases_per_page = 100
mb_page_urls = []

async def getMBPages():
    for page_index in range(num_pages):
        await getMBIDs('%s?query=*&type=album&format=Vinyl&limit=%s&offset=%s&fmt=json' % (mb_base_url, releases_per_page, page_index*releases_per_page))
        await asyncio.sleep(1)

loop = asyncio.get_event_loop()
loop.run_until_complete(getMBPages())
P.S. The sleep is because the MusicBrainz API limits clients to 1 request/sec.
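For context, one way the 1 request/sec limit and concurrency can coexist is to start each page's work without waiting for the previous page's image downloads to finish. A rough sketch only, assuming the getMBIDs coroutine and the module-level constants above:

import asyncio

async def getMBPages():
    page_tasks = []
    for page_index in range(num_pages):
        url = '%s?query=*&type=album&format=Vinyl&limit=%s&offset=%s&fmt=json' % (
            mb_base_url, releases_per_page, page_index * releases_per_page)
        # schedule this page's fetch and downloads, but don't block on them
        page_tasks.append(asyncio.ensure_future(getMBIDs(url)))
        await asyncio.sleep(1)  # still only ~1 MusicBrainz request per second
    await asyncio.gather(*page_tasks)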

Asyncio exception handling, possible to not gather exceptions?

I have some code which makes some API calls with asyncio and aiohttp. For some URLs, asyncio will raise an exception, so I allow it to return it (with asyncio.gather(return_exceptions=True)) so it doesn't break the event loop. Is it possible to not gather the returned exceptions, so it returns only the results which worked? Or do I need to clean up the list manually afterwards?
This is the code:
import asyncio
import aiohttp
import ssl
import datetime as dt

limit = 30
start_epoch = int(dt.datetime(2018, 7, 1).timestamp())
end_epoch = int(dt.datetime.now().timestamp())
epoch_step = 40000

url_list = []
while True:
    url = "https://api.pushshift.io/reddit/search/comment/?q=" + "Nestle" + "&size=" + str(limit) + "&after=" + str(start_epoch) + "&before=" + str(start_epoch + epoch_step)
    url_list.append(url)
    start_epoch += epoch_step
    if start_epoch > end_epoch:
        break

async def fetch(session, url):
    async with session.get(url, ssl=ssl.SSLContext()) as response:
        return await response.json()

async def fetch_all(urls, loop):
    async with aiohttp.ClientSession(loop=loop) as session:
        results = await asyncio.gather(*[fetch(session, url) for url in urls], return_exceptions=True)
        return results

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    urls = url_list
    htmls = loop.run_until_complete(fetch_all(urls, loop))
    print(htmls)
and it returns a list which looks something like this:
[ContentTypeError("0, message='Attempt to decode JSON with unexpected mimetype: text/html'",), {'data': [{'author':...]
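For what it's worth, a minimal sketch of the clean-up, assuming the htmls list above: the exception objects can either be filtered out after the gather, or handled inside fetch so they never reach the result list.

# option 1: drop the exception objects after gathering
results = [r for r in htmls if not isinstance(r, Exception)]

# option 2: handle the error inside fetch, then drop the placeholder values
async def fetch(session, url):
    try:
        async with session.get(url, ssl=ssl.SSLContext()) as response:
            return await response.json()
    except aiohttp.ContentTypeError:
        return None  # later: [r for r in results if r is not None]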
