I have some code which makes API calls with asyncio and aiohttp. For some URLs, asyncio will raise an exception, so I allow it to be returned (with asyncio.gather(return_exceptions=True)) so it doesn't break the event loop. Is it possible to not gather the returned exceptions, so that only the results which worked are returned? Or do I need to clean up the list manually afterwards?
This is the code:
import asyncio
import aiohttp
import ssl
import datetime as dt

limit = 30
start_epoch = int(dt.datetime(2018,7,1).timestamp())
end_epoch = int(dt.datetime.now().timestamp())
epoch_step = 40000

url_list = []
while True:
    url = "https://api.pushshift.io/reddit/search/comment/?q=" + "Nestle" + "&size=" + str(limit) + "&after=" + str(start_epoch) + "&before=" + str(start_epoch + epoch_step)
    url_list.append(url)
    start_epoch += epoch_step
    if start_epoch > end_epoch:
        break

async def fetch(session, url):
    async with session.get(url, ssl=ssl.SSLContext()) as response:
        return await response.json()

async def fetch_all(urls, loop):
    async with aiohttp.ClientSession(loop=loop) as session:
        results = await asyncio.gather(*[fetch(session, url) for url in urls], return_exceptions=True)
        return results

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    urls = url_list
    htmls = loop.run_until_complete(fetch_all(urls, loop))
    print(htmls)
and it returns a list which looks something like this:
[ContentTypeError("0, message='Attempt to decode JSON with unexpected mimetype: text/html'",), {'data': [{'author':...]
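For reference, a minimal sketch of the manual clean-up option (reusing the fetch coroutine from above): because return_exceptions=True makes asyncio.gather put the exception objects into the result list, they can be filtered out with isinstance afterwards.

async def fetch_all(urls, loop):
    async with aiohttp.ClientSession(loop=loop) as session:
        results = await asyncio.gather(
            *[fetch(session, url) for url in urls], return_exceptions=True
        )
        # Keep only the responses that did not raise; the exception objects are dropped.
        return [r for r in results if not isinstance(r, Exception)]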
Related
I have a block of code that works well for fetching data from API requests to a specific site. The issue is that the site only gives me a limit of 50 objects per call, so I have to make multiple calls. As a result, it takes too long to finish the fetching (sometimes I have to wait nearly 20 minutes). Here is my code:
import concurrent.futures
import requests

supply = 3000
offset = 0
token_ids = []

while offset < supply:
    url = "url_1" + str(offset)
    response = requests.request("GET", url)
    a = response.json()
    assets = a["assets"]

    def get_token_ids(an):
        if str(an['sell_orders']) == 'None' and str(an['last_sale']) == 'None' and str(an['num_sales']) == '0':
            token_ids.append(str(an['token_id']))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = [executor.submit(get_token_ids, asset) for asset in assets]

    offset += 50

print(token_ids)
The problem is that the code runs through and waits for all actions to finish before making another request. I am thinking of an improvement where, once a request is sent, the offset value gets incremented and the loop proceeds to the next request, so I don't have to wait. I don't know how to do it; I studied 'asyncio', but it is still a challenge for me. Can anyone help me with this?
The problem is that Requests is not asynchronous code, so each of its network calls blocks the loop until its completion.
https://docs.python-requests.org/en/latest/user/advanced/#blocking-or-non-blocking
Therefore, it is better to try asynchronous libraries, for example, aiohttp:
https://github.com/aio-libs/aiohttp
Example
Create session for all connections:
async with aiohttp.ClientSession() as session:
and run all desired requests:
results = await asyncio.gather(
    *[get_data(session, offset) for offset in range(0, supply, step)]
)
Here, the requests are executed asynchronously: session.get(url) obtains only the response headers, and the body is then read with await response.json():
async with session.get(url) as response:
    a = await response.json()
And in the main block the event loop is started:
loop = asyncio.get_event_loop()
token_ids = loop.run_until_complete(main())
loop.close()
The full code
import aiohttp
import asyncio

async def get_data(session, offset):
    token_ids = []
    url = "url_1" + str(offset)
    async with session.get(url) as response:
        # For tests:
        # print("Status:", response.status)
        # print("Content-type:", response.headers['content-type'])
        a = await response.json()
        assets = a["assets"]
        for asset in assets:
            if str(asset['sell_orders']) == 'None' and str(asset['last_sale']) == 'None' and str(asset['num_sales']) == '0':
                token_ids.append(str(asset['token_id']))
    return token_ids

async def main():
    supply = 3000
    step = 50
    token_ids = []
    # Create session for all connections and pass it to "get" function
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(
            *[get_data(session, offset) for offset in range(0, supply, step)]
        )
        for ids in results:
            token_ids.extend(ids)
    return token_ids

if __name__ == "__main__":
    # asynchronous code start here
    loop = asyncio.get_event_loop()
    token_ids = loop.run_until_complete(main())
    loop.close()
    # asynchronous code end here
    print(token_ids)
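Side note: on Python 3.7+ the explicit loop management can also be written with asyncio.run, which creates and closes the event loop for you.

if __name__ == "__main__":
    # Equivalent entry point on Python 3.7+: asyncio.run manages the loop itself.
    token_ids = asyncio.run(main())
    print(token_ids)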
I need to check several hundred proxy servers and count how many are not working. My script for this:
import urllib.request
import socket

net = ['http://192.168.1.1:8080',
       'http://192.168.1.2:8080',
       'http://192.168.1.3:8080',
       'http://192.168.1.4:8080',
       'http://192.168.1.5:8080',
       'http://192.168.1.6:8080',
       'http://192.168.1.7:8080',
       'http://192.168.1.8:8080',
       'http://192.168.1.9:8080',
       'http://192.168.1.10:8080']

fail = 0
socket.setdefaulttimeout(3)
for x in net:
    try:
        print(x)
        proxy = urllib.request.ProxyHandler({'http': (x)})
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve('http://google.com')
    except IOError:
        print("Connection error")
        fail += 1
print(fail)
The proxies are in a list; I have given a simplified version here.
It takes 55 seconds to check 250 working proxies. I can't wait that long and need to increase the execution speed.
How can this be done using async?
This should give you an idea of how to approach it. You will have to wrap the various connection blocks in try/except yourself.
NOTE: This code is not tested as I do not have any way of doing so.
import asyncio, aiohttp

def returnPartionedList(inputlist, x=100):
    # Returns: Original list split into segments of x.
    return([inputlist[i:i + x] for i in range(0, len(inputlist), x)])

async def TestProxy(url, proxy, session):
    async with session.get(url, proxy=proxy, timeout=3) as response:
        if response.status == 200:
            _ = await response.text()
            return(proxy)

async def TestProxies(listofproxies):
    returnResults = []
    url = "https://google.com"  # Test proxy with this url
    ProxyPartitions = returnPartionedList(listofproxies, 20)  # Rate limit 20 per second
    for partition in ProxyPartitions:
        ProxyTasks = []
        async with aiohttp.ClientSession() as session:
            for proxy in partition:
                ProxyTasks.append(asyncio.create_task(TestProxy(url, proxy, session)))
            results = await asyncio.gather(*ProxyTasks, return_exceptions=False)
            if results:
                for result in results:
                    if result:
                        returnResults.append(result)
        await asyncio.sleep(1)
    return(returnResults)

async def main():
    listofproxies = [
        'http://10.10.1.1:8080',
        'http://10.10.1.2:8080',
        'http://10.10.1.3:8080',
        'http://10.10.1.4:8080',
        'http://10.10.1.5:8080',
        'http://10.10.1.6:8080',
        'http://10.10.1.7:8080',
        'http://10.10.1.8:8080',
        'http://10.10.1.9:8080',
        'http://10.10.1.10:8080'
    ]
    test_proxies = await TestProxies(listofproxies)
    print(test_proxies)

if __name__ == "__main__":
    asyncio.run(main())
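To illustrate the "wrap the various connection blocks in try/except yourself" remark, one possible (untested) variant of TestProxy could catch the usual aiohttp and timeout errors, so that a single bad proxy does not abort the whole gather() call:

async def TestProxy(url, proxy, session):
    # Sketch only: swallow connection failures and timeouts for this proxy.
    try:
        async with session.get(url, proxy=proxy, timeout=3) as response:
            if response.status == 200:
                _ = await response.text()
                return proxy
    except (aiohttp.ClientError, asyncio.TimeoutError):
        return None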
I am trying to use async and await. I am still new to it and cannot figure out what I am doing wrong.
import requests
import bs4
import colorama
from colorama import Fore
import time
import datetime
import asyncio

async def get_html(episode_number: int) -> str:
    print(Fore.YELLOW + f"Getting HTML for episode {episode_number}", flush=True)
    url = f'https://talkpython.fm/{episode_number}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            resp.raise_for_status()
            return await resp.text
            await resp.raise_for_status()
            # return await resp.text

def get_title(html: str, episode_number: int) -> str:
    print(colorama.Fore.CYAN + f"Getting TITLE for episode {episode_number}", flush=True)
    soup = bs4.BeautifulSoup(html, 'html.parser')
    header = soup.select_one('h1')
    if not header:
        return "MISSING"
    return header.text.strip()

def main():
    t0 = datetime.datetime.now()
    print(colorama.Fore.WHITE + ' App started.', flush=True)
    loop = asyncio.get_event_loop()
    final_task = asyncio.gather(loop)
    #get_title_range()
    dt = datetime.datetime.now() - t0
    loop.run_until_complete(final_task)
    print(colorama.Fore.CYAN + "Done. " + ' App exiting total time: {:,.2f} sec.'.format(dt.total_seconds()), flush=True)

def get_title_range():
    for n in range(150, 170):
        html = get_html(n)
        title = get_title(html, n)
        print(Fore.CYAN + f"Title found: {title}", flush=True)

if __name__ == '__main__':
    main()
It looks like you're not initializing tasks for your event loop to run on. I typically follow this pattern:
async def main():
    headers = {'Connection': 'keep-alive', 'Content-Type': 'application/json', 'Authorization': auth}
    url = 'some-api.com/post-request-something'
    # We use a session to take advantage of tcp keep-alive
    timeout = aiohttp.ClientTimeout(total=10000)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        tasks = [async_wrap(session, q, url, headers) for q in queue]
        # gather literally 'gathers' all the tasks and schedules them in the event loop
        await asyncio.gather(*tasks, return_exceptions=True)

if __name__ == '__main__':
    ts = time()
    # Create the asyncio event loop - from the main function
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        # Lets avoid an unclosed loop running a DDoS attack on ourselves
        loop.close()
    logger.info('Took %s seconds to complete', time() - ts)
Note the line that takes the gathered tasks and schedules them as coroutines in the main event loop:
loop.run_until_complete(main())
and then this, which calls my function async_wrap() for each record I wanted to send with the HTTP client (stored in a list); in your case it would call your asynchronous function get_html() with each record from get_title_range():
tasks = [async_wrap(session, q, url, headers) for q in queue] # -> mine
await asyncio.gather(*tasks, return_exceptions=True) # -> gather those tasks!
tasks = [get_html(episode_number=episode) for episode in list_of_episode_nums] # -> yours
await asyncio.gather(*tasks, return_exceptions=True) # -> gather those tasks!
Hope this helps you shore up some details, but unfortunately asynchronous code can be quite a headache, requiring lots of trial and error.
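To make that concrete for the code in the question, a rough sketch (untested, and assuming aiohttp is imported and resp.text() is awaited as a call in get_html) of an async get_title_range and entry point might look like this:

async def get_title_range():
    # Build one coroutine per episode and run them concurrently.
    episodes = range(150, 170)
    tasks = [get_html(episode_number=n) for n in episodes]
    htmls = await asyncio.gather(*tasks, return_exceptions=True)
    for n, html in zip(episodes, htmls):
        if isinstance(html, Exception):
            continue  # skip episodes whose download failed
        print(Fore.CYAN + f"Title found: {get_title(html, n)}", flush=True)

if __name__ == '__main__':
    asyncio.run(get_title_range())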
I want to extract data using an API and store it in JSON format.
Code:
def run():
    responses = asyncio.run(fetch_all())
    return responses

output = run()
count = int(output[0]['result']['stats']['count'])
print(count)

import time

async def fetch(session, url):
    async with session.get(url) as response:
        resp = await response.json()
        return resp

async def fetch_all():
    limit = 100
    async with aiohttp.ClientSession(auth=aiohttp.BasicAuth('***','***')) as session:
        tasks = []
        for i in range(int(count/limit) + 1):
            tasks.append(
                fetch(
                    session,
                    f"https:XYZ&sysparm_limit={limit}&sysparm_offset={i*limit}",
                )
            )
        responses = await asyncio.gather(*tasks, return_exceptions=True)
        return responses

def run():
    start = time.time()
    responses = asyncio.run(fetch_all())
    end = time.time()
    return {"responses": responses, "time_duration": end - start}

sap_prod = run()

final_output = []
for i in range(len(sap_prod['responses'])):
    final_output = final_output + sap_prod['responses'][i]['result']

json_object = json.dumps(final_output, indent=4)
Error:
TypeError: 'ClientPayloadError' object is not subscriptable
The data is huge, so my IDE is crashing. How can I dump the output into JSON using loops?
Please also suggest any alternative method to get the complete data from the API.
In fetch you can write the data to a file, producing a line-delimited JSON file:
import json

async def fetch(session, url):
    async with session.get(url) as response:
        resp = await response.json()
        # Append each response as one JSON line.
        with open('myfile.jsonl', 'a') as f:
            f.write(json.dumps(resp) + '\n')
This way, after all of the tasks are done, you will have a file with many JSON lines in it. To read it back you can do:
data = []
with open('myfile.jsonl') as f:
    for line in f:
        data.append(json.loads(line))
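As for the TypeError itself: because fetch_all uses return_exceptions=True, the gathered list can contain exception objects (such as the ClientPayloadError), and indexing those with ['result'] fails. A small sketch of skipping such entries when building final_output:

final_output = []
for resp in sap_prod['responses']:
    # Skip entries where gather() returned an exception instead of a JSON dict.
    if isinstance(resp, Exception):
        continue
    final_output.extend(resp['result'])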
I'm working on a Python client that will asynchronously download vinyl cover art. My problem is that I'm new to Python (especially asynchronous Python) and I don't think my code is actually running asynchronously. I have another client written in Node.js that is able to get approx. 40 images/sec, whereas this Python one only manages around 1.5/sec.
import aiohttp
import asyncio
from os import path, makedirs

caa_base_url = "https://coverartarchive.org/release"
image_download_dir = path.realpath('images')
# small, large, None = Max
image_size = None

async def getImageUrls(release_mbid, session):
    async with session.get(f'{caa_base_url}/{release_mbid}') as resp:
        if resp.status == 404 or resp.status == 403:
            return
        return [release_mbid, await resp.json()]

async def getImage(url, session):
    try:
        async with session.get(url) as resp:
            return [url, await resp.read()]
    except (aiohttp.ServerDisconnectedError):
        return await getImage(url, session)

async def getMBIDs(mb_page_url):
    async with aiohttp.ClientSession() as session:
        async with session.get(mb_page_url) as resp:
            mb_json = await resp.json()
            tasks = []
            async with aiohttp.ClientSession() as caa_session:
                for release in mb_json["releases"]:
                    task = asyncio.ensure_future(getImageUrls(release["id"], caa_session))
                    tasks.append(task)
                responses = await asyncio.gather(*tasks)
                async with aiohttp.ClientSession() as caa_image_session:
                    for response in responses:
                        if response is not None:
                            caaTasks = []
                            release_mbid = response[0]
                            result = response[1]
                            for image in result["images"]:
                                if image["front"] == True:
                                    caaTask = asyncio.ensure_future(getImage(image["image"], caa_session))
                                    caaTasks.append(caaTask)
                            image_responses = await asyncio.gather(*caaTasks)
                            for image_response in image_responses:
                                image_url = image_response[0]
                                image_binary = image_response[1]
                                new_file_dir = path.join(image_download_dir, release_mbid)
                                if not path.isdir(new_file_dir):
                                    makedirs(new_file_dir)
                                file_name = image_url[image_url.rfind("/")+1:]
                                file_path = path.join(new_file_dir, file_name)
                                new_file = open(file_path, 'wb')
                                new_file.write(image_binary)

mb_base_url = "https://musicbrainz.org/ws/2/release"
num_pages = 100
releases_per_page = 100
mb_page_urls = []

async def getMBPages():
    for page_index in range(num_pages):
        await getMBIDs('%s?query=*&type=album&format=Vinyl&limit=%s&offset=%s&fmt=json' % (mb_base_url, releases_per_page, page_index*releases_per_page))
        await asyncio.sleep(1)

loop = asyncio.get_event_loop()
loop.run_until_complete(getMBPages())
P.S. The sleep is because musicbrainz api limits to 1 request/sec
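For what it's worth, the 1 request/sec pacing only needs to throttle the MusicBrainz calls themselves; a rough sketch (under that assumption) schedules each page as a task and only sleeps between submissions, so the image downloads of one page can overlap with the next page fetch:

async def getMBPages():
    page_tasks = []
    for page_index in range(num_pages):
        url = '%s?query=*&type=album&format=Vinyl&limit=%s&offset=%s&fmt=json' % (
            mb_base_url, releases_per_page, page_index * releases_per_page)
        # Schedule the page without awaiting it here, so only the MusicBrainz
        # requests are paced at roughly one per second.
        page_tasks.append(asyncio.ensure_future(getMBIDs(url)))
        await asyncio.sleep(1)
    await asyncio.gather(*page_tasks)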