Is it possible to have multiple loops with asyncio? If the answer is yes, how can I do that?
My use case is:
* I extract URLs from a list of websites asynchronously
* For each "sub-URL list", I crawl those URLs asynchronously
Example to extract urls:
import asyncio
import aiohttp
from suburls import extractsuburls

@asyncio.coroutine
def extracturls(url):
    subtasks = []
    response = yield from aiohttp.request('GET', url)
    suburl_list = yield from response.text()
    for suburl in suburl_list:
        subtasks.append(asyncio.Task(extractsuburls(suburl)))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*subtasks))

if __name__ == '__main__':
    urls_list = ['http://example1.com', 'http://example2.com']
    subtasks = []
    for url in urls_list:
        subtasks.append(asyncio.Task(extracturls(url)))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*subtasks))
    loop.close()
If I execute this code, I get an error when Python tries to launch the second loop, which says that a loop is already running.
P.S.: my module "extractsuburls" uses aiohttp to perform web requests.
EDIT:
Well, I've tried this solution:
import asyncio
import aiohttp
from suburls import extractsuburls

@asyncio.coroutine
def extracturls(url):
    subtasks = []
    response = yield from aiohttp.request('GET', url)
    suburl_list = yield from response.text()
    jobs_loop = asyncio.new_event_loop()
    for suburl in suburl_list:
        subtasks.append(asyncio.Task(extractsuburls(suburl)))
    asyncio.set_event_loop(jobs_loop)
    jobs_loop.run_until_complete(asyncio.gather(*subtasks))
    jobs_loop.close()

if __name__ == '__main__':
    urls_list = ['http://example1.com', 'http://example2.com']
    subtasks = []
    for url in urls_list:
        subtasks.append(asyncio.Task(extracturls(url)))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*subtasks))
    loop.close()
But I get this error: loop argument must agree with Future
Any idea?
You don't need several event loops; just use yield from asyncio.gather(*subtasks) inside the extracturls() coroutine:
import asyncio
import aiohttp
from suburls import extractsuburls

@asyncio.coroutine
def extracturls(url):
    subtasks = []
    response = yield from aiohttp.request('GET', url)
    suburl_list = yield from response.text()
    for suburl in suburl_list:
        subtasks.append(extractsuburls(suburl))
    yield from asyncio.gather(*subtasks)

if __name__ == '__main__':
    urls_list = ['http://example1.com', 'http://example2.com']
    subtasks = []
    for url in urls_list:
        subtasks.append(extracturls(url))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*subtasks))
    loop.close()
As a result, extracturls() waits until all of its subtasks have finished.
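The same pattern also works with the modern async/await syntax (the generator-based @asyncio.coroutine style was removed in Python 3.11). A minimal sketch, assuming extractsuburls is itself a coroutine function and using the current aiohttp ClientSession API:

import asyncio
import aiohttp
from suburls import extractsuburls  # assumed to be an async coroutine function

async def extracturls(session, url):
    # Fetch the page, then crawl every sub-URL concurrently.
    async with session.get(url) as response:
        suburl_list = await response.text()
    await asyncio.gather(*(extractsuburls(suburl) for suburl in suburl_list))

async def main(urls):
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(extracturls(session, url) for url in urls))

if __name__ == '__main__':
    asyncio.run(main(['http://example1.com', 'http://example2.com']))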
Related
I have a block of code that works well for fetching data from API requests to a specific site. The issue is that the site only gives me 50 objects per call, so I have to make multiple calls. As a result, the fetching takes too long to finish (sometimes I have to wait nearly 20 minutes). Here is my code:
import concurrent.futures
import requests

supply = 3000
offset = 0
token_ids = []

while offset < supply:
    url = "url_1" + str(offset)
    response = requests.request("GET", url)
    a = response.json()
    assets = a["assets"]

    def get_token_ids(an):
        if str(an['sell_orders']) == 'None' and str(an['last_sale']) == 'None' and str(an['num_sales']) == '0':
            token_ids.append(str(an['token_id']))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = [executor.submit(get_token_ids, asset) for asset in assets]

    offset += 50

print(token_ids)
The problem is that the code runs through and waits for each request to finish before making another one. The improvement I have in mind is that when a request is sent, the offset value gets incremented and the loop moves on to the next request, so I don't have to wait. I don't know how to do it; I studied asyncio, but it is still a challenge for me. Can anyone help me with this?
The problem is that Requests is not asynchronous, so each of its network calls blocks until it completes.
https://docs.python-requests.org/en/latest/user/advanced/#blocking-or-non-blocking
Therefore, it is better to try asynchronous libraries, for example, aiohttp:
https://github.com/aio-libs/aiohttp
Example
Create session for all connections:
async with aiohttp.ClientSession() as session:
and run all desired requests:
results = await asyncio.gather(
    *[get_data(session, offset) for offset in range(0, supply, step)]
)
Here the requests are executed concurrently: session.get(url) receives only the response headers, and the body is read with await response.json():
async with session.get(url) as response:
    a = await response.json()
And in the main block the event loop is started:
loop = asyncio.get_event_loop()
token_ids = loop.run_until_complete(main())
loop.close()
The full code
import aiohttp
import asyncio


async def get_data(session, offset):
    token_ids = []
    url = "url_1" + str(offset)
    async with session.get(url) as response:
        # For tests:
        # print("Status:", response.status)
        # print("Content-type:", response.headers['content-type'])
        a = await response.json()
        assets = a["assets"]
        for asset in assets:
            if str(asset['sell_orders']) == 'None' and str(asset['last_sale']) == 'None' and str(asset['num_sales']) == '0':
                token_ids.append(str(asset['token_id']))
    return token_ids


async def main():
    supply = 3000
    step = 50
    token_ids = []
    # Create session for all connections and pass it to "get_data"
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(
            *[get_data(session, offset) for offset in range(0, supply, step)]
        )
        for ids in results:
            token_ids.extend(ids)
    return token_ids


if __name__ == "__main__":
    # asynchronous code starts here
    loop = asyncio.get_event_loop()
    token_ids = loop.run_until_complete(main())
    loop.close()
    # asynchronous code ends here
    print(token_ids)
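Note: on Python 3.7+ the manual loop handling at the bottom can be replaced with asyncio.run(), which creates and closes the event loop for you. A minimal sketch of just that part:

if __name__ == "__main__":
    # asyncio.run() creates the event loop, runs main(), and closes the loop
    token_ids = asyncio.run(main())
    print(token_ids)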
I am trying to use async and await. I am still new to it, and I cannot figure out what I am doing wrong:
import requests
import bs4
import colorama
from colorama import Fore
import time
import datetime
import asyncio


async def get_html(episode_number: int) -> str:
    print(Fore.YELLOW + f"Getting HTML for episode {episode_number}", flush=True)
    url = f'https://talkpython.fm/{episode_number}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            resp.raise_for_status()
            return await resp.text
            await resp.raise_for_status()
            # return await resp.text


def get_title(html: str, episode_number: int) -> str:
    print(colorama.Fore.CYAN + f"Getting TITLE for episode {episode_number}", flush=True)
    soup = bs4.BeautifulSoup(html, 'html.parser')
    header = soup.select_one('h1')
    if not header:
        return "MISSING"
    return header.text.strip()


def main():
    t0 = datetime.datetime.now()
    print(colorama.Fore.WHITE + ' App started.', flush=True)
    loop = asyncio.get_event_loop()
    final_task = asyncio.gather(loop)
    # get_title_range()
    dt = datetime.datetime.now() - t0
    loop.run_until_complete(final_task)
    print(colorama.Fore.CYAN + "Done. " + ' App exiting total time: {:,.2f} sec.'.format(dt.total_seconds()), flush=True)


def get_title_range():
    for n in range(150, 170):
        html = get_html(n)
        title = get_title(html, n)
        print(Fore.CYAN + f"Title found: {title}", flush=True)


if __name__ == '__main__':
    main()
It looks like you're not initializing tasks for your event loop to run on. I typically follow this pattern:
async def main():
    headers = {'Connection': 'keep-alive', 'Content-Type': 'application/json', 'Authorization': auth}
    url = 'some-api.com/post-request-something'

    # We use a session to take advantage of tcp keep-alive
    timeout = aiohttp.ClientTimeout(total=10000)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        tasks = [async_wrap(session, q, url, headers) for q in queue]

        # gather literally 'gathers' all the tasks and schedules them in the event loop
        await asyncio.gather(*tasks, return_exceptions=True)


if __name__ == '__main__':
    ts = time()
    # Create the asyncio event loop - from the main function
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        # Lets avoid an unclosed loop running a DDoS attack on ourselves
        loop.close()
        logger.info('Took %s seconds to complete', time() - ts)
Note the line below, which takes the tasks gathered above and schedules them as coroutines in the main event loop:
loop.run_until_complete(main())
and then this, which calls my function async_wrap() for each record I wanted to send through the HTTP client (stored in a list). In your case it would call your asynchronous function get_html() with each record from get_title_range():
tasks = [async_wrap(session, q, url, headers) for q in queue] # -> mine
await asyncio.gather(*tasks, return_exceptions=True) # -> gather those tasks!
tasks = [get_html(episode_number=episode) for episode in list_of_episode_nums] # -> yours
await asyncio.gather(*tasks, return_exceptions=True) # -> gather those tasks!
Hope this helps you shore up some details, but unfortunately asynchronous code can be quite a headache, requiring lots of trial and error.
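For completeness, a minimal sketch (one possible arrangement, not the only one) of how get_title_range() and main() from the question could be rewritten around that pattern, reusing the question's get_html() and get_title(). Note that the question's get_html() also needs import aiohttp and should return await resp.text(), since ClientResponse.text is a coroutine method:

async def get_title_range():
    # One coroutine per episode, all downloaded concurrently.
    episode_numbers = list(range(150, 170))
    tasks = [get_html(n) for n in episode_numbers]
    htmls = await asyncio.gather(*tasks, return_exceptions=True)
    for n, html in zip(episode_numbers, htmls):
        if isinstance(html, Exception):
            continue  # skip episodes that failed to download
        print(Fore.CYAN + f"Title found: {get_title(html, n)}", flush=True)


def main():
    t0 = datetime.datetime.now()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(get_title_range())
    dt = datetime.datetime.now() - t0
    print("Done. App exiting, total time: {:,.2f} sec.".format(dt.total_seconds()), flush=True)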
I want to use the ProxyBroker lib in my Python program to generate a list/queue of 10 working proxies.
Unfortunately, I was not able to find anything similar on the examples page of the lib.
This is what I have right now, but it feels like I'm using asyncio the wrong way to complete my task, especially the gather() call in combination with collect(proxies).
def get_proxies(self, limit=10):
    async def collect(proxies):
        p = []
        while True:
            proxy = await proxies.get()
            if proxy is None:
                break
            p.append(proxy)
        return p

    proxies = asyncio.Queue()
    broker = Broker(proxies)
    tasks = asyncio.gather(
        broker.find(types=['HTTP', 'HTTPS'], limit=10),
        collect(proxies))
    loop = asyncio.get_event_loop()
    proxy_list = loop.run_until_complete(tasks)
    loop.close()
    return proxy_list
What would be the preferred/correct way of generating the proxy list?
You can do it like this:
"""Find and show 10 working HTTP(S) proxies."""
import asyncio
from proxybroker import Broker
async def show(proxies):
while True:
proxy = await proxies.get()
if proxy is None: break
print('Found proxy: %s' % proxy)
proxies = asyncio.Queue()
broker = Broker(proxies)
tasks = asyncio.gather(
broker.find(types=['HTTP', 'HTTPS'], limit=10),
show(proxies))
loop = asyncio.get_event_loop()
loop.run_until_complete(tasks)
Or, if you want to write the proxies to a file:
import asyncio
from proxybroker import Broker


async def save(proxies, filename):
    """Save proxies to a file."""
    with open(filename, 'w') as f:
        while True:
            proxy = await proxies.get()
            if proxy is None:
                break
            proto = 'https' if 'HTTPS' in proxy.types else 'http'
            row = '%s://%s:%d\n' % (proto, proxy.host, proxy.port)
            f.write(row)


def main():
    proxies = asyncio.Queue()
    broker = Broker(proxies)
    tasks = asyncio.gather(broker.find(types=['HTTP', 'HTTPS'], limit=10),
                           save(proxies, filename='proxies.txt'))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(tasks)


if __name__ == '__main__':
    main()
That's all!
Good luck!
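If, as in the question, you want the proxies back as a Python list rather than printed or written to a file, the collect() coroutine from the question already works; asyncio.gather() returns one result per awaitable, so the collected list is the second element. A minimal sketch along those lines:

def get_proxies(self, limit=10):
    async def collect(proxies):
        p = []
        while True:
            proxy = await proxies.get()
            if proxy is None:  # Broker signals completion with None
                break
            p.append(proxy)
        return p

    proxies = asyncio.Queue()
    broker = Broker(proxies)
    tasks = asyncio.gather(
        broker.find(types=['HTTP', 'HTTPS'], limit=limit),
        collect(proxies))
    loop = asyncio.get_event_loop()
    # gather() returns [find_result, collected_list]; keep only the list
    _, proxy_list = loop.run_until_complete(tasks)
    return proxy_list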
How do I add parameters like verify and proxies to the requests.get call below?
In a non-async setting I would just do requests.get(url, proxies='some_proxy', verify=False), but I don't know how to pass those arguments in the code below.
import asyncio
import concurrent.futures
import requests

ids = [2048854772, 2042055933, 2036234693, 2007740886, 2006259847, 2003100744]
token = '111111'
max_workers = len(ids)


async def main():
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(
                executor,
                requests.get,
                'https://www.strava.com/api/v3/activities/{id}?include_all_efforts=true&access_token={token}'.format(id=id, token=token)
            )
            for id in ids
        ]
        for response in await asyncio.gather(*futures):
            print(response.text)
            pass


loop = asyncio.get_event_loop()
loop.run_until_complete(main())
You can use a partial:
from functools import partial

def sum(a, b):
    return a + b

sum_with_two = partial(sum, 2)
sum_with_two(5)
>>> 7

sum_two_and_four = partial(sum, 2, 4)
sum_two_and_four()
>>> 6
In your case:
my_request = partial(requests.get, proxies='...', verify=False)

loop.run_in_executor(
    executor,
    my_request,  # Arguments of the partial will be used
    '...url...'
)
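Putting it together with the loop from the question (the proxy URL below is a placeholder; per the Requests API, proxies should be a dict mapping scheme to proxy URL rather than a bare string):

from functools import partial

# Placeholder proxy settings; replace with your real proxy
get_with_opts = partial(requests.get,
                        proxies={'https': 'http://myproxy:8080'},
                        verify=False)

futures = [
    loop.run_in_executor(
        executor,
        get_with_opts,
        'https://www.strava.com/api/v3/activities/{id}?include_all_efforts=true&access_token={token}'.format(id=id, token=token)
    )
    for id in ids
]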
I have some code which makes API calls with asyncio and aiohttp. For some URLs, asyncio will raise an exception, so I allow it to be returned (with asyncio.gather(return_exceptions=True)) so it doesn't break the event loop. Is it possible to not gather the returned exceptions, so that it returns only the results that worked? Or do I need to clean up the list manually afterwards?
This is the code:
import asyncio
import aiohttp
import ssl
import datetime as dt

limit = 30
start_epoch = int(dt.datetime(2018, 7, 1).timestamp())
end_epoch = int(dt.datetime.now().timestamp())
epoch_step = 40000

url_list = []
while True:
    url = "https://api.pushshift.io/reddit/search/comment/?q=" + "Nestle" + "&size=" + str(limit) + "&after=" + str(start_epoch) + "&before=" + str(start_epoch + epoch_step)
    url_list.append(url)
    start_epoch += epoch_step
    if start_epoch > end_epoch:
        break


async def fetch(session, url):
    async with session.get(url, ssl=ssl.SSLContext()) as response:
        return await response.json()


async def fetch_all(urls, loop):
    async with aiohttp.ClientSession(loop=loop) as session:
        results = await asyncio.gather(*[fetch(session, url) for url in urls], return_exceptions=True)
        return results


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    urls = url_list
    htmls = loop.run_until_complete(fetch_all(urls, loop))
    print(htmls)
and it returns a list which looks something like this:
[ContentTypeError("0, message='Attempt to decode JSON with unexpected mimetype: text/html'",), {'data': [{'author':...]
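There is no gather() flag that drops the failed results for you (return_exceptions=True only controls whether exceptions are returned or raised), so the clean-up is manual, but it is a one-liner. A minimal sketch, filtering after fetch_all() returns:

htmls = loop.run_until_complete(fetch_all(urls, loop))
# Keep only the successful responses; with return_exceptions=True the
# exceptions come back as ordinary objects in the results list.
successful = [result for result in htmls if not isinstance(result, BaseException)]
print(successful)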