I need to check several hundred proxy servers and get the number that are not working. My script for this:
import urllib.request
import socket

net = ['http://192.168.1.1:8080',
       'http://192.168.1.2:8080',
       'http://192.168.1.3:8080',
       'http://192.168.1.4:8080',
       'http://192.168.1.5:8080',
       'http://192.168.1.6:8080',
       'http://192.168.1.7:8080',
       'http://192.168.1.8:8080',
       'http://192.168.1.9:8080',
       'http://192.168.1.10:8080']

fail = 0
socket.setdefaulttimeout(3)
for x in net:
    try:
        print(x)
        proxy = urllib.request.ProxyHandler({'http': x})
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve('http://google.com')
    except IOError:
        print("Connection error")
        fail += 1
print(fail)
The proxies are in a list; I have shown a simplified version here.
It takes 55 seconds to check 250 working proxies. That is too long for me, so I need to increase the execution speed.
How can this be done using async?
This should give you an idea of how to approach it. You will have to wrap the various connection blocks in try/except yourself.
NOTE: This code is not tested as I do not have any way of doing so.
import asyncio, aiohttp

def returnPartionedList(inputlist, x=100):
    # Returns: original list split into segments of x.
    return [inputlist[i:i + x] for i in range(0, len(inputlist), x)]

async def TestProxy(url, proxy, session):
    async with session.get(url, proxy=proxy, timeout=3) as response:
        if response.status == 200:
            _ = await response.text()
            return proxy

async def TestProxies(listofproxies):
    returnResults = []
    url = "https://google.com"  # Test proxy with this url
    ProxyPartitions = returnPartionedList(listofproxies, 20)  # Rate limit 20 per second
    for partition in ProxyPartitions:
        ProxyTasks = []
        async with aiohttp.ClientSession() as session:
            for proxy in partition:
                ProxyTasks.append(asyncio.create_task(TestProxy(url, proxy, session)))
            results = await asyncio.gather(*ProxyTasks, return_exceptions=False)
            if results:
                for result in results:
                    if result:
                        returnResults.append(result)
        await asyncio.sleep(1)
    return returnResults

async def main():
    listofproxies = [
        'http://10.10.1.1:8080',
        'http://10.10.1.2:8080',
        'http://10.10.1.3:8080',
        'http://10.10.1.4:8080',
        'http://10.10.1.5:8080',
        'http://10.10.1.6:8080',
        'http://10.10.1.7:8080',
        'http://10.10.1.8:8080',
        'http://10.10.1.9:8080',
        'http://10.10.1.10:8080'
    ]
    test_proxies = await TestProxies(listofproxies)
    print(test_proxies)

if __name__ == "__main__":
    asyncio.run(main())
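Since the question asks for the number of proxies that are not working, note that with this approach it can be derived from the list of working proxies that TestProxies returns, for example:

# inside main(), after TestProxies() has returned the working proxies:
test_proxies = await TestProxies(listofproxies)
failed = len(listofproxies) - len(test_proxies)  # proxies that did not return a result
print(failed)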
I have a block of code that works well for fetching data via API requests from a specific site. The issue is that the site only gives me a limit of 50 objects per call, so I have to make multiple calls. As a result, the fetching takes too long to finish (sometimes I have to wait nearly 20 minutes). Here is my code:
import concurrent.futures
import requests

supply = 3000
offset = 0
token_ids = []

while offset < supply:
    url = "url_1" + str(offset)
    response = requests.request("GET", url)
    a = response.json()
    assets = a["assets"]

    def get_token_ids(an):
        if str(an['sell_orders']) == 'None' and str(an['last_sale']) == 'None' and str(an['num_sales']) == '0':
            token_ids.append(str(an['token_id']))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = [executor.submit(get_token_ids, asset) for asset in assets]

    offset += 50

print(token_ids)
The problem is that the code waits for every action to finish before making another request. I am thinking of an improvement where, as soon as a request is sent, the offset is incremented and the loop moves on to the next request, so I don't have to wait. I don't know how to do it; I have studied asyncio, but it is still a challenge for me. Can anyone help me with this?
The problem is that Requests is not asynchronous, so each of its network calls blocks until it completes.
https://docs.python-requests.org/en/latest/user/advanced/#blocking-or-non-blocking
Therefore, it is better to try asynchronous libraries, for example, aiohttp:
https://github.com/aio-libs/aiohttp
Example
Create a session for all connections:
async with aiohttp.ClientSession() as session:
and run all desired requests:
results = await asyncio.gather(
    *[get_data(session, offset) for offset in range(0, supply, step)]
)
Here, the requests are executed asynchronously: session.get(url) retrieves only the response headers, and the body is fetched with await response.json():
async with session.get(url) as response:
    a = await response.json()
And in the main block, the event loop is started:
loop = asyncio.get_event_loop()
token_ids = loop.run_until_complete(main())
loop.close()
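On Python 3.7 and later, the same can also be done with asyncio.run, which creates and closes the event loop for you:

token_ids = asyncio.run(main())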
The full code
import aiohttp
import asyncio

async def get_data(session, offset):
    token_ids = []
    url = "url_1" + str(offset)
    async with session.get(url) as response:
        # For tests:
        # print("Status:", response.status)
        # print("Content-type:", response.headers['content-type'])
        a = await response.json()
        assets = a["assets"]
        for asset in assets:
            if str(asset['sell_orders']) == 'None' and str(asset['last_sale']) == 'None' and str(asset['num_sales']) == '0':
                token_ids.append(str(asset['token_id']))
    return token_ids

async def main():
    supply = 3000
    step = 50
    token_ids = []
    # Create session for all connections and pass it to "get" function
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(
            *[get_data(session, offset) for offset in range(0, supply, step)]
        )
    for ids in results:
        token_ids.extend(ids)
    return token_ids

if __name__ == "__main__":
    # asynchronous code starts here
    loop = asyncio.get_event_loop()
    token_ids = loop.run_until_complete(main())
    loop.close()
    # asynchronous code ends here
    print(token_ids)
I am using the following code to make requests with the aiohttp client. The server I am sending requests to has a 30k-requests-per-hour limit per IP, so I am getting 429 Too Many Requests errors. I want to put the job to sleep whenever it hits the limit.
I can extract x_rateLimit_reset from the headers, so I thought I could use it to put the job to sleep, but I observed very strange behavior: sometimes the sleep time becomes negative and sometimes the job gets stuck in sleeping mode.
For example, the last time I ran the job, it first slept for 2000 seconds and then, after that time had passed, it tried to sleep for another 2500 seconds and got stuck in sleeping mode. I think the other parallel processes may have caused the issue, so I was wondering how to deal with the Too Many Requests error when using asyncio.
@backoff.on_exception(backoff.expo,
                      (asyncio.TimeoutError, aiohttp.client_exceptions.ServerDisconnectedError, TooManyRequests),
                      max_time=300)
async def fetch(self, url, session, params):
    try:
        async with session.get(url, params=params) as response:
            now = int(time.time())
            print(response)
            output = await response.read()
            output = json.loads(output)
            if 'X-RateLimit-Remaining' in response.headers:
                rate = response.headers['X-RateLimit-Remaining']
            if 'status' in output and output['status'] == 429:
                x_rateLimit_reset = int(response.headers['X-RateLimit-Reset'])
                print("sleep mode")
                seconds = x_rateLimit_reset - now
                LOGGER.info("The job will sleep for {} seconds".format(seconds))
                time.sleep(max(seconds, 0))
                raise TooManyRequests()
        return output
    except (asyncio.TimeoutError, TypeError, json.decoder.JSONDecodeError,
            aiohttp.client_exceptions.ServerDisconnectedError) as e:
        print(str(e))

async def bound_fetch(self, sem, url, session, params):
    # Getter function with semaphore.
    async with sem:
        output = await self.fetch(url, session, params)
        return {"url": url, "output": output}
Edited:
This is how I initiate bound_fetch and define the URLs:
def get_responses(self, urls, office_token, params=None):
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(self.run(office_token, urls, params))
    responses = loop.run_until_complete(future)
    return responses

async def run(self, office_token, urls, params):
    tasks = []
    # create instance of Semaphore
    sem = asyncio.BoundedSemaphore(200)
    timeout = ClientTimeout(total=1000)
    async with ClientSession(auth=BasicAuth(office_token, password=' '), timeout=timeout,
                             connector=TCPConnector(ssl=False)) as session:
        for url in urls:
            # pass Semaphore and session to every GET request
            task = asyncio.ensure_future(self.bound_fetch(sem, url, session, params))
            tasks.append(task)
        responses = await asyncio.gather(*tasks)
        return responses

urls = [
    "{}/{}".format(self.base_url, "{}?page={}&api_key={}".format(object_name, page_number, self.api_keys))
    for page_number in range(batch * chunk_size + 1, chunk_size * (1 + batch) + 1)]
The main reason is that you are using time.sleep() instead of await asyncio.sleep(). time.sleep() blocks the whole event loop, so all other tasks are stuck while one task sleeps.
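For reference, a minimal sketch of that change applied to the 429 branch of the fetch method from the question (LOGGER, TooManyRequests and the header names are taken from the question's code):

if 'status' in output and output['status'] == 429:
    x_rateLimit_reset = int(response.headers['X-RateLimit-Reset'])
    seconds = max(x_rateLimit_reset - int(time.time()), 0)
    LOGGER.info("The job will sleep for {} seconds".format(seconds))
    await asyncio.sleep(seconds)  # non-blocking: other tasks keep running while this one sleeps
    raise TooManyRequests()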
UPDATE
Here is a minimal working solution with some comments on how it works. Please use it to adapt your solution. Also take a look at asyncio-throttle; a rough usage sketch follows the example below.
import aiohttp
import asyncio
from datetime import datetime

async def fetch(session, task):  # fetch the url and mark the result of execution
    async with session.get(task['url']) as response:
        if response.status != 200:
            # response.raise_for_status()
            # Here you need to somehow handle the 429 code if it occurs.
            # In my example I just skip it.
            task['result'] = response.status
            task['status'] = 'done'
        await response.text()  # just to be sure we acquire the data
        print(f"{str(datetime.now())}: Got result of {task['url']}")  # logging
        task['result'] = response.status
        task['status'] = 'done'

async def fetch_all(session, urls, persecond):
    # convert to a list of dicts
    url_tasks = [{'url': i, 'result': None, 'status': 'new'} for i in urls]
    n = 0  # counter
    while True:
        # calc how many tasks are fetching right now
        running_tasks = len([i for i in url_tasks if i['status'] in ['fetch']])
        # calc how many tasks still need to be executed
        is_tasks_to_wait = len([i for i in url_tasks if i['status'] != 'done'])
        # check we are not at the end of the list (n < len())
        # check we have room for one more task
        if n < len(url_tasks) and running_tasks < persecond:
            url_tasks[n]['status'] = 'fetch'
            #
            # Here is the main trick:
            # if you schedule a task inside a running loop,
            # it will start to execute sync code until it hits some await.
            #
            asyncio.create_task(fetch(session, url_tasks[n]))
            n += 1
            print(f'Schedule tasks {n}. '
                  f'Running {running_tasks} '
                  f'Remain {is_tasks_to_wait}')
        # Check the per-second constraint and wait a second (or period)
        if running_tasks >= persecond:
            print('Throttling')
            await asyncio.sleep(1)
        #
        # Here is another main trick:
        # to keep asyncio.run (or loop.run_until_complete) executing,
        # we need to wait a little, then check whether all tasks are done,
        # then wait again, and so on.
        if is_tasks_to_wait != 0:
            await asyncio.sleep(0.1)  # wait until all tasks are done
        else:
            # All tasks done
            break
    return url_tasks

async def main():
    urls = ['http://google.com/?1',
            'http://google.com/?2',
            'http://google.com/?3'] * 3
    async with aiohttp.ClientSession() as session:
        res = await fetch_all(session, urls, 3)
        print(res)

if __name__ == '__main__':
    asyncio.run(main())
    # (asyncio.run) cancels all pending tasks (we do not have any,
    # because we check that all tasks are done)
    # (asyncio.run) awaits the cancellation of those tasks
    # (asyncio.run) stops the loop
    # exit program
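As mentioned above, here is a rough sketch of how asyncio-throttle could be used for the 30k-per-hour limit. This assumes its Throttler(rate_limit, period) async context manager API; check the library's documentation before relying on it.

import asyncio
import aiohttp
from asyncio_throttle import Throttler  # pip install asyncio-throttle

async def main():
    throttler = Throttler(rate_limit=30000, period=3600)  # at most 30k requests per hour
    async with aiohttp.ClientSession() as session:
        async def fetch(url):
            async with throttler:  # waits here until a request slot is free
                async with session.get(url) as resp:
                    return await resp.text()
        await asyncio.gather(*(fetch('http://httpbin.org/get') for _ in range(10)))

if __name__ == '__main__':
    asyncio.run(main())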
A word of notice: this is my first approach with asyncio, so I might have done something really stupid.
Scenario is as follows:
I need to "http-ping" a humongous list of urls to check if they respond 200 or any other value. I get timeouts for each and every request, though tools like gobuster report 200,403, etc.
My code is sth similar to this:
import asyncio, aiohttp
import datetime

#-------------------------------------------------------------------------------------
async def get_data_coroutine(session, url, follow_redirects, timeout_seconds, retries):
    #print('#DEBUG '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' '+url)
    try:
        async with session.get(url, allow_redirects=False, timeout=timeout_seconds) as response:
            status = response.status
            #res = await response.text()
            if status == 404:
                pass
            elif 300 <= status and status < 400:
                location = str(response).split("Location': \'")[1].split("\'")[0]
                print('#HIT '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' '+str(status)+' '+url+' ---> '+location)
                if follow_redirects == True:
                    return await get_data_coroutine(session, location, follow_redirects, timeout_seconds, retries)
            else:
                print('#HIT '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' '+str(status)+' '+url)
            return None
    except asyncio.exceptions.TimeoutError as e:
        print('#ERROR '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' '+' '+' '+url+' TIMEOUT '+str(e))
        return None

#---------------------------------------------------------------------------
async def main(loop):
    base_url = 'http://192.168.59.37'
    extensions = ['', '.html', 'php']
    fd = open('/usr/share/wordlists/dirb/common.txt', 'r')
    words_without_suffix = [x.strip() for x in fd.readlines()]  #[-5:] #DEBUG!
    words_with_suffix = [base_url+'/'+x+y for x in words_without_suffix for y in extensions]
    follow = True
    total_timeout = aiohttp.ClientTimeout(total=60*60*24)
    timeout_seconds = 10
    retries = 1
    async with aiohttp.ClientSession(loop=loop, timeout=total_timeout) as session:
        tasks = [get_data_coroutine(session, url, follow, timeout_seconds, retries) for url in words_with_suffix]
        await asyncio.gather(*tasks)
        print('DONE')

#---------------------------------------------------------------------------
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(main(loop))
Did I do something really wrong?
Any word of advice?
Thank you SO much!
Actually, I ended up finding an open issue in aio-libs/aiohttp:
https://github.com/aio-libs/aiohttp/issues/3203
There, a workaround is suggested that achieves what I need:
session_timeout = aiohttp.ClientTimeout(total=None, sock_connect=timeout_seconds, sock_read=timeout_seconds)

async with aiohttp.ClientSession(timeout=session_timeout) as session:
    async with session.get(url, allow_redirects=False, timeout=1) as response:
        ...
To answer your question: no, you did nothing wrong. I can't see anything wrong with your code in terms of HTTP request/response/timeout handling.
If all your requests to the host (http://192.168.59.37) are indeed timing out, I suspect the issues you are experiencing are most likely down to how your network is resolving requests (or how your code is building the URL).
You can confirm whether requests are independently succeeding/failing using a tool like curl, eg:
curl "http://192.168.59.37/abc.html"
I tested it locally by using
python3 -m http.server 8080
and placing empty files 'abc' and 'abc.html' in the same directory, updating the base_url to
base_url = "http://127.0.0.1:8080"
With my minor updates (code below), here's the output:
http://127.0.0.1:8080/.bashrc.php
#404
http://127.0.0.1:8080/.bashrc
#404
http://127.0.0.1:8080/.bashrc.html
#404
http://127.0.0.1:8080/abc
#HIT 2020-11-03 12:57:33 200 http://127.0.0.1:8080/abc
http://127.0.0.1:8080/zt.php
#404
http://127.0.0.1:8080/zt.html
#404
http://127.0.0.1:8080/zt
#404
http://127.0.0.1:8080/abc.html
#HIT 2020-11-03 12:57:33 200 http://127.0.0.1:8080/abc.html
http://127.0.0.1:8080/abc.php
#404
DONE
My updates are mostly minor, but they might help with further debugging.
For debugging, print the URL. This is important to determine whether the code is building the URL correctly; it highlighted to me that the 'php' extension is missing a '.', so it would be looking for abcphp, not abc.php.
Use response.ok to test for a successful HTTP response; your code wasn't handling 500 errors (instead it reported a hit).
Use Python f-strings for cleaner formatting.
import asyncio
import aiohttp
import datetime

async def get_data_coroutine(session, url, follow_redirects, timeout_seconds, retries):
    try:
        async with session.get(
            url, allow_redirects=False, timeout=timeout_seconds
        ) as response:
            print(url)
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            if response.ok:
                print(f"#HIT {now} {response.status} {url}")
            else:
                status = response.status
                if status == 404:
                    print("#404")
                elif 300 <= status and status < 400:
                    location = str(response).split("Location': '")[1].split("'")[0]
                    print(f"#HIT {now} {status} {url} ---> {location}")
                    if follow_redirects is True:
                        return await get_data_coroutine(
                            session, location, follow_redirects, timeout_seconds, retries
                        )
                else:
                    print("#ERROR ", response.status)
            return None
    except asyncio.TimeoutError as e:
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"#ERROR {now} {url} TIMEOUT ", e)
        return None

async def main(loop):
    base_url = "http://127.0.0.1:8080"
    extensions = ["", ".html", ".php"]
    fd = open("/usr/share/wordlists/dirb/common.txt", "r")
    words_without_suffix = [x.strip() for x in fd.readlines()]
    words_with_suffix = [
        base_url + "/" + x + y for x in words_without_suffix for y in extensions
    ]
    follow = True
    total_timeout = aiohttp.ClientTimeout(total=60 * 60 * 24)
    timeout_seconds = 10
    retries = 1
    async with aiohttp.ClientSession(loop=loop, timeout=total_timeout) as session:
        tasks = [
            get_data_coroutine(session, url, follow, timeout_seconds, retries)
            for url in words_with_suffix
        ]
        await asyncio.gather(*tasks)
        print("DONE")

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(main(loop))
I am writing a web crawler that is running parallel fetches for many different domains. I want to limit the number of requests-per-second that are made to each individual domain, but I do not care about the total number of connections that are open, or the total requests per second that are made across all domains. I want to maximize the number of open connections and requests-per-second overall, while limiting the number of requests-per-second made to individual domains.
All of the currently existing examples I can find either (1) limit the number of open connections or (2) limit the total number of requests-per-second made in the fetch loop. Examples include:
aiohttp: rate limiting parallel requests
aiohttp: set maximum number of requests per second
Neither of them does what I am asking, which is to limit requests per second on a per-domain basis. The first question only answers how to limit requests per second overall. The second one doesn't even have answers to the actual question (the OP asks about requests per second and the answers all talk about limiting the number of connections).
Here is the code that I tried, using a simple rate limiter I made for a synchronous version, which doesn't work when the DomainTimer code is run in an async event loop:
from collections import defaultdict
from datetime import datetime, timedelta
import asyncio
import async_timeout
import aiohttp
from urllib.parse import urlparse
from queue import Queue, Empty

from HTMLProcessing import processHTML
import URLFilters

SEED_URLS = ['http://www.bbc.co.uk', 'http://www.news.google.com']
url_queue = Queue()
for u in SEED_URLS:
    url_queue.put(u)

# number of pages to download per run of crawlConcurrent()
BATCH_SIZE = 100
DELAY = timedelta(seconds=1.0)  # delay between requests from a single domain, in seconds

HTTP_HEADERS = {'Referer': 'http://www.google.com',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}


class DomainTimer():
    def __init__(self):
        self.timer = None

    def resetTimer(self):
        self.timer = datetime.now()

    def delayExceeded(self, delay):
        if not self.timer:  # we haven't fetched this domain before
            return True
        if (datetime.now() - self.timer) >= delay:
            return True
        else:
            return False


crawl_history = defaultdict(dict)  # given a URL, when was it last crawled?
domain_timers = defaultdict(DomainTimer)


async def fetch(session, url):
    domain = urlparse(url).netloc
    print('here fetching ' + url + "\n")
    dt = domain_timers[domain]

    if dt.delayExceeded(DELAY) or not dt:
        with async_timeout.timeout(10):
            try:
                dt.resetTimer()  # reset domain timer
                async with session.get(url, headers=HTTP_HEADERS) as response:
                    if response.status == 200:
                        crawl_history[url] = datetime.now()
                        html = await response.text()
                        return {'url': url, 'html': html}
                    else:
                        # log HTTP response, put into crawl_history so
                        # we don't attempt to fetch again
                        print(url + " failed with response: " + str(response.status) + "\n")
                        return {'url': url, 'http_status': response.status}
            except aiohttp.ClientConnectionError as e:
                print("Connection failed " + str(e))
            except aiohttp.ClientPayloadError as e:
                print("Received bad data from server # " + url + "\n")
    else:  # delay hasn't passed yet: skip for now & put at end of queue
        url_queue.put(url)
        return None


async def fetch_all(urls):
    """Launch requests for all web pages."""
    tasks = []
    async with aiohttp.ClientSession() as session:
        for url in urls:
            task = asyncio.ensure_future(fetch(session, url))
            tasks.append(task)  # create list of tasks
        return await asyncio.gather(*tasks)  # gather task responses


def batch_crawl():
    """Launch requests for all web pages."""
    start_time = datetime.now()

    # Here we build the list of URLs to crawl for this batch
    urls = []
    for i in range(BATCH_SIZE):
        try:
            next_url = url_queue.get_nowait()  # get next URL from queue
            urls.append(next_url)
        except Empty:
            print("Processed all items in URL queue.\n")
            break

    loop = asyncio.get_event_loop()
    asyncio.set_event_loop(loop)
    pages = loop.run_until_complete(fetch_all(urls))

    crawl_time = (datetime.now() - start_time).seconds
    print("Crawl completed. Fetched " + str(len(pages)) + " pages in " + str(crawl_time) + " seconds.\n")
    return pages


def parse_html(pages):
    """Parse the HTML for each page downloaded in this batch."""
    start_time = datetime.now()
    results = {}

    for p in pages:
        if not p or not p['html']:
            print("Received empty page")
            continue
        else:
            url, html = p['url'], p['html']
            results[url] = processHTML(html)

    processing_time = (datetime.now() - start_time).seconds
    print("HTML processing finished. Processed " + str(len(results)) + " pages in " + str(processing_time) + " seconds.\n")
    return results


def extract_new_links(results):
    """Extract links from the parsed results."""
    # later we could track where links were from here, anchor text, etc,
    # and weight queue priority based on that
    links = []
    for k in results.keys():
        new_urls = [l['href'] for l in results[k]['links']]
        for u in new_urls:
            if u not in crawl_history.keys():
                links.append(u)
    return links


def filterURLs(urls):
    urls = URLFilters.filterDuplicates(urls)
    urls = URLFilters.filterBlacklistedDomains(urls)
    return urls


def run_batch():
    pages = batch_crawl()
    results = parse_html(pages)
    links = extract_new_links(results)
    for l in filterURLs(links):
        url_queue.put(l)
    return results
There are no errors or exceptions thrown, and the rate-limiting code works fine for synchronous fetches, but the DomainTimer has no apparent effect when run in the async event loop. The delay of one request per second per domain is not upheld...
How would I modify this synchronous rate limiting code to work within the async event loop? Thanks!
It's hard to debug your code since it contains a lot of unrelated stuff; it's easier to show the idea on a new, simple example.
Main idea:
- write your own Semaphore-like class using __aenter__ / __aexit__ that accepts a url (domain)
- use a domain-specific Lock to prevent multiple simultaneous requests to the same domain
- sleep before allowing the next request, based on the domain's last request time and the allowed RPS
- track the time of the last request for each domain
Code:
import asyncio
import aiohttp
from urllib.parse import urlparse
from collections import defaultdict


class Limiter:
    # domain -> req/sec:
    _limits = {
        'httpbin.org': 4,
        'eu.httpbin.org': 1,
    }

    # domain -> its lock:
    _locks = defaultdict(lambda: asyncio.Lock())

    # domain -> time of its last request:
    _times = defaultdict(lambda: 0)

    def __init__(self, url):
        self._host = urlparse(url).hostname

    async def __aenter__(self):
        # awaiting the Lock object directly is not supported in newer Python versions,
        # so acquire it explicitly
        await self._lock.acquire()
        to_wait = self._to_wait_before_request()
        print(f'Wait {to_wait} sec before next request to {self._host}')
        await asyncio.sleep(to_wait)

    async def __aexit__(self, *args):
        print(f'Request to {self._host} just finished')
        self._update_request_time()
        self._lock.release()

    @property
    def _lock(self):
        """Lock that prevents multiple requests to the same host."""
        return self._locks[self._host]

    def _to_wait_before_request(self):
        """How long we need to wait before a request to this host."""
        request_time = self._times[self._host]
        request_delay = 1 / self._limits[self._host]
        now = asyncio.get_event_loop().time()
        to_wait = request_time + request_delay - now
        to_wait = max(0, to_wait)
        return to_wait

    def _update_request_time(self):
        now = asyncio.get_event_loop().time()
        self._times[self._host] = now


# request that uses Limiter instead of Semaphore:
async def get(url):
    async with Limiter(url):
        async with aiohttp.ClientSession() as session:  # TODO reuse session for different requests
            async with session.get(url) as resp:
                return await resp.text()


# main:
async def main():
    coros = [
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
    ]
    await asyncio.gather(*coros)


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
I developed a library named octopus-api (https://pypi.org/project/octopus-api/) that enables you to rate-limit calls and set the number of connections to the endpoint, using aiohttp under the hood. Its goal is to simplify all the aiohttp setup needed.
Here is an example of how to use it, where get_ethereum is the user-defined request function. It could just as well be a web-crawler request function or whatever fits:
from octopus_api import TentacleSession, OctopusApi
from typing import Dict, List

if __name__ == '__main__':
    async def get_ethereum(session: TentacleSession, request: Dict):
        async with session.get(url=request["url"], params=request["params"]) as response:
            body = await response.json()
            return body

    client = OctopusApi(rate=50, resolution="sec", connections=6)
    result: List = client.execute(requests_list=[{
        "url": "https://api.pro.coinbase.com/products/ETH-EUR/candles?granularity=900&start=2021-12-04T00:00:00Z&end=2021-12-04T00:00:00Z",
        "params": {}}] * 1000, func=get_ethereum)
    print(result)
TentacleSession works the same way you write POST, GET, PUT and PATCH requests with aiohttp.ClientSession.
Let me know if it helps with your rate-limit and connection issues for crawling.
I followed this tutorial: https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html and everything works fine when I am doing around 50,000 requests. But I need to make 1 million API calls, and then I have a problem with this code:
url = "http://some_url.com/?id={}"
tasks = set()
sem = asyncio.Semaphore(MAX_SIM_CONNS)
for i in range(1, LAST_ID + 1):
task = asyncio.ensure_future(bound_fetch(sem, url.format(i)))
tasks.add(task)
responses = asyncio.gather(*tasks)
return await responses
Because Python needs to create 1 million tasks, it basically just lags and then prints a Killed message in the terminal. Is there any way to use a generator instead of a pre-made set (or list) of URLs? Thanks.
Schedule all 1 million tasks at once
This is the code you are talking about. It takes up to 3 GB of RAM, so it is easily possible that it will be terminated by the operating system if you have little free memory.
import asyncio
from aiohttp import ClientSession

MAX_SIM_CONNS = 50
LAST_ID = 10**6

async def fetch(url, session):
    async with session.get(url) as response:
        return await response.read()

async def bound_fetch(sem, url, session):
    async with sem:
        await fetch(url, session)

async def fetch_all():
    url = "http://localhost:8080/?id={}"
    tasks = set()
    async with ClientSession() as session:
        sem = asyncio.Semaphore(MAX_SIM_CONNS)
        for i in range(1, LAST_ID + 1):
            task = asyncio.create_task(bound_fetch(sem, url.format(i), session))
            tasks.add(task)
        return await asyncio.gather(*tasks)

if __name__ == '__main__':
    asyncio.run(fetch_all())
Use queue to streamline the work
This is my suggestion for how to use asyncio.Queue to pass URLs to worker tasks. The queue is filled as needed; there is no pre-made list of URLs.
It takes only 30 MB of RAM :)
import asyncio
from aiohttp import ClientSession

MAX_SIM_CONNS = 50
LAST_ID = 10**6

async def fetch(url, session):
    async with session.get(url) as response:
        return await response.read()

async def fetch_worker(url_queue):
    async with ClientSession() as session:
        while True:
            url = await url_queue.get()
            try:
                if url is None:
                    # all work is done
                    return
                response = await fetch(url, session)
                # ...do something with the response
            finally:
                url_queue.task_done()
                # calling task_done() is necessary for url_queue.join() to work correctly

async def fetch_all():
    url = "http://localhost:8080/?id={}"
    url_queue = asyncio.Queue(maxsize=100)
    worker_tasks = []
    for i in range(MAX_SIM_CONNS):
        wt = asyncio.create_task(fetch_worker(url_queue))
        worker_tasks.append(wt)
    for i in range(1, LAST_ID + 1):
        await url_queue.put(url.format(i))
    for i in range(MAX_SIM_CONNS):
        # tell the workers that the work is done
        await url_queue.put(None)
    await url_queue.join()
    await asyncio.gather(*worker_tasks)

if __name__ == '__main__':
    asyncio.run(fetch_all())
asyncio is memory-bound (like any other program). You cannot spawn more tasks than memory can hold. My guess is that you hit a memory limit. Check dmesg for more information.
1 million RPS doesn't mean there are 1M tasks. A task can make several requests in the same second.
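For illustration, a minimal sketch of that idea, assuming the same localhost test endpoint as the answer above: a fixed number of worker tasks, each making many requests in turn, so memory stays bounded no matter how many IDs there are.

import asyncio
from aiohttp import ClientSession

NUM_WORKERS = 50
LAST_ID = 10**6

async def worker(worker_id, session):
    # each worker handles every NUM_WORKERS-th id, so only NUM_WORKERS
    # tasks ever exist, regardless of how many requests are made in total
    for i in range(worker_id + 1, LAST_ID + 1, NUM_WORKERS):
        async with session.get("http://localhost:8080/?id={}".format(i)) as response:
            await response.read()

async def main():
    async with ClientSession() as session:
        await asyncio.gather(*(worker(w, session) for w in range(NUM_WORKERS)))

if __name__ == '__main__':
    asyncio.run(main())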