GitHub cannot authorize with oauth - python

I'm trying to make a GitHub bot that posts issues when a complementary app throws errors, but I'm having trouble trying to get an access token. This is my code so far:
import asyncio
import random
import webbrowser
import aiohttp.web

randgen = random.SystemRandom()

# The task is relatively simple. Query for the client ID, client secret, and then produces a URL.
# It opens this URL in a web-browser, then sets up a server on port 12345, waiting for an appropriate response.
# Using this code, it finishes the rest of the flow.
class Authorizer:
    def __init__(self):
        self.client_id = self.client_secret = ""
        self.redirect_url = "http://localhost:12345/callback"
        self.scopes = ["public_repo"]
        self.state = 0
        self.done = False

    async def handler(self, request: aiohttp.web.Request):
        code = request.query["code"]
        async with aiohttp.ClientSession() as client:
            async with client.post("https://github.com/login/oauth/access_token",
                    data=dict(client_id=self.client_id, client_secret=self.client_secret, code=code, state=self.state),
                    headers=dict(Accept="application/json")) as request:
                json = await request.json()
                assert str(json["state"]) == self.state
                resp = "Access Token: " + json["access_token"]
                print(resp)
                self.done = True
                return aiohttp.web.Response(text=resp)

    async def start(self):
        self.client_id = input("App Client ID: ")
        self.client_secret = input("App Client Secret: ")
        self.state = randgen.randint(1, 1000)
        scope_str = ' '.join(self.scopes)
        url = f"https://github.com/login/oauth/authorize?client_id={self.client_secret}&redirect_uri={self.redirect_url}&scope=" \
              f"{scope_str}&state={self.state}"
        print("Opening URL: " + url)
        webbrowser.open(url)
        self.server = aiohttp.web.Server(self.handler)
        self.runner = aiohttp.web.ServerRunner(self.server)
        await self.runner.setup()
        self.site = aiohttp.web.TCPSite(self.runner, 'localhost', 12345)
        await self.site.start()
        while not self.done:
            await asyncio.sleep(1)
        await self.site.stop()
        input("Complete! Make sure you take your access token with you! Now hit enter to exit.")

if __name__ == '__main__':
    auth = Authorizer()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(auth.start())
I get an error: the constructed URL returns a 404. Removing the state, redirect URL, and scopes doesn't change the error. I followed the official docs when writing this, so I don't understand why the constructed URL is giving me a 404.
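One thing worth double-checking (an observation from reading the code above, not a confirmed fix): the authorize URL interpolates self.client_secret where the client_id parameter belongs, and GitHub serves a 404 page when the client_id passed to /login/oauth/authorize does not match a registered OAuth app. A sketch of the corrected construction, using the same attribute names as the code above, would look like this:

url = ("https://github.com/login/oauth/authorize"
       f"?client_id={self.client_id}"  # client_id, not client_secret
       f"&redirect_uri={self.redirect_url}"
       f"&scope={scope_str}&state={self.state}")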

Related

Python aiohttp how to handle client session token timeout

I am making several hundred HTTP requests using aiohttp. I am relatively new to the async world but have managed to get the basic code working.
First I generate a token, then I make the aiohttp calls using this token.
The token is valid for 30 minutes, so I assume that if my calls run for more than 30 minutes they will start failing.
How do I update my code to plug in a new token after 30 minutes and then resume the remaining calls? This is my first time implementing async calls, so I'm relatively clueless about how to handle this.
async def a_get_all_user_details(urls):
    results = []
    connector = aiohttp.TCPConnector(limit=70)
    timeout = aiohttp.ClientTimeout(total=None, connect=300, sock_connect=300, sock_read=None)
    auth_token = get_token()  # token expires in 30 mins
    headers = {
        'accept': 'application/json',
        'Authorization': 'Bearer ' + auth_token
    }
    async with aiohttp.ClientSession(trust_env=True, headers=headers, connector=connector, timeout=timeout) as session:
        for url in urls:
            result = asyncio.ensure_future(a_get_user_details(url, session))
            results.append(result)
        responses = await asyncio.gather(*results)
        return responses

def main():
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(a_get_all_user_details(search_urls))
    user_details = loop.run_until_complete(future)
Maybe there's a simpler way to do it, but here's my take:
The problem is that there are many connections in flight when you want to refresh the session. If you close the session and create a new one, the active connections that are still waiting for data throw an exception.
In my example I keep a list of all sessions; when the time comes I simply create a new session (with a new token) and append it to the list. New connections will use the last (freshest) session.
At the end of the script I close all sessions.
import aiohttp
import asyncio

sessions = []

async def get_token():
    return "XYZ"

async def refresh_session():
    # this function periodically refreshes the token every X sec
    connector = aiohttp.TCPConnector(limit=3)
    timeout = aiohttp.ClientTimeout(
        total=None, connect=300, sock_connect=300, sock_read=None
    )
    while True:
        headers = {
            "accept": "application/json",
            "Authorization": "Bearer " + await get_token(),
        }
        sessions.append(
            aiohttp.ClientSession(
                trust_env=True,
                headers=headers,
                connector=connector,
                timeout=timeout,
            )
        )
        print("New session created")
        await asyncio.sleep(5)  # every 5 seconds refresh session

async def get_user_detail(url):
    # wait for session to show up:
    while not sessions:
        await asyncio.sleep(1)
    # use last (freshest) session:
    async with sessions[-1].get(url) as resp:
        assert resp.status == 200
        html = await resp.text()
        return f"some result for {url} length of data {len(html)}"

async def get_user_details(urls):
    results = []
    for url in urls:
        results.append(asyncio.ensure_future(get_user_detail(url)))
    responses = await asyncio.gather(*results)
    return responses

async def main():
    # some urls to gather:
    urls = [
        "https://www.google.com",
        "https://www.microsoft.com",
        "https://www.yahoo.com",
    ] * 30
    t1 = asyncio.create_task(refresh_session())
    t2 = asyncio.create_task(get_user_details(urls))
    # finish when first task ends (in this case get_user_details())
    done, _ = await asyncio.wait([t1, t2], return_when=asyncio.FIRST_COMPLETED)
    # close all opened sessions:
    for s in sessions:
        await s.close()
    # print the result
    print("Domains gathered ", len(done.pop().result()))

if __name__ == "__main__":
    asyncio.run(main())
This prints:
New session created
New session created
Domains gathered 90
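A simpler variation worth considering (my own sketch, not part of the answer above): aiohttp merges per-request headers with the session's default headers, so you can keep a single long-lived session and attach a freshly obtained token to each request instead of maintaining a list of sessions. get_token() here stands for whatever token helper you already have, ideally caching the token until it is close to expiry.

import aiohttp

async def get_user_detail(session, url):
    # hypothetical: get_token() returns a valid (cached or freshly refreshed) token
    token = await get_token()
    headers = {"Authorization": "Bearer " + token}
    # per-request headers override the session-level Authorization header
    async with session.get(url, headers=headers) as resp:
        resp.raise_for_status()
        return await resp.text()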

python asyncio aiohttp timeout

Word of notice: This is my first approach with asyncio, so I might have done something really stupid.
Scenario is as follows:
I need to "http-ping" a humongous list of URLs to check whether they respond with 200 or any other value. I get timeouts for each and every request, though tools like gobuster report 200, 403, etc.
My code is something like this:
import asyncio,aiohttp
import datetime

#-------------------------------------------------------------------------------------
async def get_data_coroutine(session,url,follow_redirects,timeout_seconds,retries):
    #print('#DEBUG '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' '+url)
    try:
        async with session.get(url,allow_redirects=False,timeout=timeout_seconds) as response:
            status = response.status
            #res = await response.text()
            if( status==404):
                pass
            elif(300<=status and status<400):
                location = str(response).split("Location': \'")[1].split("\'")[0]
                print('#HIT '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' '+str(status)+' '+url+' ---> '+location)
                if(follow_redirects==True):
                    return await get_data_coroutine(session,location,follow_redirects,timeout_seconds,retries)
            else:
                print('#HIT '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' '+str(status)+' '+url)
            return None
    except asyncio.exceptions.TimeoutError as e:
        print('#ERROR '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' '+' '+' '+url+' TIMEOUT '+str(e))
        return None
#---------------------------------------------------------------------------
async def main(loop):
    base_url = 'http://192.168.59.37'
    extensions = ['','.html','php']
    fd = open('/usr/share/wordlists/dirb/common.txt','r')
    words_without_suffix = [x.strip() for x in fd.readlines()]#[-5:] #DEBUG!
    words_with_suffix = [base_url+'/'+x+y for x in words_without_suffix for y in extensions]
    follow = True
    total_timeout = aiohttp.ClientTimeout(total=60*60*24)
    timeout_seconds = 10
    retries = 1
    async with aiohttp.ClientSession(loop=loop,timeout=total_timeout) as session:
        tasks = [get_data_coroutine(session,url,follow,timeout_seconds,retries) for url in words_with_suffix]
        await asyncio.gather(*tasks)
        print('DONE')
#---------------------------------------------------------------------------
if(__name__=='__main__'):
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(main(loop))
Did I do something really wrong?
Any word of advice?
Thank you SO much!
Actually, I ended up finding an open issue in aio-libs/aiohttp:
https://github.com/aio-libs/aiohttp/issues/3203
There, they suggest a workaround that achieves what I need:
session_timeout = aiohttp.ClientTimeout(total=None,sock_connect=timeout_seconds,sock_read=timeout_seconds)
async with aiohttp.ClientSession(timeout=session_timeout) as session:
    async with session.get(url,allow_redirects=False,timeout=1) as response:
        ...
To answer your question - no, you did nothing wrong. I can't see anything wrong with your code in terms of HTTP request/response/timeout handling.
If indeed all your requests are timing out to the host (http://192.168.59.37), I suspect the issues you are experiencing are most likely down to how your network is resolving requests (or how your code is building the URL).
You can confirm whether requests are independently succeeding or failing using a tool like curl, e.g.:
curl "http://192.168.59.37/abc.html"
I tested it locally by running
python3 -m http.server 8080
and placing empty files 'abc' and 'abc.html' in the same directory, updating the base_url:
base_url = "http://127.0.0.1:8080"
With my minor updates (code below), here's the output.
http://127.0.0.1:8080/.bashrc.php
#404
http://127.0.0.1:8080/.bashrc
#404
http://127.0.0.1:8080/.bashrc.html
#404
http://127.0.0.1:8080/abc
#HIT 2020-11-03 12:57:33 200 http://127.0.0.1:8080/abc
http://127.0.0.1:8080/zt.php
#404
http://127.0.0.1:8080/zt.html
#404
http://127.0.0.1:8080/zt
#404
http://127.0.0.1:8080/abc.html
#HIT 2020-11-03 12:57:33 200 http://127.0.0.1:8080/abc.html
http://127.0.0.1:8080/abc.php
#404
DONE
My updates are mostly minor, but they might help with further debugging.
For debugging, print the URL. This is important for determining whether the code builds the URL correctly; it highlighted that the 'php' extension was missing a ".", so the code was looking for abcphp, not abc.php.
Use response.ok to test for a successful HTTP response; your code wasn't handling 500 errors (instead it was reporting a hit).
Use Python f-strings for cleaner formatting.
import asyncio
import aiohttp
import datetime

async def get_data_coroutine(session, url, follow_redirects, timeout_seconds, retries):
    try:
        async with session.get(
            url, allow_redirects=False, timeout=timeout_seconds
        ) as response:
            print(url)
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            if response.ok:
                print(f"#HIT {now} {response.status} {url}")
            else:
                status = response.status
                if status == 404:
                    print("#404")
                elif 300 <= status and status < 400:
                    location = str(response).split("Location': '")[1].split("'")[0]
                    print(f"#HIT {now} {status} {url} ---> {location}")
                    if follow_redirects is True:
                        return await get_data_coroutine(
                            session, location, follow_redirects, timeout_seconds, retries
                        )
                else:
                    print("#ERROR ", response.status)
            return None
    except asyncio.TimeoutError as e:
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"#ERROR {now} {url} TIMEOUT ", e)
        return None

async def main(loop):
    base_url = "http://127.0.0.1:8080"
    extensions = ["", ".html", ".php"]
    fd = open("/usr/share/wordlists/dirb/common.txt", "r")
    words_without_suffix = [x.strip() for x in fd.readlines()]
    words_with_suffix = [
        base_url + "/" + x + y for x in words_without_suffix for y in extensions
    ]
    follow = True
    total_timeout = aiohttp.ClientTimeout(total=60 * 60 * 24)
    timeout_seconds = 10
    retries = 1
    async with aiohttp.ClientSession(loop=loop, timeout=total_timeout) as session:
        tasks = [
            get_data_coroutine(session, url, follow, timeout_seconds, retries)
            for url in words_with_suffix
        ]
        await asyncio.gather(*tasks)
        print("DONE")

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(main(loop))

Will Redis Pub/Sub keep or persist past data when client is unsubscribed from topic?

I have a Django project with a Tornado websocket that is open and subscribed to a topic in my Redis Pub/Sub. I am using asyncio and aioredis. Before a page refresh, browser close, or navigation away from that page, I call for the websocket to close, which unsubscribes from that topic.
The issue is that sometimes, when I change pages or refresh the page, a bunch of messages from the past get pumped back into the newly opened websocket. It doesn't happen every time, and I'm not sure what else I can do to make sure past messages don't come back on page refresh. I've already made sure the websocket closes and unsubscribes from the topic on page refresh/window unload.
Does Redis Pub/Sub keep old messages somewhere while the client is unsubscribed, and send them out when the client subscribes back to the same topic? Is this normal behaviour for Redis Pub/Sub? I'm under the impression that Redis Pub/Sub doesn't persist messages, and that if a client is unsubscribed, the messages are simply dropped for that client.
I need to make sure when page reloads, the old messages don't get pumped back to the websocket.
This is how I wrote the RedisChannel to execute the pub/sub functions:
import aioredis

class RedisChannel(object):
    '''
    Redis backed pub-sub websocket channel.
    '''

    async def subscribe(self, **kwargs):
        '''
        Subscribe to topics
        '''
        topics = kwargs.get('topics')
        return await self.conn.subscribe(*topics)

    async def unsubscribe(self, **kwargs):
        '''
        Unsubscribe to topics
        '''
        topics = kwargs.get('topics')
        return await self.conn.unsubscribe(*topics)

    async def send(self, **kwargs):
        data = {}
        # If client socket is provided, only send to this socket.
        ws = kwargs.get('ws')
        # Topic for this message. Compulsory for broadcast.
        topic = kwargs.get('topic')
        # Usually JSON
        if kwargs.get('data'):
            data['data'] = kwargs.get('data')
        # I'm using 60 seconds right now just to try to limit the list of past messages
        # But the behaviour I need is 0 past messages on page reload in browser
        push_event = True
        if kwargs.get('timestamp'):
            event_timestamp = kwargs.get("timestamp", 0)
            data['timestamp'] = event_timestamp
            # logger.debug(data)
            current_time = timezone.now()
            if event_timestamp:
                event_dt = get_utc_time(datetime.utcfromtimestamp(event_timestamp))
                if event_dt:
                    time_apart = current_time - event_dt
                    duration = abs(time_apart.total_seconds())
                    logger.debug("Time apart between event and current time = {}".format(duration))
                    if duration >= 60:
                        push_event = False
        if not push_event:
            data = {}
        return await self.conn.publish_json(topic, json.dumps(data, separators=(',', ': ')))

    async def connect(self):
        redis_settings = settings['redis']['channel']
        self.conn = await aioredis.create_redis_pool(
            (
                redis_settings.get('host'),
                redis_settings.get('port')
            ),
            db=redis_settings.get('db'),
            minsize=2,
            maxsize=redis_settings.get('max_connections'),
            encoding='utf-8'
        )
This is how I wrote the websocket handler to subscribe/unsubscribe to a Redis topic:
import asyncio, json

ws_channel = RedisChannel()
asyncio.get_event_loop().create_task(ws_channel.connect())

async def reader(ch, ws):
    while (await ch.wait_message()):
        data = await ch.get_json()
        if data:
            ws.write_message(data)
        await asyncio.sleep(0.001)
        # time.sleep

class ResultsWsHandler(tornado.websocket.WebSocketHandler):
    def open(self):
        try:
            self.write_message(json.dumps('Websocket opened.'))
        except Exception as e:
            logger.error(str(e))

    def on_message(self, message):
        asyncio.ensure_future(self.on_message_async(message))

    async def on_message_async(self, message):
        # async def on_message(self, message):
        data = json.loads(message)
        action = data.get('action', None)
        topics = data.get('cameras', [])
        if topics or action is not None:
            try:
                action = int(action)
                if action == 0:  # 0 - heartbeat
                    logger.debug('Heartbeat.')
                    param = {'type': 0}
                    self.write_message(json.dumps(param))
                elif action == 1:  # 1 - subscribe
                    channels = await ws_channel.subscribe(topics=topics)
                    logger.debug(f'Successfully subscribed from {topics}.')
                    self.write_message(json.dumps(f'Successfully subscribed to {topics}.'))
                    task_list = []
                    for c in channels:
                        task_list.append(asyncio.ensure_future(reader(c, self)))
                    await asyncio.wait(task_list)
                elif action == 2:  # 2 - unsubscribe
                    await ws_channel.unsubscribe(topics=topics)
                    logger.debug(f'Successfully unsubscribe from {topics}.')
                    self.write_message(json.dumps(f'Successfully unsubscribe from {topics}.'))
                else:
                    logger.debug(f'Other: {data}')
            except Exception as e:
                logger.error(json.dumps(str(e), separators=(',', ': ')))
                self.write_message(json.dumps(str(e), separators=(',', ': ')))
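For what it's worth, Redis Pub/Sub itself is fire-and-forget: messages are delivered only to clients subscribed at the moment of publishing and are not stored for later. So replayed messages are more likely to come from the application side, for example reader tasks and pooled subscriber connections that outlive the websocket. A rough, hypothetical sketch of cleaning those up when the socket closes (the attribute and method names are mine, not from the project; ws_channel and reader refer to the code above):

import asyncio
import tornado.websocket

class ResultsWsHandler(tornado.websocket.WebSocketHandler):
    def open(self):
        self.reader_tasks = []   # hypothetical: reader tasks owned by this connection
        self.topics = []         # hypothetical: topics this connection subscribed to

    async def subscribe_topics(self, topics):
        self.topics = topics
        channels = await ws_channel.subscribe(topics=topics)
        for c in channels:
            self.reader_tasks.append(asyncio.ensure_future(reader(c, self)))

    def on_close(self):
        # stop readers so a stale channel cannot push old data into a new websocket
        for t in self.reader_tasks:
            t.cancel()
        if self.topics:
            asyncio.ensure_future(ws_channel.unsubscribe(topics=self.topics))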

how to implement a websocket aware reverse-proxy with aiohttp (python 3.6)

I am trying to implement an application-specific reverse proxy for Jupyter notebooks using aiohttp. It works fine for HTTP requests, but the websocket forwarding does not work. Requests from the browser arrive and get forwarded, but no responses from Jupyter come back. I assume my websocket client code somehow does not react to incoming messages from Jupyter.
The only indication on the Jupyter side that something is amiss is messages like this:
WebSocket ping timeout after 90009 ms.
So here is my attempt at writing the proxy:
from aiohttp import web
from aiohttp import client
import aiohttp
import logging
import pprint

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

baseUrl = 'http://0.0.0.0:8888'
mountPoint = '/fakeUuid'

async def handler(req):
    proxyPath = req.match_info.get('proxyPath', 'no proxyPath placeholder defined')
    reqH = req.headers.copy()
    if reqH['connection'] == 'Upgrade' and reqH['upgrade'] == 'websocket' and req.method == 'GET':
        ws_server = web.WebSocketResponse()
        await ws_server.prepare(req)
        logger.info('##### WS_SERVER %s' % pprint.pformat(ws_server))

        client_session = aiohttp.ClientSession()
        async with client_session.ws_connect(baseUrl+req.path_qs,
                                             headers={'cookie': reqH['cookie']},
                                             ) as ws_client:
            logger.info('##### WS_CLIENT %s' % pprint.pformat(ws_client))

            async for server_msg in ws_server:
                logger.info('>>> msg from browser: %s', pprint.pformat(server_msg))
                if server_msg.type == aiohttp.WSMsgType.TEXT:
                    await ws_client.send_str(server_msg.data)
                else:
                    await ws_client.send_bytes(server_msg.data)

            async for client_msg in ws_client:
                logger.info('>>> msg from jupyter: %s', pprint.pformat(client_msg))
                if client_msg.tp == aiohttp.WSMsgType.TEXT:
                    await ws_server.send_str(client_msg.data)
                else:
                    await ws_server.send_bytes(client_msg.data)

            return ws_server
    else:
        async with client.request(
            req.method, baseUrl+mountPoint+proxyPath,
            headers=reqH,
            allow_redirects=False,
            data=await req.read()
        ) as res:
            headers = res.headers.copy()
            body = await res.read()
            return web.Response(
                headers=headers,
                status=res.status,
                body=body
            )
    return ws_server

app = web.Application()
app.router.add_route('*', mountPoint + '{proxyPath:.*}', handler)
web.run_app(app, port=3984)
Lesson learned: the two async for loops block the flow of the current function. By running them with asyncio.wait I can get them to run at the same time. The resulting program looks like this:
from aiohttp import web
from aiohttp import client
import aiohttp
import asyncio
import logging
import pprint

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

baseUrl = 'http://0.0.0.0:8888'
mountPoint = '/fakeUuid'

async def handler(req):
    proxyPath = req.match_info.get('proxyPath', 'no proxyPath placeholder defined')
    reqH = req.headers.copy()
    if reqH['connection'] == 'Upgrade' and reqH['upgrade'] == 'websocket' and req.method == 'GET':
        ws_server = web.WebSocketResponse()
        await ws_server.prepare(req)
        logger.info('##### WS_SERVER %s' % pprint.pformat(ws_server))

        client_session = aiohttp.ClientSession(cookies=req.cookies)
        async with client_session.ws_connect(
            baseUrl+req.path_qs,
        ) as ws_client:
            logger.info('##### WS_CLIENT %s' % pprint.pformat(ws_client))

            async def wsforward(ws_from, ws_to):
                async for msg in ws_from:
                    logger.info('>>> msg: %s', pprint.pformat(msg))
                    mt = msg.type
                    md = msg.data
                    if mt == aiohttp.WSMsgType.TEXT:
                        await ws_to.send_str(md)
                    elif mt == aiohttp.WSMsgType.BINARY:
                        await ws_to.send_bytes(md)
                    elif mt == aiohttp.WSMsgType.PING:
                        await ws_to.ping()
                    elif mt == aiohttp.WSMsgType.PONG:
                        await ws_to.pong()
                    elif ws_to.closed:
                        await ws_to.close(code=ws_to.close_code, message=msg.extra)
                    else:
                        raise ValueError('unexpected message type: %s', pprint.pformat(msg))

            finished, unfinished = await asyncio.wait(
                [wsforward(ws_server, ws_client), wsforward(ws_client, ws_server)],
                return_when=asyncio.FIRST_COMPLETED)

            return ws_server
    else:
        async with client.request(
            req.method, baseUrl+mountPoint+proxyPath,
            headers=reqH,
            allow_redirects=False,
            data=await req.read()
        ) as res:
            headers = res.headers.copy()
            body = await res.read()
            return web.Response(
                headers=headers,
                status=res.status,
                body=body
            )
    return ws_server

app = web.Application()
app.router.add_route('*', mountPoint + '{proxyPath:.*}', handler)
web.run_app(app, port=3984)

aiohttp: rate limiting requests-per-second by domain

I am writing a web crawler that is running parallel fetches for many different domains. I want to limit the number of requests-per-second that are made to each individual domain, but I do not care about the total number of connections that are open, or the total requests per second that are made across all domains. I want to maximize the number of open connections and requests-per-second overall, while limiting the number of requests-per-second made to individual domains.
All of the currently existing examples I can find either (1) limit the number of open connections or (2) limit the total number of requests-per-second made in the fetch loop. Examples include:
aiohttp: rate limiting parallel requests
aiohttp: set maximum number of requests per second
Neither of them does what I am asking for, which is to limit requests per second on a per-domain basis. The first question only addresses how to limit requests per second overall. The second one doesn't even have answers to the actual question (the OP asks about requests per second, and the answers all talk about limiting the number of connections).
Here is the code that I tried, using a simple rate limiter I made for a synchronous version, which doesn't work when the DomainTimer code is run in an async event loop:
from collections import defaultdict
from datetime import datetime, timedelta
import asyncio
import async_timeout
import aiohttp
from urllib.parse import urlparse
from queue import Queue, Empty

from HTMLProcessing import processHTML
import URLFilters

SEED_URLS = ['http://www.bbc.co.uk', 'http://www.news.google.com']
url_queue = Queue()
for u in SEED_URLS:
    url_queue.put(u)

# number of pages to download per run of crawlConcurrent()
BATCH_SIZE = 100
DELAY = timedelta(seconds = 1.0)  # delay between requests from single domain, in seconds

HTTP_HEADERS = {'Referer': 'http://www.google.com',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}

class DomainTimer():
    def __init__(self):
        self.timer = None

    def resetTimer(self):
        self.timer = datetime.now()

    def delayExceeded(self, delay):
        if not self.timer:  # We haven't fetched this before
            return True
        if (datetime.now() - self.timer) >= delay:
            return True
        else:
            return False

crawl_history = defaultdict(dict)  # given a URL, when is last time crawled?
domain_timers = defaultdict(DomainTimer)

async def fetch(session, url):
    domain = urlparse(url).netloc
    print('here fetching ' + url + "\n")
    dt = domain_timers[domain]
    if dt.delayExceeded(DELAY) or not dt:
        with async_timeout.timeout(10):
            try:
                dt.resetTimer()  # reset domain timer
                async with session.get(url, headers=HTTP_HEADERS) as response:
                    if response.status == 200:
                        crawl_history[url] = datetime.now()
                        html = await response.text()
                        return {'url': url, 'html': html}
                    else:
                        # log HTTP response, put into crawl_history so
                        # we don't attempt to fetch again
                        print(url + " failed with response: " + str(response.status) + "\n")
                        return {'url': url, 'http_status': response.status}
            except aiohttp.ClientConnectionError as e:
                print("Connection failed " + str(e))
            except aiohttp.ClientPayloadError as e:
                print("Recieved bad data from server @ " + url + "\n")
    else:  # Delay hasn't passed yet: skip for now & put @ end of q
        url_queue.put(url);
        return None

async def fetch_all(urls):
    """Launch requests for all web pages."""
    tasks = []
    async with aiohttp.ClientSession() as session:
        for url in urls:
            task = asyncio.ensure_future(fetch(session, url))
            tasks.append(task)  # create list of tasks
        return await asyncio.gather(*tasks)  # gather task responses

def batch_crawl():
    """Launch requests for all web pages."""
    start_time = datetime.now()

    # Here we build the list of URLs to crawl for this batch
    urls = []
    for i in range(BATCH_SIZE):
        try:
            next_url = url_queue.get_nowait()  # get next URL from queue
            urls.append(next_url)
        except Empty:
            print("Processed all items in URL queue.\n")
            break;

    loop = asyncio.get_event_loop()
    asyncio.set_event_loop(loop)
    pages = loop.run_until_complete(fetch_all(urls))
    crawl_time = (datetime.now() - start_time).seconds
    print("Crawl completed. Fetched " + str(len(pages)) + " pages in " + str(crawl_time) + " seconds.\n")
    return pages

def parse_html(pages):
    """ Parse the HTML for each page downloaded in this batch"""
    start_time = datetime.now()
    results = {}
    for p in pages:
        if not p or not p['html']:
            print("Received empty page")
            continue
        else:
            url, html = p['url'], p['html']
            results[url] = processHTML(html)
    processing_time = (datetime.now() - start_time).seconds
    print("HTML processing finished. Processed " + str(len(results)) + " pages in " + str(processing_time) + " seconds.\n")
    return results

def extract_new_links(results):
    """Extract links from """
    # later we could track where links were from here, anchor text, etc,
    # and weight queue priority based on that
    links = []
    for k in results.keys():
        new_urls = [l['href'] for l in results[k]['links']]
        for u in new_urls:
            if u not in crawl_history.keys():
                links.append(u)
    return links

def filterURLs(urls):
    urls = URLFilters.filterDuplicates(urls)
    urls = URLFilters.filterBlacklistedDomains(urls)
    return urls

def run_batch():
    pages = batch_crawl()
    results = parse_html(pages)
    links = extract_new_links(results)
    for l in filterURLs(links):
        url_queue.put(l)
    return results
There are no errors or exceptions thrown, and the rate-limiting code works fine for synchronous fetches, but the DomainTimer has no apparent effect when run in the async event loop. The delay of one request per second per domain is not upheld...
How would I modify this synchronous rate-limiting code to work within the async event loop? Thanks!
It's hard to debug your code since it contains a lot of unrelated stuff; it's easier to show the idea with a new, simple example.
Main idea:
write your own Semaphore-like class using __aenter__ and __aexit__ that accepts a url (domain)
use a domain-specific Lock to prevent multiple simultaneous requests to the same domain
sleep before allowing the next request, according to the domain's last request time and the allowed RPS
track the time of the last request for each domain
Code:
import asyncio
import aiohttp
from urllib.parse import urlparse
from collections import defaultdict

class Limiter:
    # domain -> req/sec:
    _limits = {
        'httpbin.org': 4,
        'eu.httpbin.org': 1,
    }

    # domain -> it's lock:
    _locks = defaultdict(lambda: asyncio.Lock())

    # domain -> it's last request time
    _times = defaultdict(lambda: 0)

    def __init__(self, url):
        self._host = urlparse(url).hostname

    async def __aenter__(self):
        await self._lock

        to_wait = self._to_wait_before_request()
        print(f'Wait {to_wait} sec before next request to {self._host}')
        await asyncio.sleep(to_wait)

    async def __aexit__(self, *args):
        print(f'Request to {self._host} just finished')

        self._update_request_time()
        self._lock.release()

    @property
    def _lock(self):
        """Lock that prevents multiple requests to same host."""
        return self._locks[self._host]

    def _to_wait_before_request(self):
        """What time we need to wait before request to host."""
        request_time = self._times[self._host]
        request_delay = 1 / self._limits[self._host]
        now = asyncio.get_event_loop().time()
        to_wait = request_time + request_delay - now
        to_wait = max(0, to_wait)
        return to_wait

    def _update_request_time(self):
        now = asyncio.get_event_loop().time()
        self._times[self._host] = now

# request that uses Limiter instead of Semaphore:
async def get(url):
    async with Limiter(url):
        async with aiohttp.ClientSession() as session:  # TODO reuse session for different requests.
            async with session.get(url) as resp:
                return await resp.text()

# main:
async def main():
    coros = [
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
    ]

    await asyncio.gather(*coros)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
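As the TODO in get() above notes, creating a ClientSession per request is wasteful. A small variation (my sketch, not part of the original answer) keeps one shared session and still wraps each call in the Limiter from above:

import asyncio
import aiohttp

# Sketch: one shared ClientSession; per-domain limiting is still handled by Limiter.
async def get(session, url):
    async with Limiter(url):
        async with session.get(url) as resp:
            return await resp.text()

async def main():
    urls = ['http://httpbin.org/get'] * 5 + ['http://eu.httpbin.org/get'] * 5
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(get(session, url) for url in urls))

if __name__ == '__main__':
    asyncio.run(main())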
I developed a library named octopus-api (https://pypi.org/project/octopus-api/) that lets you rate limit and set the number of connections to the endpoint, using aiohttp under the hood. Its goal is to simplify all the aiohttp setup needed.
Here is an example of how to use it, where get_ethereum is the user-defined request function. It could just as well have been a web crawler request function or whatever fits:
from octopus_api import TentacleSession, OctopusApi
from typing import Dict, List

if __name__ == '__main__':
    async def get_ethereum(session: TentacleSession, request: Dict):
        async with session.get(url=request["url"], params=request["params"]) as response:
            body = await response.json()
            return body

    client = OctopusApi(rate=50, resolution="sec", connections=6)
    result: List = client.execute(requests_list=[{
        "url": "https://api.pro.coinbase.com/products/ETH-EUR/candles?granularity=900&start=2021-12-04T00:00:00Z&end=2021-12-04T00:00:00Z",
        "params": {}}] * 1000, func=get_ethereum)
    print(result)
TentacleSession works the same way as aiohttp.ClientSession: you write your POST, GET, PUT and PATCH calls just as you would there.
Let me know if it helps with your rate-limit and connection needs for crawling.
