I am attempting to retrieve historical data concurrently from Binance for each crypto pair in my database. I am running into bans with APIErrors, stating "APIError(code=-1003): Way too much request weight used; IP banned until 1629399758399. Please use the websocket for live updates to avoid bans."
How can I add a time delay to prevent reaching the API request weight limit which is 1200 per 1 Minute?
here's what I have as of now
import numpy as np
import json
import requests
import datetime, time
import aiohttp, asyncpg, asyncio
from asyncio import gather, create_task
from binance.client import AsyncClient
from multiprocessing import Process
import time
import config
async def main():
# create database connection pool
pool = await asyncpg.create_pool(user=config.DB_USER, password=config.DB_PASS, database=config.DB_NAME, host=config.DB_HOST, command_timeout=60)
# get a connection
async with pool.acquire() as connection:
cryptos = await connection.fetch("SELECT * FROM crypto")
symbols = {}
for crypto in cryptos:
symbols[crypto['id']] = crypto['symbol']
await get_prices(pool, symbols)
async def get_prices(pool, symbols):
try:
# schedule requests to run concurrently for all symbols
tasks = [create_task(get_price(pool, crypto_id, symbols[crypto_id])) for crypto_id in symbols]
await gather(*tasks)
print("Finalized all. Retrieved price data of {} outputs.".format(len(tasks)))
except Exception as e:
print("Unable to fetch crypto prices due to {}.".format(e.__class__))
print(e)
async def get_price(pool, crypto_id, url):
try:
candlesticks = []
client = await AsyncClient.create(config.BINANCE_API_KEY, config.BINANCE_SECRET_KEY)
async for kline in await client.get_historical_klines_generator(f"{crypto_id}".format(), AsyncClient.KLINE_INTERVAL_1HOUR, "18 Aug, 2021", "19 Aug, 2021"):
candlesticks.append(kline)
df = pd.DataFrame(candlesticks, columns = ["date","open","high","low","close","volume","Close time","Quote Asset Volume","Number of Trades","Taker buy base asset volume","Taker buy quote asset volume","Ignore"])
df["date"] = pd.to_datetime(df.loc[:, "date"], unit ='ms')
df.drop(columns=['Close time','Ignore', 'Quote Asset Volume', 'Number of Trades', 'Taker buy base asset volume', 'Taker buy quote asset volume'], inplace=True)
df.loc[:, "id"] = crypto_id
df
print(df)
except Exception as e:
print("Unable to get {} prices due to {}.".format(url, e.__class__))
print(e)
start = time.time()
if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
end = time.time()
print("Took {} seconds.".format(end - start))
You can create an instance of a custom class that will keep the count of currently active requests (and timing of requests) - and only allow one request to proceed if that guard says it is ok.
Python´s async with command would be nice to use in such a construct since it can both guard a block, and decrease the active request count with minimal intervention in the code you already have.
This can proceed like this- the line in your code that actually trigger the requests is:
client = await AsyncClient.create(config.BINANCE_API_KEY, config.BINANCE_SECRET_KEY)
So, if we can ensure this line is called at most 1200 times per minute, having to yield to the mainloop while it does not happen, we are good.
While it would be possible to burst 1200 (-1) calls and them waiut one minute, the code will be both easier to write, and the API limit will be more respected in its spirit, if we simply yield one call each (60s / 1200) ( x 90% for a 10% nice margin) seconds.
The async with will call the class' __aenter__ method. In there we can simply check the time interval since the last API call and sleep until this time has passed.
(Actually, we will need one instance of the class per task, as __aenter__ needs to be called in each instance). But in order not to depend on a global "counter", we can create a factory function that will create a guard per API that needs limiting - and we keep that one in a global variable)
So, you can add this factory function to your program, and then create a guard-class on your main function and use "async with" inside the tasks code:
def create_rate_limit_guard(rate_limit=1200, safety_margin=0.9):
"""Rate limit is given in maximum requests per minute.
"""
# TBD: it would easy to have code to throttle by maximum active requests
# instead of total requests per minute.
# I will just let the accounting of concurrent_requests in place, though
class Guard:
request_interval = (60 / rate_limit) * safety_margin
current_requests = 0
max_concurrent_requests = 0
last_request = 0
async def __aenter__(self):
cls = self.__class__
cls.current_requests += 1
if (throttle_wait:= time.time() - last_request) < cls.request_interval:
await asyncio.sleep(throttle_wait)
cls.current_requests += 1
cls.last_request = time.time()
async def __aexit__(self, exc_type, exc, tb):
cls = self.__class__
cls.max_concurrent_requests = max(cls.max_concurrent_requests, cls.current_requests)
cls.current_requests -= 1
return Guard
And in your code, you could just change get_price to this, and create the guard class (last line before if ...__main__:
async def get_price(pool, crypto_id, url):
try:
candlesticks = []
# consider having a single client application wise - you are creating one per task.
with BinanceLimitGuard():
client = await AsyncClient.create(config.BINANCE_API_KEY, config.BINANCE_SECRET_KEY)
# as the actual calls to the remote endpoint are done inside the client code itself,
# we can't just run "async for" on the generator - instead we have to throttle
# all the "for" interactions. So we "unfold" the async for in a while/anext
# structure so that we can place the guard before each interation:
klines_generator = await client.get_historical_klines_generator(
f"{crypto_id}".format(), AsyncClient.KLINE_INTERVAL_1HOUR, "18 Aug, 2021", "19 Aug, 2021")
while True:
try:
with BinanceLimitGuard():
kline = await klines_generator.__anext__()
except StopAsyncIteration:
break
candlesticks.append(kline)
df = pd.DataFrame(candlesticks, columns = ["date","open","high","low","close","volume","Close time","Quote Asset Volume","Number of Trades","Taker buy base asset volume","Taker buy quote asset volume","Ignore"])
df["date"] = pd.to_datetime(df.loc[:, "date"], unit ='ms')
df.drop(columns=['Close time','Ignore', 'Quote Asset Volume', 'Number of Trades', 'Taker buy base asset volume', 'Taker buy quote asset volume'], inplace=True)
df.loc[:, "id"] = crypto_id
print(df)
except Exception as e:
print("Unable to get {} prices due to {}.".format(url, e.__class__))
print(e)
BinanceLimitGuard = create_rate_limit_guard(300)
if __name__ == "__main__":
# all code that is meant to take place when your file is run as a program
# should be guarded in this if block. Importing your file should not "print"
start = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
end = time.time()
print("Took {} seconds.".format(end - start))
Note that while I designed the guard to "1200 requests per minute" - I sugested a limit of "300" parallel tasks per minute above, in BinanceLimitGuard = create_rate_limit_guard(300) - because, checking the source code for the binance client itself, it does perform several requests of itself in a call to "get_historical_klines" - and that code has embedded a limit of 3 calls per second - but which take place per generator, so we can't account for them on the outside code.
If this still not work, it can be made to work by subclassing (or monkeypatching) the AsyncClient itself and placing the limit rate on its internal _request_api internal method, at this place https://github.com/sammchardy/python-binance/blob/a6f3048527f0f2fd9bc6591ac1fdd926b2a29f3e/binance/client.py#L330 - then you can go back to the "1200 limit" as it will account all internal calls. (drop a comment if you need to resort to this, I could complete this answer or add another one)
I am very new to asynchronous programming and I was playing around with httpx. I have the following code and I am sure I am doing something wrong - just don't know what it is. There are two methods, one synchronous and other asynchronous. They are both pull from google finance. On my system I am seeing the time spent as following:
Asynchronous: 5.015218734741211
Synchronous: 5.173618316650391
Here is the code:
import httpx
import asyncio
import time
#
#--------------------------------------------------------------------
#
#--------------------------------------------------------------------
#
def sync_pull(url):
r = httpx.get(url)
print(r.status_code)
#
#--------------------------------------------------------------------
#
#--------------------------------------------------------------------
#
async def async_pull(url):
async with httpx.AsyncClient() as client:
r = await client.get(url)
print(r.status_code)
#
#--------------------------------------------------------------------
#
#--------------------------------------------------------------------
#
if __name__ == "__main__":
goog_fin_nyse_url = 'https://www.google.com/finance/quote/'
tickers = ['F', 'TWTR', 'CVX', 'VZ', 'GME', 'GM', 'PG', 'AAL',
'MARK', 'AAP', 'THO', 'NGD', 'ZSAN', 'SEAC',
]
print("Running asynchronously...")
async_start = time.time()
for ticker in tickers:
url = goog_fin_nyse_url + ticker + ':NYSE'
asyncio.run(async_pull(url))
async_end = time.time()
print(f"Time lapsed is: {async_end - async_start}")
print("Running synchronously...")
sync_start = time.time()
for ticker in tickers:
url = goog_fin_nyse_url + ticker + ':NYSE'
sync_pull(url)
sync_end = time.time()
print(f"Time lapsed is: {sync_end - sync_start}")
I had hoped the asynchronous method approach would require a fraction of the time the synchronous approach is requiring. What am I doing wrong?
When you say asyncio.run(async_pull) you're saying run 'async_pull' and wait for the result to come back. Since you do this once per each ticker in your loop, you're essentially using asyncio to run things synchronously and won't see performance benefits.
What you need to do is create several async calls and run them concurrently. There are several ways to do this, the easiest is to use asyncio.gather (see https://docs.python.org/3/library/asyncio-task.html#asyncio.gather) which takes in a sequence of coroutines and runs them concurrently. Adapting your code is fairly straightforward, you create an async function to take a list of urls and then call async_pull on each of them and then pass that in to asyncio.gather and await the results. Adapting your code to this looks like the following:
import httpx
import asyncio
import time
def sync_pull(url):
r = httpx.get(url)
print(r.status_code)
async def async_pull(url):
async with httpx.AsyncClient() as client:
r = await client.get(url)
print(r.status_code)
async def async_pull_all(urls):
return await asyncio.gather(*[async_pull(url) for url in urls])
if __name__ == "__main__":
goog_fin_nyse_url = 'https://www.google.com/finance/quote/'
tickers = ['F', 'TWTR', 'CVX', 'VZ', 'GME', 'GM', 'PG', 'AAL',
'MARK', 'AAP', 'THO', 'NGD', 'ZSAN', 'SEAC',
]
print("Running asynchronously...")
async_start = time.time()
results = asyncio.run(async_pull_all([goog_fin_nyse_url + ticker + ':NYSE' for ticker in tickers]))
async_end = time.time()
print(f"Time lapsed is: {async_end - async_start}")
print("Running synchronously...")
sync_start = time.time()
for ticker in tickers:
url = goog_fin_nyse_url + ticker + ':NYSE'
sync_pull(url)
sync_end = time.time()
print(f"Time lapsed is: {sync_end - sync_start}")
Running this way, the asynchronous version runs in about a second for me as opposed to seven synchronously.
Here's a nice pattern I use (I tend to change it a little each time). In general, I make a module async_utils.py and just import the top-level fetching function (e.g. here fetch_things), and then my code is free to forget about the internals (other than error handling). You can do it in other ways, but I like the 'functional' style of aiostream, and often find the repeated calls to the process function take certain defaults I set using functools.partial.
Note: async currying with partials is Python 3.8+ only
You can pass in a tqdm.tqdm progress bar to pbar (initialised with known size total=len(things)) to have it update when each async response is processed.
import asyncio
import httpx
from aiostream import stream
from functools import partial
__all__ = ["fetch", "process", "async_fetch_urlset", "fetch_things"]
async def fetch(session, url, raise_for_status=False):
response = await session.get(str(url))
if raise_for_status:
response.raise_for_status()
return response
async def process_thing(data, things, pbar=None, verbose=False):
# Map the response back to the thing it came from in the things list
source_url = data.history[0].url if data.history else data.url
thing = next(t for t in things if source_url == t.get("thing_url"))
# Handle `data.content` here, where `data` is the `httpx.Response`
if verbose:
print(f"Processing {source_url=}")
build.update({"computed_value": "result goes here"})
if pbar:
pbar.update()
async def async_fetch_urlset(urls, things, pbar=None, verbose=False, timeout_s=10.0):
timeout = httpx.Timeout(timeout=timeout_s)
async with httpx.AsyncClient(timeout=timeout) as session:
ws = stream.repeat(session)
xs = stream.zip(ws, stream.iterate(urls))
ys = stream.starmap(xs, fetch, ordered=False, task_limit=20)
process = partial(process_thing, things=things, pbar=pbar, verbose=verbose)
zs = stream.map(ys, process)
return await zs
def fetch_things(urls, things, pbar=None, verbose=False):
return asyncio.run(async_fetch_urlset(urls, things, pbar, verbose))
In this example, the input is a list of dicts (with string keys and values), things: list[dict[str,str]], and the key "thing_url" is accessed to retrieve the URL. Having a dict or object is desirable instead of just the URL string for when you want to 'map' the result back to the object it came from. The process_thing function is able to modify the input list things in-place (i.e. any changes are not scoped within the function, they change it back in the scope that called it).
You'll often find errors arise during async runs that you don't get when running synchronously, so you'll need to catch them, and re-try. A common gotcha is to retry at the wrong level (e.g. around the entire loop)
In particular, you'll want to import and catch httpcore.ConnectTimeout, httpx.ConnectTimeout, httpx.RemoteProtocolError, and httpx.ReadTimeout.
Increasing the timeout_s parameter will reduce the frequency of the timeout errors by letting the AsyncClient 'wait' for longer, but doing so may in fact slow down your program (it won't "fail fast" quite as fast).
Here's an example of how to use the async_utils module given above:
from async_utils import fetch_things
import httpx
import httpcore
# UNCOMMENT THIS TO SEE ALL THE HTTPX INTERNAL LOGGING
#import logging
#log = logging.getLogger()
#log.setLevel(logging.DEBUG)
#log_format = logging.Formatter('[%(asctime)s] [%(levelname)s] - %(message)s')
#console = logging.StreamHandler()
#console.setLevel(logging.DEBUG)
#console.setFormatter(log_format)
#log.addHandler(console)
things = [
{"url": "https://python.org", "name": "Python"},
{"url": "https://www.python-httpx.org/", "name": "HTTPX"},
]
#log.debug("URLSET:" + str(list(t.get("url") for t in things)))
def make_urlset(things):
"""Make a URL generator (empty if all have been fetched)"""
urlset = (t.get("url") for t in things if "computed_value" not in t)
return urlset
retryable_errors = (
httpcore.ConnectTimeout,
httpx.ConnectTimeout, httpx.RemoteProtocolError, httpx.ReadTimeout,
)
# ASYNCHRONOUS:
max_retries = 100
for i in range(max_retries):
print(f"Retry {i}")
try:
urlset = make_urlset(things)
foo = fetch_things(urls=urlset, things=things, verbose=True)
except retryable_errors as exc:
print(f"Caught {exc!r}")
if i == max_retries - 1:
raise
except Exception:
raise
# SYNCHRONOUS:
#for t in things:
# resp = httpx.get(t["url"])
In this example I set a key "computed_value" on a dictionary once the async response has successfully been processed which then prevents that URL from being entered into the generator on the next round (when make_urlset is called again). In this way, the generator gets progressively smaller. You can also do it with lists but I find a generator of the URLs to be pulled works reliably. For an object you'd change the dictionary key assignment/access (update/in) to attribute assignment/access (settatr/hasattr).
I wanted to post working version of the coding using futures - virtually the same run-time:
import httpx
import asyncio
import time
#
#--------------------------------------------------------------------
# Synchronous pull
#--------------------------------------------------------------------
#
def sync_pull(url):
r = httpx.get(url)
print(r.status_code)
#
#--------------------------------------------------------------------
# Asynchronous Pull
#--------------------------------------------------------------------
#
async def async_pull(url):
async with httpx.AsyncClient() as client:
r = await client.get(url)
print(r.status_code)
#
#--------------------------------------------------------------------
# Build tasks queue & execute coroutines
#--------------------------------------------------------------------
#
async def build_task() -> None:
goog_fin_nyse_url = 'https://www.google.com/finance/quote/'
tickers = ['F', 'TWTR', 'CVX', 'VZ', 'GME', 'GM', 'PG', 'AAL',
'MARK', 'AAP', 'THO', 'NGD', 'ZSAN', 'SEAC',
]
tasks= []
#
## Following block of code will create a queue full of function
## call
for ticker in tickers:
url = goog_fin_nyse_url + ticker + ':NYSE'
tasks.append(asyncio.ensure_future(async_pull(url)))
start_time = time.time()
#
## This block of code will derefernce the function calls
## from the queue, which will cause them all to run
## rapidly
await asyncio.gather(*tasks)
#
## Calculate time lapsed
finish_time = time.time()
elapsed_time = finish_time - start_time
print(f"\n Time spent processing: {elapsed_time} ")
# Start from here
if __name__ == "__main__":
asyncio.run(build_task())
So here is my use case:
I read from a database rows containing information to make a complex SOAP call (I'm using zeep to do these calls).
One row from the database corresponds to a request to the service.
There can be up to 20 thousand lines, so I don't want to read everything in memory before making the calls.
I need to process the responses - when the
response is OK, I need to store some returned information back into
my database, and when there is an exception I need to process the
exception for that particular request/response pair.
I need also to capture some external information at the time of the request creation, so that I know where to store the response from the request. In my current code I'm using the delightful property of gather() that makes the results come in the same order.
I read the relevant PEPs and Python documentation but I'm still very confused, as there seems to be multiple ways to solve the same problem.
I also went through countless exercises on the web, but the examples are all trivial - it's either asyncio.sleep() or some webscraping with a finite list of urls.
The solution that I have come up so far kinda works - the asyncio.gather() method is very, very, useful, but I have not been able to 'feed' it from a generator. I'm currently just counting to an arbitrary size and then starting a .gather() operation. I've transcribed the code, with boring parts left out and I've tried to anonymise the code
I've tried solutions involving semaphores, queues, different event loops, but I'm failing every time. Ideally I'd like to be able to create Futures 'continuously' - I think I'm missing the logic of 'convert this awaitable call to a future'
I'd be grateful for any help!
import asyncio
from asyncio import Future
import zeep
from zeep.plugins import HistoryPlugin
history = HistoryPlugin()
max_concurrent_calls = 5
provoke_errors = True
def export_data_async(db_variant: str, order_nrs: set):
st = time.time()
results = []
loop = asyncio.get_event_loop()
def get_client1(service_name: str, system: Systems = Systems.ACME) -> Tuple[zeep.Client, zeep.client.Factory]:
client1 = zeep.Client(wsdl=system.wsdl_url(service_name=service_name),
transport=transport,
plugins=[history],
)
factory_ns2 = client1.type_factory(namespace='ns2')
return client1, factory_ns2
table = 'ZZZZ'
moveback_table = 'EEEEEE'
moveback_dict = create_default_empty_ordered_dict('attribute1 attribute2 attribute3 attribute3')
client, factory = get_client1(service_name='ACMEServiceName')
if log.isEnabledFor(logging.DEBUG):
client.wsdl.dump()
zeep_log = logging.getLogger('zeep.transports')
zeep_log.setLevel(logging.DEBUG)
with Db(db_variant) as db:
db.open_db(CON_STRING[db_variant])
db.init_table_for_read(table, order_list=order_nrs)
counter_failures = 0
tasks = []
sids = []
results = []
def handle_future(future: Future) -> None:
results.extend(future.result())
def process_tasks_concurrently() -> None:
nonlocal tasks, sids, counter_failures, results
futures = asyncio.gather(*tasks, return_exceptions=True)
futures.add_done_callback(handle_future)
loop.run_until_complete(futures)
for i, response_or_fault in enumerate(results):
if type(response_or_fault) in [zeep.exceptions.Fault, zeep.exceptions.TransportError]:
counter_failures += 1
log_webservice_fault(sid=sids[i], db=db, err=response_or_fault, object=table)
else:
db.write_dict_to_table(
moveback_table,
{'sid': sids[i],
'attribute1': response_or_fault['XXX']['XXX']['xxx'],
'attribute2': response_or_fault['XXX']['XXX']['XXXX']['XXX'],
'attribute3': response_or_fault['XXXX']['XXXX']['XXX'],
}
)
db.commit_db_con()
tasks = []
sids = []
results = []
return
for row in db.rows(table):
if int(row.id) % 2 == 0 and provoke_errors:
payload = faulty_message_payload(row=row,
factory=factory,
)
else:
payload = message_payload(row=row,
factory=factory,
)
tasks.append(client.service.myRequest(
MessageHeader=factory.MessageHeader(**message_header_arguments(row=row)),
myRequestPayload=payload,
_soapheaders=[security_soap_header],
))
sids.append(row.sid)
if len(tasks) == max_concurrent_calls:
process_tasks_concurrently()
if tasks: # this is the remainder of len(db.rows) % max_concurrent_calls
process_tasks_concurrently()
loop.run_until_complete(transport.session.close())
db.execute_this_statement(statement=update_sql)
db.commit_db_con()
log.info(db.activity_log)
if counter_failures:
log.info(f"{table :<25} Count failed: {counter_failures}")
print("time async: %.2f" % (time.time() - st))
return results
Failed attempt with Queue: (blocks at await client.service)
loop = asyncio.get_event_loop()
counter = 0
results = []
async def payload_generator(db_variant: str, order_nrs: set):
# code that generates the data for the request
yield counter, row, payload
async def service_call_worker(queue, results):
while True:
counter, row, payload = await queue.get()
results.append(await client.service.myServicename(
MessageHeader=calculate_message_header(row=row)),
myPayload=payload,
_soapheaders=[security_soap_header],
)
)
print(colorama.Fore.BLUE + f'after result returned {counter}')
# Here do the relevant processing of response or error
queue.task_done()
async def main_with_q():
n_workers = 3
queue = asyncio.Queue(n_workers)
e = pprint.pformat(queue)
p = payload_generator(DB_VARIANT, order_list_from_args())
results = []
workers = [asyncio.create_task(service_call_worker(queue, results))
for _ in range(n_workers)]
async for c in p:
await queue.put(c)
await queue.join() # wait for all tasks to be processed
for worker in workers:
worker.cancel()
if __name__ == '__main__':
try:
loop.run_until_complete(main_with_q())
loop.run_until_complete(transport.session.close())
finally:
loop.close()
I am trying to use aiohttp to make asynchronous HTTP requests over multiple SOCKS proxies. Basically, I am creating a pool of Tor clients with different IP addresses, and want to be able to route HTTP requests through them using aiohttp.
Based on the suggestions here and here, I have been trying to use aiosocks, but the examples in those threads do not work (if they ever did) because they are based on an old version of aiosocks with a different API. Documentation and examples of using aiosocks online are very sparse (it doesn't seem widely used). But I haven't been able to find any other solutions for using aiohttp with SOCKS proxies.
Below is the code I have so far (sorry for the large amount of code - I tried to slim down the example as much as I could!). First I initialize the Tor clients with stem:
from datetime import datetime
import stem.process
from TorUtils import printCircuits, cleanShutdown
NUM_TOR_CLIENTS = 3
# create list of (source_port, control_port) tuples
tor_ports = [(str(9050 + i), str(9050 + NUM_TOR_CLIENTS + i)) for i in range(NUM_TOR_CLIENTS)]
# Every ISO 3166 country code except for {US} and {CA}
country_codes = '{AF}, {AX}, {AL}, {DZ}, {AS}, {AD}, {AO}, {AI}, {AQ}, {AG}, {AR}, {AM}, {AW}, {AU}, {AT}, {AZ}, {BS}, {BH}, {BD}, {BB}, {BY}, {BE}, {BZ}, {BJ}, {BM}, {BT}, {BO}, {BQ}, {BA}, {BW}, {BV}, {BR}, {IO}, {BN}, {BG}, {BF}, {BI}, {KH}, {CM}, {CV}, {KY}, {CF}, {TD}, {CL}, {CN}, {CX}, {CC}, {CO}, {KM}, {CG}, {CD}, {CK}, {CR}, {CI}, {HR}, {CU}, {CW}, {CY}, {CZ}, {DK}, {DJ}, {DM}, {DO}, {EC}, {EG}, {SV}, {GQ}, {ER}, {EE}, {ET}, {FK}, {FO}, {FJ}, {FI}, {FR}, {GF}, {PF}, {TF}, {GA}, {GM}, {GE}, {DE}, {GH}, {GI}, {GR}, {GL}, {GD}, {GP}, {GU}, {GT}, {GG}, {GN}, {GW}, {GY}, {HT}, {HM}, {VA}, {HN}, {HK}, {HU}, {IS}, {IN}, {ID}, {IR}, {IQ}, {IE}, {IM}, {IL}, {IT}, {JM}, {JP}, {JE}, {JO}, {KZ}, {KE}, {KI}, {KP}, {KR}, {KW}, {KG}, {LA}, {LV}, {LB}, {LS}, {LR}, {LY}, {LI}, {LT}, {LU}, {MO}, {MK}, {MG}, {MW}, {MY}, {MV}, {ML}, {MT}, {MH}, {MQ}, {MR}, {MU}, {YT}, {MX}, {FM}, {MD}, {MC}, {MN}, {ME}, {MS}, {MA}, {MZ}, {MM}, {NA}, {NR}, {NP}, {NL}, {NC}, {NZ}, {NI}, {NE}, {NG}, {NU}, {NF}, {MP}, {NO}, {OM}, {PK}, {PW}, {PS}, {PA}, {PG}, {PY}, {PE}, {PH}, {PN}, {PL}, {PT}, {PR}, {QA}, {RE}, {RO}, {RU}, {RW}, {BL}, {SH}, {KN}, {LC}, {MF}, {PM}, {VC}, {WS}, {SM}, {ST}, {SA}, {SN}, {RS}, {SC}, {SL}, {SG}, {SX}, {SK}, {SI}, {SB}, {SO}, {ZA}, {GS}, {SS}, {ES}, {LK}, {SD}, {SR}, {SJ}, {SZ}, {SE}, {CH}, {SY}, {TW}, {TJ}, {TZ}, {TH}, {TL}, {TG}, {TK}, {TO}, {TT}, {TN}, {TR}, {TM}, {TC}, {TV}, {UG}, {UA}, {AE}, {GB}, {UM}, {UY}, {UZ}, {VU}, {VE}, {VN}, {VG}, {VI}, {WF}, {EH}, {YE}, {ZM}, {ZW}'
tor_configs = [{'SOCKSPort': p[0], 'ControlPort': p[1], 'DataDirectory': './.tordata' + p[0],
'CookieAuthentication' : '1', 'MaxCircuitDirtiness': '3600', 'ExcludeNodes': country_codes,
'EntryNodes': '{us}, {ca}', 'ExitNodes': '{us}, {ca}', 'StrictNodes': '1',
'GeoIPExcludeUnknown': '1', 'EnforceDistinctSubnets': '0'
} for p in tor_ports]
print(f"Spawning {NUM_TOR_CLIENTS} tor clients ...")
start_time = datetime.now()
tor_clients = []
for cfg in tor_configs:
tor_clients.append({'config': cfg, 'process': stem.process.launch_tor_with_config(config = cfg)})
... and then I am trying to use the following code to make the HTTP requests with aiohttp:
from collections import defaultdict, deque
from datetime import datetime, timedelta
import asyncio
import aiohttp
import aiosocks
from aiosocks.connector import ProxyConnector, ProxyClientRequest
import async_timeout
TIMEOUT = 10
async def _get(url, session, proxy, request_limiter):
try:
async with request_limiter: # semaphore to limit number of concurrent requests
async with async_timeout.timeout(TIMEOUT):
async with session.get(url, proxy=proxy, proxy_auth=None) as resp:
status = int(resp.status)
headers = dict(resp.headers)
content_type = str(resp.content_type)
text = await resp.text()
return {'url': url, 'status': status, 'headers': headers, 'text': str(text), 'errors': None}
except asyncio.TimeoutError as e:
queue.visited_urls[url] = datetime.now()
return {'url': url, 'status': None, 'headers': None, 'text': None, 'errors': str(e)}
async def _getPagesTasks(url_list, tor_clients, request_limiter, loop):
"""Launch requests for all web pages."""
#deque rotates continuously through SOCKS sessions for each tor client ...
sessions = deque()
for tor_client in tor_clients:
conn = ProxyConnector()
session = aiohttp.ClientSession(connector=conn, request_class=ProxyClientRequest)
sessions.append({'proxy': 'http://127.0.0.1:' + tor_client['config']['SOCKSPort'], 'session': session})
tasks = []
task_count = 0
for url in url_list:
s = sessions.popleft();
session = s['session']
proxy = s['proxy']
task = loop.create_task(_get(url, session, proxy, request_limiter))
tasks.append(task)
task_count += 1
session.append(s)
results = await asyncio.gather(*tasks)
for s in sessions:
s.close()
return results
def getPages(url_list, tor_clients):
"""Given a URL list, dispatch pool of tor clients to concurrently fetch URLs"""
request_limiter = asyncio.Semaphore(len(tor_clients)) # limit to one request per client at a time
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
responses = loop.run_until_complete(_getPagesTasks(url_list, tor_clients, request_limiter, loop))
loop.close()
return responses
This code is not running, however. When I try to run it, I get the error below. I'm wondering if I'm doing something wrong, or if this is some problem with aiosocks (which seems like it's been unmaintained for a while, and might be targetting an older version of aiohttp or something ...):
~/Code/gis project/code/TorGetQueue.py in _getPagesTasks(url_list, tor_clients, request_limiter, loop)
50 sessions = deque()
51 for client in tor_clients:
---> 52 conn = ProxyConnector()
53 session = aiohttp.ClientSession(connector=conn, request_class=ProxyClientRequest)
54 sessions.append({'proxy': 'http://127.0.0.1:' + client['config']['SOCKSPort'], 'session': session})
~/.local/share/virtualenvs/code-pIyQci_2/lib/python3.6/site-packages/aiosocks/connector.py in __init__(self, verify_ssl, fingerprint, resolve, use_dns_cache, family, ssl_context, local_addr, resolver, keepalive_timeout, force_close, limit, limit_per_host, enable_cleanup_closed, loop, remote_resolve)
54 force_close=force_close, limit=limit, loop=loop,
55 limit_per_host=limit_per_host, use_dns_cache=use_dns_cache,
---> 56 enable_cleanup_closed=enable_cleanup_closed)
57
58 self._remote_resolve = remote_resolve
TypeError: __init__() got an unexpected keyword argument 'resolve'
What am I doing wrong here? Is there an easier way to use SOCKS proxies with aiohttp? What do I need to change to make this code work with aiosocks?
Thanks!
I tried using aiosocks for my project to get the same error as yours only to later discover that aiosocks has been abandoned.
You can use aiosocksy instead.
import asyncio
import aiohttp
from aiosocksy import Socks5Auth
from aiosocksy.connector import ProxyConnector, ProxyClientRequest
async def fetch(url):
connector = ProxyConnector()
socks = 'socks5://127.0.0.1:9050'
async with aiohttp.ClientSession(connector=connector, request_class=ProxyClientRequest) as session:
async with session.get(url, proxy=socks) as response:
print(await response.text())
loop = asyncio.get_event_loop()
loop.run_until_complete(fetch('http://httpbin.org/ip'))