I want to use the ProxyBroker lib in my python program to generate a list/queue of 10 working proxies.
Unfortunately I was not able to find anything similar in the example page of the lib.
This is what I got right now, but it feels like I'm using asyncio the wrong way to complete my task. Especially the gather function I'm using in combination with the collect(proxies) call.
def get_proxies(self, limit=10):
    """Return a list of up to *limit* working HTTP(S) proxies.

    A ``Broker`` searches for proxies and pushes them onto a queue while
    a small consumer coroutine drains that queue into a list.  The
    consumer stops at the first ``None`` it reads from the queue.

    :param limit: maximum number of proxies to look for.
    :return: list of the proxy objects taken from the queue.
    """
    async def collect(queue):
        found = []
        while True:
            proxy = await queue.get()
            if proxy is None:
                break
            found.append(proxy)
        return found

    proxies = asyncio.Queue()
    broker = Broker(proxies)

    async def find_and_collect():
        # gather() returns one result per awaitable; only the collector's
        # list is of interest, find()'s own result is discarded.
        _, found = await asyncio.gather(
            broker.find(types=['HTTP', 'HTTPS'], limit=limit),
            collect(proxies))
        return found

    # The loop is intentionally left open: closing the shared event loop
    # here would make every later call to this method fail.
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(find_and_collect())
What would be the preferred/correct way of generating the proxy list?
You can do it like this:
"""Find and show 10 working HTTP(S) proxies."""
import asyncio
from proxybroker import Broker
async def show(proxies):
    """Print every item taken from *proxies* until ``None`` is read."""
    while (proxy := await proxies.get()) is not None:
        print('Found proxy: %s' % proxy)
# Queue handed to the Broker and read by show().
proxies = asyncio.Queue()
broker = Broker(proxies)
# Run the search and the printing consumer together; show() returns once
# it reads None from the queue.
tasks = asyncio.gather(
    broker.find(types=['HTTP', 'HTTPS'], limit=10),
    show(proxies))
loop = asyncio.get_event_loop()
loop.run_until_complete(tasks)
or if you want to generate a file:
import asyncio
from proxybroker import Broker
async def save(proxies, filename):
    """Write proxies from the queue to *filename*, one URL per line.

    Each line has the form ``scheme://host:port``; the scheme is
    ``https`` when the proxy lists ``'HTTPS'`` among its types and
    ``http`` otherwise.  Reading stops at the first ``None``.
    """
    with open(filename, 'w') as out:
        while (proxy := await proxies.get()) is not None:
            if 'HTTPS' in proxy.types:
                proto = 'https'
            else:
                proto = 'http'
            out.write('%s://%s:%d\n' % (proto, proxy.host, proxy.port))
def main():
    """Look for 10 HTTP(S) proxies and store them in ``proxies.txt``."""
    queue = asyncio.Queue()
    finder = Broker(queue).find(types=['HTTP', 'HTTPS'], limit=10)
    writer = save(queue, filename='proxies.txt')
    # Producer and consumer run side by side on the same loop.
    tasks = asyncio.gather(finder, writer)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(tasks)


if __name__ == '__main__':
    main()
That's all!
Good luck!
Related
I need to check several hundred proxy servers and count how many of them are not working. This is my script for it:
import urllib.request
import socket
# Proxies to probe.
net = ['http://192.168.1.1:8080',
       'http://192.168.1.2:8080',
       'http://192.168.1.3:8080',
       'http://192.168.1.4:8080',
       'http://192.168.1.5:8080',
       'http://192.168.1.6:8080',
       'http://192.168.1.7:8080',
       'http://192.168.1.8:8080',
       'http://192.168.1.9:8080',
       'http://192.168.1.10:8080']
# Number of proxies that could not be used.
fail = 0
socket.setdefaulttimeout(3)
for x in net:
    print(x)
    # Use a private opener per proxy.  install_opener() would replace the
    # process-wide default opener on every iteration.
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({'http': x}))
    try:
        # Opening the URL is enough to prove the proxy answers, and unlike
        # urlretrieve() it does not leave a temporary file behind.
        with opener.open('http://google.com'):
            pass
    except IOError:
        print("Connection error")
        fail += 1
print(fail)
The proxy list above is a shortened sample.
It takes 55 seconds to check 250 working proxies. I can't wait that long, so I need to speed up the check.
How can this be done using async?
This should give you an idea of how to approach it. You have to wrap the various connection blocks in try, except yourself.
NOTE: This code is not tested as I do not have any way of doing so.
import asyncio, aiohttp
def returnPartionedList(inputlist, x=100):
    """Split *inputlist* into consecutive slices of at most *x* items.

    Returns a list of slices in the original order; the last slice may
    be shorter than *x*.
    """
    segments = []
    for start in range(0, len(inputlist), x):
        segments.append(inputlist[start:start + x])
    return segments
async def TestProxy(url, proxy, session):
    """Return *proxy* if *url* answers with status 200 through it.

    :param url: address requested through the proxy.
    :param proxy: proxy URL passed to ``session.get``.
    :param session: aiohttp client session used for the request.
    :return: *proxy* on success, ``None`` for any other status, a
        connection error or a timeout.
    """
    try:
        async with session.get(url, proxy=proxy, timeout=3) as response:
            if response.status == 200:
                await response.text()
                return proxy
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # A dead or slow proxy is an expected outcome of the check, not
        # an error: report it as "not working" instead of raising.
        pass
    return None
async def TestProxies(listofproxies):
    """Check every proxy in *listofproxies* and return the working ones.

    The proxies are tested in batches of 20 with a one second pause
    between batches.

    :param listofproxies: list of proxy URLs.
    :return: list of the proxies for which ``TestProxy`` returned a
        truthy value.
    """
    returnResults = []
    url = "https://google.com"  # Test proxy with this url
    ProxyPartitions = returnPartionedList(listofproxies, 20)  # Rate limit 20 per second
    # One session serves all batches; there is no need to rebuild it.
    async with aiohttp.ClientSession() as session:
        for index, partition in enumerate(ProxyPartitions):
            if index:
                # Pause between batches only, not after the last one.
                await asyncio.sleep(1)
            results = await asyncio.gather(
                *(TestProxy(url, proxy, session) for proxy in partition),
                # Collect failures as values so one bad proxy cannot
                # discard the results of all the others.
                return_exceptions=True)
            returnResults.extend(
                result for result in results
                if result and not isinstance(result, BaseException))
    return returnResults
async def main():
    """Test the sample proxies and print those that passed."""
    candidates = [
        'http://10.10.1.1:8080',
        'http://10.10.1.2:8080',
        'http://10.10.1.3:8080',
        'http://10.10.1.4:8080',
        'http://10.10.1.5:8080',
        'http://10.10.1.6:8080',
        'http://10.10.1.7:8080',
        'http://10.10.1.8:8080',
        'http://10.10.1.9:8080',
        'http://10.10.1.10:8080',
    ]
    print(await TestProxies(candidates))


if __name__ == "__main__":
    asyncio.run(main())
I've inherited some code that utilizes Python Bleak to scan for advertisements emitted from a certain device. Whenever an advertisement from the Bluetooth mac address and service id we're looking for is detected and a certain condition from the extracted payload information is true, we want to terminate and return. In the attached code, I've masked the Bluetooth and service ID:s.
Not being too familiar with the event loop, is there a way to exit before the timer runs out? I suppose there's probably a better way to approach this problem.
Sample code:
import asyncio
import struct
from bleak import BleakScanner
# How long run() sleeps, in seconds, before it stops the scanner.
timeout_seconds = 10
# Value compared against device.address in detection_callback().
address_to_look_for = 'masked'
# Key looked up in advertisement_data.service_data.
service_id_to_look_for = 'masked'
def detection_callback(device, advertisement_data):
    """Report when the watched device advertises the expected value.

    Advertisements from other addresses, without data for the watched
    service, or with fewer than four bytes of data are ignored.

    :param device: object whose ``address`` identifies the sender.
    :param advertisement_data: object whose ``service_data`` mapping
        holds the raw payload per service id.
    """
    if device.address != address_to_look_for:
        return
    byte_data = advertisement_data.service_data.get(service_id_to_look_for)
    if byte_data is None or len(byte_data) < struct.calcsize('<I'):
        return
    # unpack_from() always returns a tuple; unpack its single field so the
    # comparison below is int against int (a tuple never equals 1).
    num_to_test, = struct.unpack_from('<I', byte_data, 0)
    if num_to_test == 1:
        print('here we want to terminate')
async def run():
    """Scan for ``timeout_seconds`` seconds, then stop the scanner."""
    scanner = BleakScanner()
    scanner.register_detection_callback(detection_callback)
    await scanner.start()
    try:
        await asyncio.sleep(timeout_seconds)
    finally:
        # Stop the scanner even if the sleep is cancelled or raises.
        await scanner.stop()


if __name__=='__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(run())
I'm sure there are many ways this can be done. A small modification to your code: rather than having asyncio.sleep run for the full period before you stop the scan, you could have a while loop that ends either when the time has elapsed or when the device-found event occurs.
For example:
import asyncio
import struct
from bleak import BleakScanner
# Maximum scan duration in seconds, added to the loop clock in MyScanner.run().
timeout_seconds = 20
# Value compared against device.address in the detection callback.
address_to_look_for = 'F1:D9:3B:39:4D:A2'
# Key looked up in advertisement_data.service_data.
service_id_to_look_for = '0000feaa-0000-1000-8000-00805f9b34fb'
class MyScanner:
    """Scan until the watched device reports the expected value or the
    timeout expires, whichever comes first."""

    def __init__(self):
        self._scanner = BleakScanner()
        self._scanner.register_detection_callback(self.detection_callback)
        # Set while a scan is in progress; cleared to make run() stop.
        self.scanning = asyncio.Event()

    def detection_callback(self, device, advertisement_data):
        """Clear ``scanning`` when the watched device sends the value."""
        # Looking for:
        # AdvertisementData(service_data={
        # '0000feaa-0000-1000-8000-00805f9b34fb': b'\x00\xf6\x00\x00\x00Jupiter\x00\x00\x00\x00\x00\x0b'},
        # service_uuids=['0000feaa-0000-1000-8000-00805f9b34fb'])
        if device.address != address_to_look_for:
            return
        byte_data = advertisement_data.service_data.get(service_id_to_look_for)
        # Skip advertisements that carry no (or too little) data for the
        # watched service instead of raising inside the callback.
        if byte_data is None or len(byte_data) < struct.calcsize('<I'):
            return
        num_to_test, = struct.unpack_from('<I', byte_data, 0)
        if num_to_test == 62976:
            print('\t\tDevice found so we terminate')
            self.scanning.clear()

    async def run(self):
        """Start scanning and return once ``scanning`` is cleared."""
        # Ask asyncio for the loop rather than relying on a module-level
        # ``loop`` name that only exists when the file runs as a script.
        loop = asyncio.get_event_loop()
        await self._scanner.start()
        try:
            self.scanning.set()
            end_time = loop.time() + timeout_seconds
            while self.scanning.is_set():
                if loop.time() > end_time:
                    self.scanning.clear()
                    print('\t\tScan has timed out so we terminate')
                await asyncio.sleep(0.1)
        finally:
            # Stop the scanner even when the wait is cancelled.
            await self._scanner.stop()
if __name__ == '__main__':
    my_scanner = MyScanner()
    # `loop` is assigned at module level here, so it is visible as a
    # global to other code in this file.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(my_scanner.run())
I have a set of CPU-intensive processes that once in a while depend on each other to proceed. So something like
def run():
while True:
do stuff
wake up some other process
wait for some other process to wake me up
do stuff
Within each process I'd like to use async, so that I can always have an instance of run running while others are waiting to be woken up. Looking at the asyncio docs, the only IPC option in the "High-level APIs" section that I see uses sockets. I'd much rather use a pipe, which it looks like I can perhaps do with the low-level API, but that documentation is chock full of warnings that if you're just writing an application then it's a mistake to be using it. Can someone weigh in on the idiomatic thing to do here? (And also, speed is an important factor, so if there's some less-idiomatic-but-more-performant thing I'd like to know about that option as well.)
I would like to mention the aioprocessing library, as I have used it successfully in one of my projects. It provides an async interface to the multiprocessing primitives, including IPC, such as Process, Pipe, Lock, Queue, etc. It uses a thread pool to do this:
...
@staticmethod
def coro_maker(func):
    """Build a method that dispatches the attribute named *func*.

    The returned function looks up ``getattr(self, func)`` and passes it,
    with the call's arguments, to ``self.run_in_executor``.
    """
    def coro_func(self, *args, loop=None, **kwargs):
        target = getattr(self, func)
        return self.run_in_executor(target, *args, loop=loop, **kwargs)

    return coro_func
But to be honest, a lot depends on the problem being solved, on what tasks are being performed concurrently, since the intensive IPC itself within the async approach is less effective than the synchronous approach due to overhead of event loop, thread pool and etc. Sometimes it is better to make all IPC operations synchronous and put it all in a separate thread. Again, it all depends on the problem and the environment. Below is a benchmark that is far from comprehensive, but it can give an approximate picture of the problem that is being solved in it (intensive exchange of buffers).
note: I wrote about the difference between a Queue and SimpleQueue here
Sync SimpleQueue: 1.4309470653533936
AioSimpleQueue: 12.32670259475708
AioQueue: 14.342737436294556
AioPipe: 11.747064590454102
subprocess pipe stream: 7.344956159591675
socket stream: 4.360717058181763
# main.py
import sys
import time
import asyncio
import aioprocessing as ap
import multiprocessing as mp
import proc
# Number of send/receive round trips each benchmark performs.
count = 5*10**4
# Payload sent on every round trip (100 bytes).
data = b'*'*100
async def sync_simple_queue_func():
    """Benchmark blocking ``multiprocessing.SimpleQueue`` round trips.

    Sends ``data`` to a child process ``count`` times, waiting for the
    reply after every send, prints the elapsed time and then tells the
    child to stop by sending ``None``.
    """
    out_ = mp.SimpleQueue()
    in_ = mp.SimpleQueue()
    p = ap.AioProcess(target=proc.start_sync_queue_func, args=(out_, in_))
    p.start()
    # perf_counter() is monotonic, so clock adjustments cannot skew the
    # measured interval.
    begin_ts = time.perf_counter()
    for _ in range(count):
        out_.put(data)
        in_.get()
    print('Sync SimpleQueue: ', time.perf_counter() - begin_ts)
    out_.put(None)
    # Wait for the child to exit, as pipe_func() does, so it is not left
    # running next to the following benchmark.
    await p.coro_join()
async def simple_queue_func():
    """Benchmark ``AioSimpleQueue`` round trips with a child process.

    Sends ``data`` ``count`` times, awaiting the reply after every send,
    prints the elapsed time and then sends ``None`` to stop the child.
    """
    out_ = ap.AioSimpleQueue()
    in_ = ap.AioSimpleQueue()
    p = ap.AioProcess(target=proc.start_queue_func, args=(out_, in_))
    p.start()
    # Monotonic clock for interval timing.
    begin_ts = time.perf_counter()
    for _ in range(count):
        await out_.coro_put(data)
        await in_.coro_get()
    print('AioSimpleQueue: ', time.perf_counter() - begin_ts)
    await out_.coro_put(None)
    # Wait for the child to exit, as pipe_func() does.
    await p.coro_join()
async def queue_func():
    """Benchmark ``AioQueue`` round trips with a child process.

    Sends ``data`` ``count`` times, awaiting the reply after every send,
    prints the elapsed time and then sends ``None`` to stop the child.
    """
    out_ = ap.AioQueue()
    in_ = ap.AioQueue()
    p = ap.AioProcess(target=proc.start_queue_func, args=(out_, in_))
    p.start()
    # Monotonic clock for interval timing.
    begin_ts = time.perf_counter()
    for _ in range(count):
        await out_.coro_put(data)
        await in_.coro_get()
    print('AioQueue: ', time.perf_counter() - begin_ts)
    await out_.coro_put(None)
    # Wait for the child to exit, as pipe_func() does.
    await p.coro_join()
async def pipe_func():
    """Benchmark ``AioPipe`` round trips with a child process.

    Sends ``data`` ``count`` times, awaiting the reply after every send,
    prints the elapsed time, sends ``None`` to stop the child and waits
    for it to exit.
    """
    main_, child_ = ap.AioPipe()
    p = ap.AioProcess(target=proc.start_pipe_func, args=(child_,))
    p.start()
    # Monotonic clock for interval timing.
    begin_ts = time.perf_counter()
    for _ in range(count):
        await main_.coro_send(data)
        await main_.coro_recv()
    print('AioPipe: ', time.perf_counter() - begin_ts)
    await main_.coro_send(None)
    await p.coro_join()
# The listening server created by socket_func().
server = None


async def handle_child(reader, writer):
    """Run the socket benchmark against one connected child.

    Sends ``data`` ``count`` times, reading a full-size reply after every
    send, prints the elapsed time and closes the connection.
    """
    begin_ts = time.perf_counter()
    for _ in range(count):
        writer.write(data)
        await writer.drain()
        # read(n) may return fewer than n bytes; readexactly() keeps each
        # round trip aligned with one complete payload.
        await reader.readexactly(len(data))
    print('socket stream: ', time.perf_counter() - begin_ts)
    writer.close()
    await writer.wait_closed()


async def socket_func():
    """Serve one benchmark connection on 127.0.0.1:8888, then return."""
    global server
    addr = ('127.0.0.1', 8888)
    finished = asyncio.Event()

    async def serve_once(reader, writer):
        # Signal completion even if the benchmark connection fails, so
        # socket_func() cannot wait forever.
        try:
            await handle_child(reader, writer)
        finally:
            finished.set()

    server = await asyncio.start_server(serve_once, *addr)
    p = ap.AioProcess(target=proc.start_socket_func, args=(addr,))
    p.start()
    async with server:
        # serve_forever() would never return; wait only until the single
        # benchmark connection has been handled.
        await finished.wait()
    await p.coro_join()
async def subprocess_func():
    """Benchmark round trips over a subprocess's stdin/stdout pipes.

    Starts ``proc.py``, sends ``data`` ``count`` times while reading a
    full-size reply after every send, prints the elapsed time, closes the
    child's stdin and waits for it to exit.
    """
    # Run the child with the current interpreter and without a shell.
    prog = await asyncio.create_subprocess_exec(
        sys.executable, 'proc.py',
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE)
    begin_ts = time.perf_counter()
    for _ in range(count):
        prog.stdin.write(data)
        await prog.stdin.drain()
        # read(n) may return a partial payload; wait for all of it.
        await prog.stdout.readexactly(len(data))
    print('subprocess pipe stream: ', time.perf_counter() - begin_ts)
    prog.stdin.close()
    await prog.wait()
async def main():
    """Run every benchmark, one after another."""
    await sync_simple_queue_func()
    await simple_queue_func()
    await queue_func()
    await pipe_func()
    await subprocess_func()
    await socket_func()


# Guarded so that child processes which import this module (the "spawn"
# start method does) do not start the whole benchmark again.
if __name__ == '__main__':
    asyncio.run(main())
# proc.py
import asyncio
import sys
import aioprocessing as ap
async def sync_queue_func(in_, out_):
    """Echo items from *in_* to *out_* with blocking calls until ``None``."""
    while (item := in_.get()) is not None:
        out_.put(item)
async def queue_func(in_, out_):
    """Echo items from *in_* to *out_* via ``coro_get``/``coro_put`` until ``None``."""
    while (item := await in_.coro_get()) is not None:
        await out_.coro_put(item)
async def pipe_func(child):
    """Send back everything received on *child* until ``None`` arrives."""
    while (message := await child.coro_recv()) is not None:
        await child.coro_send(message)
# Only its length is used in this module: it sizes the reads in
# socket_func() and main().
data = b'*' * 100
async def socket_func(addr):
    """Connect to *addr* and echo everything received until EOF.

    :param addr: ``(host, port)`` pair passed to ``open_connection``.
    """
    reader, writer = await asyncio.open_connection(*addr)
    try:
        while True:
            n = await reader.read(len(data))
            if not n:
                break
            writer.write(n)
            # Let the transport flush so the write buffer stays bounded.
            await writer.drain()
    finally:
        # Release the connection instead of leaving it to process exit.
        writer.close()
        await writer.wait_closed()
def start_sync_queue_func(in_, out_):
    """Process entry point: run ``sync_queue_func`` on a new event loop."""
    echo = sync_queue_func(in_, out_)
    asyncio.run(echo)


def start_queue_func(in_, out_):
    """Process entry point: run ``queue_func`` on a new event loop."""
    echo = queue_func(in_, out_)
    asyncio.run(echo)


def start_pipe_func(child):
    """Process entry point: run ``pipe_func`` on a new event loop."""
    echo = pipe_func(child)
    asyncio.run(echo)


def start_socket_func(addr):
    """Process entry point: run ``socket_func`` on a new event loop."""
    echo = socket_func(addr)
    asyncio.run(echo)
async def connect_stdin_stdout():
    """Return a ``(reader, writer)`` stream pair bound to stdin/stdout."""
    loop = asyncio.get_event_loop()
    reader = asyncio.StreamReader()
    reader_protocol = asyncio.StreamReaderProtocol(reader)
    # The write pipe is given its own plain Protocol instance.
    write_protocol = asyncio.Protocol()
    await loop.connect_read_pipe(lambda: reader_protocol, sys.stdin)  # sets read_transport
    write_transport, _ = await loop.connect_write_pipe(
        lambda: write_protocol, sys.stdout)
    stream_writer = asyncio.StreamWriter(
        write_transport, reader_protocol, reader, loop)
    return reader, stream_writer
async def main():
    """Echo stdin to stdout in ``len(data)``-sized reads until EOF."""
    reader, writer = await connect_stdin_stdout()
    chunk_size = len(data)
    while res := await reader.read(chunk_size):
        writer.write(res)


if __name__ == "__main__":
    asyncio.run(main())
I have some equipment with http interface that frequently generate infinite http page with values I want to parse and save to the database.
I started with requests:
import asyncio
import asyncpg
import requests
class node_http_mtr():
    """Reader for one device's endless ``nph-cgi_mtr`` HTTP stream.

    ``__next__`` contains ``yield``, so ``next(node)`` returns a fresh
    generator; calling ``next()`` on that generator produces one batch of
    ``nsrc + ndst`` tuples ``(ip, io + num, l, r)``.

    NOTE(review): ``requests`` reads are blocking, so consuming a batch
    on the event-loop thread stalls every other coroutine meanwhile.
    """

    def __init__(self, ip, nsrc, ndst):
        self.ip = ip
        self.nsrc = nsrc
        self.ndst = ndst
        # Defined up front so a failed connection leaves a reader that
        # simply yields nothing, not an object without ``data``.
        self.data = None
        try:
            self.data = requests.get(
                'http://' + self.ip + '/nph-cgi_mtr?duration=-1&interval=0',
                stream=True, timeout=10)
        except requests.RequestException:
            # ERROR device unreachable
            pass

    def __iter__(self):
        return self

    def __next__(self):
        expected = self.nsrc + self.ndst
        if self.data is None:
            return
        mtr = []
        try:
            # The second argument asks iter_content() to decode chunks.
            for chunk in self.data.iter_content(32 * expected, True):
                # DEBUG log chunk
                for line in chunk.split('\n'):
                    # DEBUG log line
                    if line.startswith('MTR'):
                        try:
                            _, io, num, val = line.split(' ')
                            fields = val.split(':')
                            mtr.append((self.ip, io + num, fields[1], fields[2]))
                        except (ValueError, IndexError):
                            # ERROR log line: malformed line, skip it
                            pass
                    if len(mtr) == expected:
                        break
                if len(mtr) == expected:
                    yield mtr
                    # Start a new batch; otherwise the list keeps growing
                    # and the generator never yields again.
                    mtr = []
        except Exception:
            # ERROR connection lost: end this generator quietly, without
            # also trapping KeyboardInterrupt or GeneratorExit.
            return
async def save_to_db(data_to_save):
    """Insert the rows in *data_to_save* into table ``mtr``.

    :param data_to_save: iterable of row tuples in ``(ip, io, l, r)``
        order; nothing is executed when it is empty.
    """
    global pool
    if not data_to_save:
        # An empty batch would produce "VALUES" with no rows.
        return
    # NOTE(review): the statement is assembled from str(row); values that
    # contain quotes would break or alter the SQL.  Consider parameterised
    # executemany() once the column types are confirmed.
    values = ','.join(str(row) for row in data_to_save)
    # "async with" already returns the connection to the pool.  An extra
    # release in a finally block fails with an unbound name whenever
    # acquire() itself raises, hiding the real error.
    async with pool.acquire() as conn:
        await conn.execute('''INSERT INTO mtr (ip, io, l, r) VALUES %s''' % values)
async def remove_from_db(ip):
    """Delete the rows of table ``httpmtr`` whose ``ip`` equals *ip*."""
    global pool
    # "async with" already returns the connection to the pool.  An extra
    # release in a finally block fails with an unbound name whenever
    # acquire() itself raises, hiding the real error.
    async with pool.acquire() as conn:
        await conn.execute('''DELETE FROM httpmtr WHERE ip = $1''', ip)
async def http_mtr_worker():
    """Poll every registered reader forever and queue its batch for saving.

    For each entry of ``workers_list`` one batch is read and, when one
    was produced, a ``save_to_db`` task is scheduled for it.
    """
    global workers_list
    running_loop = asyncio.get_event_loop()
    while True:
        await asyncio.sleep(0)
        for ip in list(workers_list):
            # The entry may have been removed while this coroutine was
            # suspended, so do not index the dict directly.
            node = workers_list.get(ip)
            if node is None:
                continue
            # next(node) only creates the generator.  The blocking stream
            # read happens in next(batches, None), which runs in the
            # default executor so it cannot stall the event loop; the
            # None default avoids StopIteration inside a coroutine.
            batches = next(node)
            rows = await running_loop.run_in_executor(None, next, batches, None)
            if rows:
                asyncio.ensure_future(save_to_db(rows))
            await asyncio.sleep(0)
async def check_for_workers():
    """Keep ``workers_list`` in sync with the ``httpmtr`` table, forever.

    IPs present in the table but not in ``workers_list`` get a new
    ``node_http_mtr``; entries whose IP is no longer in the table are
    removed.
    """
    global workers_list
    global pool
    # NOTE(review): sleep(0) only yields to other tasks, so the table is
    # re-queried continuously; consider a real polling interval.
    while True:
        await asyncio.sleep(0)
        # "async with" already releases the connection.  An extra release
        # in a finally block fails with an unbound name if acquire()
        # raises.
        async with pool.acquire() as conn:
            workers = await conn.fetch('''SELECT ip FROM httpmtr''')
        for worker in workers:
            ip = worker['ip']
            if ip not in workers_list:
                workers_list[ip] = node_http_mtr(ip, 8, 8)
                await asyncio.sleep(0)
                print('Add worker', ip)
            await asyncio.sleep(0)
        wanted_ips = {worker['ip'] for worker in workers}
        for ip in set(workers_list) - wanted_ips:
            print('Delete worker ', ip)
            workers_list.pop(ip)
        await asyncio.sleep(0)
async def make_db_connection():
    """Create and return the asyncpg pool for the local ``test`` database."""
    return await asyncpg.create_pool(
        user='postgres', password='data', database='test',
        host='localhost', max_queries=50000, command_timeout=60)
# Dedicated event loop, installed as the current one.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Shared connection pool and registry of readers keyed by IP.
pool = loop.run_until_complete(make_db_connection())
workers_list = {}
try:
    # Schedule both background coroutines, then run until interrupted.
    for background_job in (check_for_workers, http_mtr_worker):
        loop.create_task(background_job())
    loop.run_forever()
except Exception as e:
    print(e)
finally:
    print("Closing Loop")
    loop.close()
I have triggered procedure in DB which deletes all data older then 1 second, the final result with one worker in PostgreSQL is:
test=# select count(*) from mtr;
count
-------
384
(1 строка)
It means 384 results per second. There are 16 different kinds of data in each device, so I have 384/16 = 24 values per second. It's appropriate result. But the more workers I add the worse performance I have: with 10 workers I have 2-3 times less values. The goal is to have hundreds of workers and 24-25 values/sec.
Next I tried to do is to use aiohttp. I expected to get much better result. Hastily I wrote test code:
import asyncio
from aiohttp import ClientSession
import asyncpg
async def parse(line):
    """Turn one ``MTR ...`` line into an ``(ip, io + num, l, r)`` tuple.

    Lines that do not start with ``MTR`` produce ``None``.
    """
    if not line.startswith('MTR'):
        return None
    _, io, num, val = line.split(' ')
    fields = val.split(':')
    l, r = fields[1], fields[2]
    return ('ip.will.be.here', io + num, l, r)
async def run():
    """Repeatedly fetch one block of lines from the device and save it."""
    url = "http://10.150.20.130/nph-cgi_mtr?duration=-1&interval=0"
    async with ClientSession() as session:
        while True:
            # NOTE(review): the `break` below leaves this `async with`, so
            # a new HTTP request is issued for every saved block; this may
            # explain the low throughput - confirm.
            async with session.get(url) as response:
                buffer = b''
                start = False
                async for line in response.content.iter_any():
                    # `line` is whatever chunk iter_any() delivers, not
                    # necessarily a single text line.
                    if line.startswith(b'\n'):
                        # A chunk beginning with a newline starts a block.
                        start = True
                        buffer += line
                    elif start and line.endswith(b'\n'):
                        # A chunk ending with a newline completes the
                        # block: parse the lines between the first and
                        # last split parts and store them.
                        buffer += line
                        mtr = [await parse(line) for line in buffer.decode().split('\n')[1:-1]]
                        await save_to_db(mtr)
                        break
                    elif start:
                        buffer += line
async def make_db_connection():
    """Create and return the asyncpg pool for the local ``test`` database."""
    return await asyncpg.create_pool(
        user='postgres', password='data', database='test',
        host='localhost', max_queries=50000, command_timeout=60)
async def save_to_db(data_to_save):
    """Insert the rows in *data_to_save* into table ``mtr``.

    :param data_to_save: iterable of row tuples in ``(ip, io, l, r)``
        order; nothing is executed when it is empty.
    """
    global pool
    if not data_to_save:
        # An empty batch would produce "VALUES" with no rows.
        return
    # NOTE(review): the statement is assembled from str(row); values that
    # contain quotes would break or alter the SQL.  Consider parameterised
    # executemany() once the column types are confirmed.
    values = ','.join(str(row) for row in data_to_save)
    # "async with" already returns the connection to the pool.  An extra
    # release in a finally block fails with an unbound name whenever
    # acquire() itself raises, hiding the real error.
    async with pool.acquire() as conn:
        await conn.execute('''INSERT INTO mtr (ip, io, l, r) VALUES %s''' % values)
loop = asyncio.get_event_loop()
# Module-level pool read by save_to_db() through its `global pool`.
pool = loop.run_until_complete(make_db_connection())
# Schedule run() and block until it finishes (its loop is `while True`).
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
And I've got this:
test=# select count(*) from mtr;
count
-------
80
(1 строка)
i.e. I've gotten 5 time worse performance with asynchronous requests. I'm stuck. I don't understand how to solve it.
UPDATE. Profiling didn't make the situation more clear at all.
requests:
aiohttp:
With requests the situation is more or less clear. But what the problem with async aiohttp I don't understand at all.
UPDATE 16/05/18. In the end I went back to multithreading and got what I needed: constant performance with a large number of workers. Asynchronous calls are not a panacea indeed.
Is it possible to have multiple loops with asyncio? If the response is yes how can I do that?
My use case is:
* I extract urls from a list of websites in async
* For each "sub url list", I would crawl them in async/
Example to extract urls:
import asyncio
import aiohttp
from suburls import extractsuburls
#asyncio.coroutine
# NOTE(review): the line above looks like a mangled `@asyncio.coroutine`
# decorator - confirm against the original post.
def extracturls(url):
    subtasks = []
    response = yield from aiohttp.request('GET', url)
    suburl_list = yield from response.text()
    for suburl in suburl_list:
        subtasks.append(asyncio.Task(extractsuburls(suburl)))
    # This asks for the event loop and calls run_until_complete() on it
    # from inside a coroutine; the text below reports the resulting
    # "loop is already running" error.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*subtasks))
if __name__ == '__main__':
    urls_list = ['http://example1.com', 'http://example2.com']
    # NOTE(review): the list is named `urls_list` but the loop reads
    # `url_list`, and `subtasks` is not defined at module level.
    for url in url_list:
        subtasks.append(asyncio.Task(extractsuburls(url)))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*subtasks))
    loop.close()
If I execute this code, I get an error when Python tries to launch the second loop, which says that a loop is already running.
P.S: my module "extractsuburls" uses aiohttp to perform web request.
EDIT:
Well, I've tried this solution:
import asyncio
import aiohttp
from suburls import extractsuburls
#asyncio.coroutine
# NOTE(review): the line above looks like a mangled `@asyncio.coroutine`
# decorator - confirm against the original post.
def extracturls( url ):
    subtasks = []
    response = yield from aiohttp.request('GET', url)
    suburl_list = yield from response.text()
    jobs_loop = asyncio.new_event_loop()
    for suburl in suburl_list:
        # These tasks are created without naming jobs_loop, yet are later
        # gathered and run on jobs_loop; the text below reports the error
        # "loop argument must agree with Future".
        subtasks.append(asyncio.Task(extractsuburls(suburl)))
    # NOTE(review): new_event_loop() is called with an argument here;
    # presumably set_event_loop(jobs_loop) was intended - confirm.
    asyncio.new_event_loop(jobs_loop)
    jobs_loop.run_until_complete(asyncio.gather(*subtasks))
    jobs_loop.close()
if __name__ == '__main__':
    urls_list = ['http://example1.com', 'http://example2.com']
    # NOTE(review): the list is named `urls_list` but the loop reads
    # `url_list`, and `subtasks` is not defined at module level.
    for url in url_list:
        subtasks.append(asyncio.Task(extractsuburls(url)))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*subtasks))
    loop.close()
But I've this error: loop argument must agree with Future
Any idea?
You don't need several event loops, just use yield from gather(*subtasks) in extracturls() coroutine:
import asyncio
import aiohttp
from suburls import extractsuburls
async def extracturls(url):
    """Fetch *url* and process all of its sub-URLs concurrently.

    Uses ``async def``/``await``, the current spelling of the
    generator-based ``yield from`` coroutine style.  The coroutine
    finishes only after every ``extractsuburls`` call has completed, so
    no second event loop is needed.

    :param url: address to download.
    """
    async with aiohttp.request('GET', url) as response:
        suburl_list = await response.text()
    # NOTE(review): response.text() is a string, so this iterates over
    # its characters; presumably a step that extracts the actual list of
    # URLs is omitted from the example - confirm.
    subtasks = [extractsuburls(suburl) for suburl in suburl_list]
    await asyncio.gather(*subtasks)
if __name__ == '__main__':
    urls_list = ['http://example1.com', 'http://example2.com']
    # Build the list here: `subtasks` does not exist at module level, and
    # the list of URLs is named `urls_list`, not `url_list`.
    # NOTE(review): extractsuburls() is called for the top-level URLs, so
    # extracturls() is never used; presumably extracturls(url) was meant
    # - confirm.
    subtasks = [extractsuburls(url) for url in urls_list]
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(asyncio.gather(*subtasks))
    finally:
        loop.close()
As a result, extracturls() waits for all of its subtasks before it finishes.