I'm trying to add a delay between requests in an asynchronous way.
When I use Tornado's gen.sleep(x), my function (launch) doesn't get executed.
If I remove yield from yield gen.sleep(1.0), the function is called, but no delay is added.
How can I add a delay between requests in my for loop? I need to control the requests per second to an external API.
If I use time.sleep, the responses are delayed until after all requests are completed.
I tried adding the @gen.engine decorator to the launch function, with no results.
Code:
import collections
import tornado.httpclient


class BacklogClient(object):
    MAX_CONCURRENT_REQUESTS = 20

    def __init__(self, ioloop):
        self.ioloop = ioloop
        self.client = tornado.httpclient.AsyncHTTPClient(max_clients=self.MAX_CONCURRENT_REQUESTS)
        self.client.configure(None, defaults=dict(connect_timeout=20, request_timeout=30))
        self.backlog = collections.deque()
        self.concurrent_requests = 0

    def __get_callback(self, function):
        def wrapped(*args, **kwargs):
            self.concurrent_requests -= 1
            self.try_run_request()
            return function(*args, **kwargs)
        return wrapped

    def try_run_request(self):
        while self.backlog and self.concurrent_requests < self.MAX_CONCURRENT_REQUESTS:
            request, callback = self.backlog.popleft()
            self.client.fetch(request, callback=callback)
            self.concurrent_requests += 1

    def fetch(self, request, callback=None):
        wrapped = self.__get_callback(callback)
        self.backlog.append((request, wrapped))
        self.try_run_request()
import time
from tornado import ioloop, httpclient, gen


class TornadoBacklog:
    def __init__(self):
        self.queue = 0
        self.debug = 1
        self.toProcess = [
            'http://google.com',
            'http://yahoo.com',
            'http://nytimes.com',
            'http://msn.com',
            'http://cnn.com',
            'http://twitter.com',
            'http://facebook.com',
        ]

    def handle_request(self, response):
        print response.code
        if not self.backlog.backlog and self.backlog.concurrent_requests == 0:
            ioloop.IOLoop.instance().stop()

    def launch(self):
        self.ioloop = ioloop.IOLoop.current()
        self.backlog = BacklogClient(self.ioloop)
        for item in self.toProcess:
            yield gen.sleep(1.0)
            print item
            self.backlog.fetch(
                httpclient.HTTPRequest(
                    item,
                    method='GET',
                    headers=None,
                ),
                self.handle_request
            )
        self.ioloop.start()


def main():
    start_time = time.time()
    scraper = TornadoBacklog()
    scraper.launch()
    elapsed_time = time.time() - start_time
    print('Process took %f seconds processed %d items.' % (elapsed_time, len(scraper.toProcess)))


if __name__ == "__main__":
    main()
Reference: https://github.com/tornadoweb/tornado/issues/1400
Tornado coroutines have two components:
They contain "yield" statements
They are decorated with "gen.coroutine"
Use the "coroutine" decorator on your "launch" function:
@gen.coroutine
def launch(self):
Run a Tornado coroutine from start to finish like this:
tornado.ioloop.IOLoop.current().run_sync(launch)
Remove the call to "ioloop.start" from your "launch" function: the loop runs the "launch" function, not vice-versa.
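A minimal sketch of that pattern, assuming Tornado 4.x-style gen.coroutine and keeping the one-second pacing from the question (the URL list and fetch details below are illustrative, not the asker's exact code):

from tornado import gen, httpclient, ioloop

URLS = ['http://google.com', 'http://yahoo.com', 'http://nytimes.com']


@gen.coroutine
def launch():
    client = httpclient.AsyncHTTPClient()
    futures = []
    for url in URLS:
        yield gen.sleep(1.0)  # one second between request starts
        futures.append(client.fetch(url, raise_error=False))
    responses = yield futures  # wait for all responses before the coroutine returns
    for url, response in zip(URLS, responses):
        print(url, response.code)


if __name__ == '__main__':
    # run_sync starts the IOLoop, runs the coroutine to completion, then stops the loop
    ioloop.IOLoop.current().run_sync(launch)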
Related
I have a simple gRPC server that has two services, signin and ping, encapsulated in the following class, which also has a private method to authenticate requests:
class Listener(pingpong_pb2_grpc.PingPongServiceServicer):
    def __init__(self):
        self.counter = counter_g
        self.last_print_time = time.time()

    def __str__(self):
        return self.__class__.__name__

    def auth_request(self, request, context):
        metadata_dict = dict(context.invocation_metadata())
        if metadata_dict.get("authorization").split(" ")[1] == "jf90845h5gfip345t8":
            pass
        else:
            print("Auth Failed")
            context.abort(grpc.StatusCode.UNAUTHENTICATED, "Auth Failed")

    def signin(self, request, context):
        """The signin function is the rpc call that is called by the client"""
        if request.username == "test" and request.password == "test":
            print('Signin Success')
            return pingpong_pb2.SignInResponse(token="jf90845h5gfip345t8", success=True)
        else:
            print('Signin Failed')
            return pingpong_pb2.SignInResponse(token="bad token", success=False)

    def ping(self, request, context):
        """The ping function is the rpc call that is called by the client"""
        self.auth_request(request, context)
        self.counter += 1
        if self.counter > 1000:
            print("1000 calls in %3f seconds" % (time.time() - self.last_print_time))
            self.last_print_time = time.time()
            self.counter = 0
        response = pingpong_pb2.Pong(count=request.count + 1)
        return response
In order to make the gRPC tasks report back execution time and success/failure events, I wrote this decorator:
def grpctask(func):
    def wrapper(*args, **kwargs):
        # get task's function name
        task_name = func.__name__
        start = time.time()
        result = None
        try:
            result = func(*args, **kwargs)
        except grpc.RpcError as e:
            total = int((time.time() - start) * 1000)
            events.request_failure.fire(request_type="grpc",
                                        name=task_name,
                                        response_time=total,
                                        response_length=0,
                                        exception=e)
        else:
            total = int((time.time() - start) * 1000)
            events.request_success.fire(request_type="grpc",
                                        name=task_name,
                                        response_time=total,
                                        response_length=5)
        return result
    return wrapper
My user behaviour is as follows:
Every 31 seconds the user should execute (behaviour 1):
ping_server_1
ping_server_2
ping_server_3
(note that each function is different; they only have similar names)
Every 43 seconds the user should execute (behaviour 2):
hello_server_1
hello_server_2
The two user actions should be independent, meaning the user may execute both at the same time (not really parallel, just that the wait time between behaviour 1 and behaviour 2 should be zero).
I wrote the following script, but nesting ping_server_1, ping_server_2 and ping_server_3 inside a task made Locust unable to show data for each of those sub-tasks.
from locust import TaskSet, between, task, User, events, HttpUser, constant, SequentialTaskSet
import random
import grpc
from google.protobuf import json_format
from client import PingClient
import time
from tools import grpctask


class TaskOne(SequentialTaskSet):
    @task
    class PingTest(SequentialTaskSet):
        host = "localhost:9999"

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.stub = None
            self.vacancy_id = None
            self.token = None
            self.ping_client = PingClient(host="localhost:9999")

        def on_start(self):
            self.connect_to_server()
            self.login()

        def connect_to_server(self):
            # use the ping client to connect to the server
            self.ping_client.connect_to_server()

        def login(self):
            # use the ping client to login
            self.ping_client.set_token()

        @task
        @grpctask
        def ping_server(self):
            self.ping_client.ping()

        @task
        @grpctask
        def ping_server_2(self):
            self.ping_client.ping()

        @task
        @grpctask
        def ping_server_3(self):
            self.ping_client.ping()
            self.interrupt()

    @task
    def empty(self):
        print("PingTest is empty")
        self.interrupt()


class TaskTwo(SequentialTaskSet):
    @task
    class HelloServer(TaskSet):
        host = "localhost:9999"

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.stub = None
            self.vacancy_id = None
            self.token = None
            self.ping_client = PingClient(host="localhost:9999")

        def on_start(self):
            self.connect_to_server()
            self.login()

        def connect_to_server(self):
            # use the ping client to connect to the server
            self.ping_client.connect_to_server()

        def login(self):
            # use the ping client to login
            self.ping_client.set_token()

        @task
        @grpctask
        def hello_server(self):
            self.ping_client.ping()

        @task
        @grpctask
        def hello_server_2(self):
            self.ping_client.ping()
            self.interrupt()

    @task
    def empty(self):
        print("TaskTwo is empty")
        self.interrupt()


class PingUser(User):
    # force TaskOne to be executed every 31 seconds,
    # and TaskTwo to be executed every 43 seconds
    tasks = [TaskOne, TaskTwo]
Is there a way to define a wait time for TaskOne and TaskTwo independently of each other?
If not, what can be done to achieve the user behaviour described above while still treating each function as a task, so that metrics are reported per function? (Writing each behaviour as a single function won't give metrics for each function; see the sketch below for one possible direction.)
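One possible direction (a sketch only, assuming Locust's constant_pacing wait-time helper; a single Locust user still executes one task at a time, so whether this reproduces the exact 31 s / 43 s interleaving would need to be verified) is to give each TaskSet its own wait_time instead of defining one on the user:

from locust import User, SequentialTaskSet, constant_pacing


class TaskOne(SequentialTaskSet):
    # paces this TaskSet independently: a new pass starts at most once every 31 s
    wait_time = constant_pacing(31)
    # ... nested PingTest task set and the empty task, as in the question ...


class TaskTwo(SequentialTaskSet):
    # paced independently of TaskOne, at most once every 43 s
    wait_time = constant_pacing(43)
    # ... nested HelloServer task set and the empty task, as in the question ...


class PingUser(User):
    tasks = [TaskOne, TaskTwo]

Keeping ping_server, ping_server_2, ping_server_3 and the hello functions as separate @grpctask-decorated tasks preserves per-function metrics.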
I am working on a more complex example, but I think this is a simplified version. The function should pause for 1 second and, given a delay to fire the function, we loop this call at intervals with a start value and stop the scheduler afterwards.
from twisted.internet import reactor
import time
from twisted.internet import task


class timer:
    def __init__(self, *args):
        self._paused = True
        self._unpaused = False

    def sleep(self):
        if self._paused:
            print(f"You have paused for this many seconds: {1}s")
            time.sleep(1)

    def scheduler(self, delay=0, *args):
        if self._paused:
            from twisted.internet import reactor
            self._paused = task.deferLater(reactor, delay, self, *args)


if __name__ == '__main__':
    pause_timer = timer()
    timer_list = task.LoopingCall(pause_timer.scheduler)
    timer_list.start(5)
    reactor.callLater(10, reactor.stop)
    reactor.run()
However, I get this error:
builtins.TypeError: 'timer' object is not callable
I will also throw in the more complex example I am working with here:
import scrapy
from scrapy.utils import reactor
from scrapy import signals
import logging

logger = logging.getLogger(__name__)


class TestSpider(scrapy.Spider):
    name = 'pause'
    start_urls = [f'http://quotes.toscrape.com/page/{i}/' for i in range(1, 11)]
    custom_settings = {
        'DOWNLOAD_DELAY': 1
    }

    def __init__(self, stats, pause):
        self.stats = stats
        self.pause = pause

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        stat = cls(crawler.stats, crawler)
        crawler.signals.connect(stat.spider_opened, signals.spider_opened)
        return stat

    def spider_opened(self):
        reactor.CallLaterOnce(self.pause.engine.pause).schedule(20)

    def parse(self, response):
        logger.info("Urls passed to: %s", response.url)
The CallLaterOnce class is defined as follows (I updated the scheduler in the reactor.py module):
class CallLaterOnce:
    """Schedule a function to be called in the next reactor loop, but only if
    it hasn't been already scheduled since the last time it ran.
    """
    def __init__(self, func, *a, **kw):
        self._func = func
        self._a = a
        self._kw = kw
        self._call = None

    def schedule(self, delay=0):
        from twisted.internet import reactor
        if self._call is None:
            self._call = task.deferLater(reactor, 0, self)
            scheduler = task.LoopingCall(self._call)
            scheduler.start(delay)
            scheduler.stop()
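For context on the TypeError: task.deferLater(reactor, delay, f, *args) and task.LoopingCall(f) both expect f to be callable, and the snippets above pass an instance (self) or a Deferred instead, which is why Twisted reports 'timer' object is not callable. A minimal sketch of the simple example with a bound method passed instead (class renamed to Timer for clarity; the 5-second looping interval and 10-second shutdown are kept from the question):

from twisted.internet import reactor, task


class Timer:
    def __init__(self):
        self._paused = True

    def work(self):
        # the code to run after the delay; no blocking time.sleep() is needed
        print("You have paused for this many seconds: 1s")

    def scheduler(self, delay=1):
        if self._paused:
            # deferLater needs a callable: pass the bound method, not `self`
            return task.deferLater(reactor, delay, self.work)


if __name__ == '__main__':
    pause_timer = Timer()
    loop = task.LoopingCall(pause_timer.scheduler)
    loop.start(5)  # run scheduler every 5 seconds
    reactor.callLater(10, reactor.stop)
    reactor.run()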
I have been trying to get my code to work for many days and I am desperate.
I've scoured the internet, but I still can't find a solution.
I have a 9 GB text file encoded in "latin-1" with 737,022,387 lines; each line contains a string.
I would like to read each line and send it in an HTTP PUT request that waits for a response, and return TRUE or FALSE depending on whether the response is 200 or 400.
A PUT request takes about 1 to 3 seconds, so to speed up the processing I would like to use either threads or multiprocessing.
To start, I simulate my PUT request with a sleep of 3 seconds,
and even that I can't get to work.
This code splits my string into characters, and I don't know why:
from multiprocessing import Pool
from time import sleep


def process_line(line):
    sleep(3)
    print(line)
    return True


if __name__ == "__main__":
    pool = Pool(2)
    peon = open(r'D:\txtFile', encoding="latin-1")
    for line in peon:
        res = pool.map(process_line, line)
        print(res)
This gives the error: TypeError: process_line() takes 1 positional argument but 17 were given
import multiprocessing
from multiprocessing import Pool
from time import sleep


def process_line(line):
    sleep(3)
    print(line)
    return True


if __name__ == "__main__":
    pool = Pool(2)
    with open(r"d:\txtFile", encoding="latin-1") as file:
        res = pool.apply(process_line, file.readline())
        print(res)
And this one crashes the computer:
from multiprocessing import Pool
from time import sleep


def process_line(line):
    sleep(3)
    print(line)
    return True


if __name__ == "__main__":
    pool = Pool(2)
    peon = open(r'D:\txtFile', encoding="latin-1")
    for line in peon:
        res = pool.map(process_line, peon)
        print(res)
The problem seems a bit unrealistic, though: firing 737,022,387 requests! Calculate how many months that would take from a single computer!
Still, the better way to do this task is to read the file line by line in a separate thread and insert the lines into a queue, and then process the queue with multiple processes.
Solution 1:
from multiprocessing import Queue, Process
from threading import Thread
from time import sleep

urls_queue = Queue()
max_process = 4


def read_urls():
    with open('urls_file.txt', 'r') as f:
        for url in f:
            urls_queue.put(url.strip())
            print('put url: {}'.format(url.strip()))
    # put DONE to tell send_request_processor to exit
    for i in range(max_process):
        urls_queue.put("DONE")


def send_request(url):
    print('send request: {}'.format(url))
    sleep(1)
    print('recv response: {}'.format(url))


def send_request_processor():
    print('start send request processor')
    while True:
        url = urls_queue.get()
        if url == "DONE":
            break
        else:
            send_request(url)


def main():
    file_reader_thread = Thread(target=read_urls)
    file_reader_thread.start()

    procs = []
    for i in range(max_process):
        p = Process(target=send_request_processor)
        procs.append(p)
        p.start()

    for p in procs:
        p.join()
    print('all done')

    # wait for all tasks in the queue
    file_reader_thread.join()


if __name__ == '__main__':
    main()
Demo: https://onlinegdb.com/Elfo5bGFz
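In the demo, send_request only sleeps to simulate the call. As a sketch of what the real call could look like (assuming the requests library, and that each queued line is the URL to hit with a 200 response meaning success, as described in the question):

import requests


def send_request(url):
    # issue the PUT and report True/False based on the status code
    try:
        response = requests.put(url, timeout=10)
    except requests.RequestException:
        return False
    return response.status_code == 200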
Solution 2:
You can use the Tornado asynchronous networking library:
from tornado import gen
from tornado.ioloop import IOLoop
from tornado.queues import Queue

q = Queue(maxsize=2)


async def consumer():
    async for item in q:
        try:
            print('Doing work on %s' % item)
            await gen.sleep(0.01)
        finally:
            q.task_done()


async def producer():
    with open('urls_file.txt', 'r') as f:
        for url in f:
            await q.put(url)
            print('Put %s' % url)


async def main():
    # Start consumer without waiting (since it never finishes).
    IOLoop.current().spawn_callback(consumer)
    await producer()  # Wait for producer to put all tasks.
    await q.join()  # Wait for consumer to finish all tasks.
    print('Done')


# producer and consumer can run in parallel
IOLoop.current().run_sync(main)
Using the method multiprocessing.pool.imap is a step in the right direction, but the problem is that with so much input you will be feeding the input task queue faster than the processing pool can take tasks off the queue and return results. Consequently, the task queue will continue to grow and you will exhaust memory. What is needed is a way to "throttle" imap so that it blocks once the task queue has N tasks on it. I think a reasonable default value for N is twice the pool size, which ensures that when a pool process completes work on a task there is no delay in finding another task to work on. Hence we create the classes BoundedQueueProcessPool (multiprocessing) and BoundedQueueThreadPool (multithreading):
import multiprocessing.pool
import multiprocessing
import threading


class ImapResult():
    def __init__(self, semaphore, result):
        self._semaphore = semaphore
        self.it = result.__iter__()

    def __iter__(self):
        return self

    def __next__(self):
        try:
            elem = self.it.__next__()
            self._semaphore.release()
            return elem
        except StopIteration:
            raise
        except:
            self._semaphore.release()
            raise


class BoundedQueuePool:
    def __init__(self, limit, semaphore):
        self._limit = limit
        self._semaphore = semaphore

    def release(self, result, callback=None):
        self._semaphore.release()
        if callback:
            callback(result)

    def apply_async(self, func, args=(), kwds={}, callback=None, error_callback=None):
        self._semaphore.acquire()
        callback_fn = self.release if callback is None else lambda result: self.release(result, callback=callback)
        error_callback_fn = self.release if error_callback is None else lambda result: self.release(result, callback=error_callback)
        return super().apply_async(func, args, kwds, callback=callback_fn, error_callback=error_callback_fn)

    def imap(self, func, iterable, chunksize=1):
        def new_iterable(iterable):
            for elem in iterable:
                self._semaphore.acquire()
                yield elem
        if chunksize > self._limit:
            raise ValueError(f'chunksize argument exceeds {self._limit}')
        result = super().imap(func, new_iterable(iterable), chunksize)
        return ImapResult(self._semaphore, result)

    def imap_unordered(self, func, iterable, chunksize=1):
        def new_iterable(iterable):
            for elem in iterable:
                self._semaphore.acquire()
                yield elem
        if chunksize > self._limit:
            raise ValueError(f'chunksize argument exceeds {self._limit}')
        result = super().imap_unordered(func, new_iterable(iterable), chunksize)
        return ImapResult(self._semaphore, result)


class BoundedQueueProcessPool(BoundedQueuePool, multiprocessing.pool.Pool):
    def __init__(self, *args, max_waiting_tasks=None, **kwargs):
        multiprocessing.pool.Pool.__init__(self, *args, **kwargs)
        if max_waiting_tasks is None:
            max_waiting_tasks = self._processes
        elif max_waiting_tasks < 0:
            raise ValueError(f'Invalid negative max_waiting_tasks value: {max_waiting_tasks}')
        limit = self._processes + max_waiting_tasks
        BoundedQueuePool.__init__(self, limit, multiprocessing.BoundedSemaphore(limit))


class BoundedQueueThreadPool(BoundedQueuePool, multiprocessing.pool.ThreadPool):
    def __init__(self, *args, max_waiting_tasks=None, **kwargs):
        multiprocessing.pool.ThreadPool.__init__(self, *args, **kwargs)
        if max_waiting_tasks is None:
            max_waiting_tasks = self._processes
        elif max_waiting_tasks < 0:
            raise ValueError(f'Invalid negative max_waiting_tasks value: {max_waiting_tasks}')
        limit = self._processes + max_waiting_tasks
        BoundedQueuePool.__init__(self, limit, threading.BoundedSemaphore(limit))


#######################################################################

from time import sleep


def process_line(line):
    sleep(3)
    # the lines already have line end characters:
    print(line, end='')
    return True


if __name__ == "__main__":
    pool = BoundedQueueProcessPool(2)
    with open("test.txt") as file:
        for res in pool.imap(process_line, file):
            #print(res)
            pass
    pool.close()
    pool.join()
I'm using a decorator for the thread pool executor:
from functools import wraps
from .bounded_pool_executor import BoundedThreadPoolExecutor

_DEFAULT_POOL = BoundedThreadPoolExecutor(max_workers=5)


def threadpool(f, executor=None):
    @wraps(f)
    def wrap(*args, **kwargs):
        return (executor or _DEFAULT_POOL).submit(f, *args, **kwargs)
    return wrap
where the BoundedThreadPoolExecutor is defined here
When I try to use concurrent futures in a function decorated with @threadpool and then wait for all the futures with as_completed, like
from concurrent.futures import as_completed


def get_results_as_completed(futures):
    # finished, pending = wait(futures, return_when=ALL_COMPLETED)
    futures_results = as_completed(futures)
    for f in futures_results:
        try:
            yield f.result()
        except:
            pass
for some worker defined like
from thread_support import threadpool
from time import sleep
from random import randint


@threadpool
def my_worker():
    res = {}
    # do something
    sleep(randint(1, 5))
    return res


if __name__ == "__main__":
    # futures: the list of futures returned by my_worker() calls (not shown in the question)
    futures_results = get_results_as_completed(futures)
    results = []
    for r in futures_results:
        results.append(r)
I cannot get the futures to complete despite the .result() call, which results in an infinite loop on futures_results. Why?
For example, I'm making an async decorator and want to limit the number of concurrent threads:
from multiprocessing import cpu_count
from threading import Thread


class async:
    def __init__(self, function):
        self.func = function
        self.max_threads = cpu_count()
        self.current_threads = []

    def __call__(self, *args, **kwargs):
        func_thread = Thread(target=self.func, args=args, kwargs=kwargs)
        func_thread.start()
        self.current_threads.append(func_thread)
        while len(self.current_threads) > self.max_threads:
            self.current_threads = [t for t in self.current_threads if t.isAlive()]


from time import sleep


@async
def printA():
    sleep(1)
    print "A"


@async
def printB():
    sleep(1)
    print "B"
Is this going to limit the total number of concurrent threads? I.e., if I had 8 cores, would the current code end up having 16+ threads because two separate async objects exist?
If so, how would I fix that?
Thanks!
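For what it's worth: each decorated function gets its own async instance with its own current_threads list and its own cpu_count() cap, so two decorated functions can together exceed the intended limit. One possible fix (a sketch only, written for Python 3 and with the class renamed to async_limited, since async is now a keyword) is to share a single class-level semaphore across all instances so every decorated function draws from the same pool of slots:

from multiprocessing import cpu_count
from threading import Thread, BoundedSemaphore


class async_limited:
    # one semaphore shared by every decorated function, so the cap is global
    _slots = BoundedSemaphore(cpu_count())

    def __init__(self, function):
        self.func = function

    def __call__(self, *args, **kwargs):
        def runner():
            try:
                self.func(*args, **kwargs)
            finally:
                async_limited._slots.release()  # free the slot when the thread finishes

        # blocks the caller until a slot is free, keeping the total thread count bounded
        async_limited._slots.acquire()
        Thread(target=runner).start()

The trade-off is that the caller blocks while all slots are busy; if that is unacceptable, the work could instead be handed to a fixed pool of worker threads.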