Currently I run a loop inside a loop inside a loop, which passes a new set of parameters to an already instantiated class and then calls the class's reset() method followed by its main() method.
As this is very CPU-intensive, I want to run it with multiprocessing, but I haven't found an example of how to do that anywhere.
Below is the code that needs to be multiprocessed:
st = Strategy()  # Strategy is my class

for start_delay in range(0, PAR_BT_CYCLE_LENGTH_END, 1):
    for cycle_length in range(PAR_BT_CYCLE_LENGTH_START, PAR_BT_CYCLE_LENGTH_END+1, 1):
        for cycle_pos in range(PAR_BT_N_POS_START, PAR_BT_N_POS_END+1, 1):
            st.set_params(PAR_BT_START_CAPITAL, start_delay, cycle_length, cycle_pos, sBT,
                          iPAR_BT_TF1, iPAR_BT_TF2, iPAR_BT_TF3, iPAR_BT_TF4,
                          iPAR_BT_TFW1, iPAR_BT_TFW2, iPAR_BT_TFW3, iPAR_BT_TFW4)
            st.reset()
            bt = st.main()
            # do something with return values (list) in bt

# after all processes have finished - use return values of all processes
What would be the best way to get this working as multiple processes?
You can use the ProcessPoolExecutor from concurrent.futures.
from concurrent.futures import ProcessPoolExecutor, as_completed
def run_strategy(*args):
    st = Strategy()
    st.set_params(*args)
    st.reset()
    bt = st.main()
    return bt
ex = ProcessPoolExecutor()

futures = []
for start_delay in range(0, PAR_BT_CYCLE_LENGTH_END, 1):
    for cycle_length in range(PAR_BT_CYCLE_LENGTH_START, PAR_BT_CYCLE_LENGTH_END+1, 1):
        for cycle_pos in range(PAR_BT_N_POS_START, PAR_BT_N_POS_END+1, 1):
            args = (
                PAR_BT_START_CAPITAL,
                start_delay,
                cycle_length,
                cycle_pos,
                sBT,
                iPAR_BT_TF1,
                iPAR_BT_TF2,
                iPAR_BT_TF3,
                iPAR_BT_TF4,
                iPAR_BT_TFW1,
                iPAR_BT_TFW2,
                iPAR_BT_TFW3,
                iPAR_BT_TFW4,
            )
            futures.append(ex.submit(run_strategy, *args))

# collect the returned bts
bt_results = []
for f in as_completed(futures):
    bt_results.append(f.result())

ex.shutdown()
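A slightly tidier variant of the same idea is sketched below, reusing the names from the question (run_strategy and the PAR_BT_* constants are assumed to be defined as above). It drives the executor as a context manager and guards the submission code with if __name__ == '__main__':, which is required on platforms that start workers with the spawn method (Windows, recent macOS):
if __name__ == '__main__':
    param_sets = [
        (PAR_BT_START_CAPITAL, start_delay, cycle_length, cycle_pos, sBT,
         iPAR_BT_TF1, iPAR_BT_TF2, iPAR_BT_TF3, iPAR_BT_TF4,
         iPAR_BT_TFW1, iPAR_BT_TFW2, iPAR_BT_TFW3, iPAR_BT_TFW4)
        for start_delay in range(0, PAR_BT_CYCLE_LENGTH_END)
        for cycle_length in range(PAR_BT_CYCLE_LENGTH_START, PAR_BT_CYCLE_LENGTH_END + 1)
        for cycle_pos in range(PAR_BT_N_POS_START, PAR_BT_N_POS_END + 1)
    ]
    # the with-block waits for all workers before returning
    with ProcessPoolExecutor() as ex:
        futures = [ex.submit(run_strategy, *args) for args in param_sets]
        bt_results = [f.result() for f in as_completed(futures)]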
I'm using the following code to get a list of filings from AWS. I'm not sure where it went wrong.
import time
import datetime
from collections import deque
from typing import List, Deque, Iterable, Dict
import logging

import boto3
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed, Future

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

BUCKET: str = "irs-form-990"
EARLIEST_YEAR: int = 2009
cur_year: int = datetime.datetime.now().year
first_prefix: int = EARLIEST_YEAR * 100
last_prefix: int = (cur_year + 1) * 100


def get_keys_for_prefix(prefix: str) -> Iterable[str]:
    """Return a collection of all key names starting with the specified prefix."""
    client = boto3.client('s3')

    # See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/paginators.html
    paginator = client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=BUCKET, Prefix=prefix)

    # A deque is a collection with O(1) appends and O(n) iteration
    results: Deque[str] = deque()
    i = 0
    for i, page in enumerate(page_iterator):
        if "Contents" not in page:
            continue

        # You could also capture, e.g., the timestamp or checksum here
        page_keys: Iterable = (element["Key"] for element in page["Contents"])
        results.extend(page_keys)
    logging.info("Scanned {} page(s) with prefix {}.".format(i + 1, prefix))
    return results


start: float = time.time()

# ProcessPoolExecutor starts a completely separate copy of Python for each worker
with ProcessPoolExecutor() as executor:
    futures: Deque[Future] = deque()

    for prefix in range(first_prefix, last_prefix):
        future: Future = executor.submit(get_keys_for_prefix, str(prefix))
        futures.append(future)

    n = 0
    # as_completed ignores submission order to prevent unnecessary waiting
    for future in as_completed(futures):
        keys: Iterable = future.result()
        for key in keys:
            # Do your analysis here
            n += 1

elapsed: float = time.time() - start
logging.info("Discovered {:,} keys in {:,.1f} seconds.".format(n, elapsed))
I'm getting the following errors:
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
and
concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.
Since this is third-party code, I can't fix it myself (and I'm a novice in Python). Any help is appreciated.
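The traceback itself points at the fix: with the spawn start method, the code that creates the ProcessPoolExecutor must only run in the main module, so it has to sit behind an if __name__ == '__main__': guard. A minimal sketch of that restructuring, reusing the script's own functions and variables:
def main() -> None:
    start: float = time.time()

    # ProcessPoolExecutor starts a completely separate copy of Python for each worker
    with ProcessPoolExecutor() as executor:
        futures: Deque[Future] = deque()
        for prefix in range(first_prefix, last_prefix):
            futures.append(executor.submit(get_keys_for_prefix, str(prefix)))

        n = 0
        for future in as_completed(futures):
            for key in future.result():
                # Do your analysis here
                n += 1

    elapsed: float = time.time() - start
    logging.info("Discovered {:,} keys in {:,.1f} seconds.".format(n, elapsed))


if __name__ == '__main__':
    main()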
I'm new to using concurrent futures and I cannot find any examples of how to do this. I have a global dictionary, data, that I want the function called by the concurrent futures executor to add results to. The function works, but there is no output in data.
Thanks for any help,
T.
def estimate_shannon_entropy(dna_sequence):
    bases = collections.Counter([tmp_base for tmp_base in dna_sequence])
    # define distribution
    dist = [x/sum(bases.values()) for x in bases.values()]

    # use scipy to calculate entropy
    entropy_value = entropy(dist, base=2)
    #norm_ent = entropy_value/math.log(len(dna_sequence),2)
    return entropy_value
def shan(i):
    name1 = i.split("/")[-1]
    ext1 = name1.split(".")[-1]
    print(name1)
    if ext1 == "gz":
        #print("gz detected")
        f = gzip.open(i, 'rt')
        k = name1.split(".")[-2]
    else:
        f = open(i, 'r')
        k = ext1
    if k[-1] == "a":
        fmt = "fasta"
        #print("fasta")
    if k[-1] == "q":
        fmt = "fastq"
        #print("fastq")
    c = 0
    shannon_total = 0
    for x in SeqIO.parse(f, fmt):
        c = c + 1
        if c <= samples:
            shannon = estimate_shannon_entropy(str(x.seq))
            shannon_total = shannon_total + shannon
    ans = float(shannon_total/samples)
    data[name1] = ans
folder=sys.argv[1]
filelist=glob.glob(folder)
filelist.sort(key=tokenize)
#print(filelist)
samples=int(sys.argv[2])
threads=int(sys.argv[3])
global data
data={}
executor = concurrent.futures.ProcessPoolExecutor(threads)
futures = [executor.submit(shan, i) for i in filelist]
concurrent.futures.wait(futures)
print(data)
OK, I found an answer; I'll leave it here in case there are better methods (I'm sure there are).
I used a Manager:
from multiprocessing import Manager
manager=Manager()
data=manager.dict()
executor = concurrent.futures.ProcessPoolExecutor(threads)
futures = [executor.submit(shan, i, data) for i in filelist]
concurrent.futures.wait(futures)
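Note that shan then has to accept the managed dict as a parameter and write into it, rather than relying on a global. A minimal, self-contained illustration of the pattern (the worker body is a stand-in, not the entropy code):
import concurrent.futures
from multiprocessing import Manager

def worker(name, shared):
    # stand-in for shan(): compute something and record it under the file name
    shared[name] = len(name)

if __name__ == '__main__':
    manager = Manager()
    data = manager.dict()
    with concurrent.futures.ProcessPoolExecutor(2) as executor:
        futures = [executor.submit(worker, n, data) for n in ['a.fasta', 'b.fastq']]
        concurrent.futures.wait(futures)
    print(dict(data))  # {'a.fasta': 7, 'b.fastq': 7}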
I am able to submit batches of concurrent.futures.ProcessPoolExecutor.submit() calls, where each batch may contain several submit() calls. However, I noticed that if each batch of submits consumes a significant amount of RAM, this can be quite inefficient: all futures in a batch have to complete before another batch of submit() calls can be issued.
How does one create a continuous stream of Python's concurrent.futures.ProcessPoolExecutor.submit() calls until some condition is satisfied?
Test Script:
#!/usr/bin/env python3
import numpy as np
from numpy.random import default_rng, SeedSequence
import concurrent.futures as cf
from itertools import count


def dojob( process, iterations, samples, rg ):
    # Do some tasks
    result = []
    for i in range( iterations ):
        a = rg.standard_normal( samples )
        b = rg.integers( -3, 3, samples )
        mean = np.mean( a + b )
        result.append( ( i, mean ) )
    return { process : result }


if __name__ == '__main__':

    cpus = 2
    iterations = 10000
    samples = 1000

    # Setup NumPy Random Generator
    ss = SeedSequence( 1234567890 )
    child_seeds = ss.spawn( cpus )
    rg_streams = [ default_rng(s) for s in child_seeds ]

    # Perform concurrent analysis by batches
    counter = count( start=0, step=1 )

    # Serial Run of dojob
    process = next( counter )
    for cpu in range( cpus ):
        process = next( counter )
        rg = rg_streams[ cpu ]
        rdict = dojob( process, iterations, samples, rg )
        print( 'rdict', rdict )

    # Concurrent Run of dojob
    futures = []
    results = []
    with cf.ProcessPoolExecutor( max_workers=cpus ) as executor:
        while True:
            for cpu in range( cpus ):
                process = next( counter )
                rg = rg_streams[ cpu ]
                futures.append( executor.submit( dojob, process, iterations, samples, rg ) )
            for future in cf.as_completed( futures ):
                # Do some post processing
                r = future.result()
                for k, v in r.items():
                    if len( results ) < 5000:
                        results.append( np.std( v ) )
                        print( k, len(results) )
            if len(results) <= 100:  # Put a huge number to simulate continuous streaming
                futures = []
                child_seeds = child_seeds[0].spawn( cpus )
                rg_streams = [ default_rng(s) for s in child_seeds ]
            else:
                break

    print( '\n*** Concurrent Analyses Ended ***' )
To expand on my comment, how about something like this, using the completion callback and a threading.Condition? I took the liberty of adding a progress indicator too.
EDIT: I refactored this into a neat function to which you pass your desired concurrency and queue depth, as well as a function that generates new jobs and another function that processes each result and lets the executor know whether you've had enough.
import concurrent.futures as cf
import threading
import time
from itertools import count

import numpy as np
from numpy.random import SeedSequence, default_rng


def dojob(process, iterations, samples, rg):
    # Do some tasks
    result = []
    for i in range(iterations):
        a = rg.standard_normal(samples)
        b = rg.integers(-3, 3, samples)
        mean = np.mean(a + b)
        result.append((i, mean))
    return {process: result}


def execute_concurrently(cpus, max_queue_length, get_job_fn, process_result_fn):
    running_futures = set()
    jobs_complete = 0
    job_cond = threading.Condition()
    all_complete_event = threading.Event()

    def on_complete(future):
        nonlocal jobs_complete
        if process_result_fn(future.result()):
            all_complete_event.set()
        running_futures.discard(future)
        jobs_complete += 1
        with job_cond:
            job_cond.notify_all()

    time_since_last_status = 0
    start_time = time.time()
    with cf.ProcessPoolExecutor(cpus) as executor:
        while True:
            while len(running_futures) < max_queue_length:
                fn, args = get_job_fn()
                fut = executor.submit(fn, *args)
                fut.add_done_callback(on_complete)
                running_futures.add(fut)

            with job_cond:
                job_cond.wait()

            if all_complete_event.is_set():
                break

            if time.time() - time_since_last_status > 1.0:
                rps = jobs_complete / (time.time() - start_time)
                print(
                    f"{len(running_futures)} running futures on {cpus} CPUs, "
                    f"{jobs_complete} complete. RPS: {rps:.2f}"
                )
                time_since_last_status = time.time()


def main():
    ss = SeedSequence(1234567890)
    counter = count(start=0, step=1)
    iterations = 10000
    samples = 1000
    results = []

    def get_job():
        seed = ss.spawn(1)[0]
        rg = default_rng(seed)
        process = next(counter)
        return dojob, (process, iterations, samples, rg)

    def process_result(result):
        for k, v in result.items():
            results.append(np.std(v))
        if len(results) >= 10000:
            return True  # signal we're complete

    execute_concurrently(
        cpus=16,
        max_queue_length=20,
        get_job_fn=get_job,
        process_result_fn=process_result,
    )


if __name__ == "__main__":
    main()
The answer posted by @AKX works. Kudos to him. After testing it, I would like to recommend two amendments that I believe are worth considering and implementing.
Amendment 1: To cancel the script prematurely, Ctrl+C has to be used. Unfortunately, doing that does not terminate the concurrent.futures.ProcessPoolExecutor() processes that are executing dojob(). This issue becomes more pronounced when dojob() takes a long time to complete; the situation can be simulated by making the sample size in the script large (e.g. samples = 100000). The issue can be seen when the terminal command ps -ef | grep python is executed. Also, if dojob() consumes a significant amount of RAM, the memory used by these concurrent processes does not get released until they are manually killed (e.g. kill -9 [PID]). To address these issues, the following amendment is needed.
with job_cond:
    job_cond.wait()
should be changed to:
try:
    with job_cond:
        job_cond.wait()
except KeyboardInterrupt:
    # Cancel running futures
    for future in running_futures:
        _ = future.cancel()
    # Ensure concurrent.futures.executor jobs really do finish.
    _ = cf.wait(running_futures, timeout=None)
So when you do need Ctrl+C, press it once first. Then give the futures in running_futures some time to be cancelled; this can take a few seconds or more, depending on the resource requirements of dojob(). You can watch the CPU activity in your task manager or system monitor drop to zero, or hear the revving of your CPU cooling fan die down. Note that the RAM used is not released yet. Press Ctrl+C again, and that should allow a clean exit of all the concurrent processes, at which point the used RAM is also released.
Amendment 2: Presently, the inner while-loop submits jobs continuously, as fast as the "MainThread" can manage. Realistically, there is no benefit in submitting more jobs than there are available CPUs in the pool; doing so only needlessly consumes CPU resources in the main process. To regulate the continuous job submission, a new submit_job threading.Event() object can be used.
Firstly, define such an object and set its value to True with:
submit_job = threading.Event()
submit_job.set()
Next, at the end of the inner while-loop add this condition and .wait() method:
with cf.ProcessPoolExecutor(cpus) as executor:
    while True:
        while len(running_futures) < max_queue_length:
            fn, args = get_job_fn()
            fut = executor.submit(fn, *args)
            fut.add_done_callback(on_complete)
            running_futures.add(fut)
            if len(running_futures) >= cpus:  # Add this line
                submit_job.clear()            # Add this line
            submit_job.wait()                 # Add this line
Finally change the on_complete(future) callback to:
def on_complete(future):
    nonlocal jobs_complete
    if process_result_fn(future.result()):
        all_complete_event.set()
    running_futures.discard(future)
    if len(running_futures) < cpus:  # add this conditional setting
        submit_job.set()             # add this conditional setting
    jobs_complete += 1
    with job_cond:
        job_cond.notify_all()
There is a library called Pypeln that does this beautifully. It allows for streaming tasks between stages, and each stage can be run in a process, thread, or asyncio pool, depending on what is optimum for your use case.
Sample code:
import pypeln as pl
import time
from random import random
def slow_add1(x):
    time.sleep(random())  # <= some slow computation
    return x + 1

def slow_gt3(x):
    time.sleep(random())  # <= some slow computation
    return x > 3
data = range(10) # [0, 1, 2, ..., 9]
stage = pl.process.map(slow_add1, data, workers=3, maxsize=4)
stage = pl.process.filter(slow_gt3, stage, workers=2)
data = list(stage) # e.g. [5, 6, 9, 4, 8, 10, 7]
I'm trying to submit around 150 million jobs to celery using the following code:
from celery import chain
from .task_receiver import do_work, handle_results, get_url

urls = '/home/ubuntu/celery_main/urls'

if __name__ == '__main__':
    fh = open(urls, 'r')
    alldat = fh.readlines()
    fh.close()
    for line in alldat:
        try:
            result = chain(get_url.s(line[:-1]), do_work.s(line[:-1])).apply_async()
        except:
            print("failed to submit job")
        print('task submitted ' + str(line[:-1]))
Would it be faster to split the file into chunks and run multiple instances of this code? Or what can I do? I'm using memcached as the backend, rabbitmq as the broker.
import multiprocessing
from celery import chain
from .task_receiver import do_work, handle_results, get_url

urls = '/home/ubuntu/celery_main/urls'
num_workers = 200

def worker(urls, id):
    """worker function"""
    for url in urls:
        print("%s - %s" % (id, url))
        result = chain(get_url.s(url), do_work.s(url)).apply_async()
    return

if __name__ == '__main__':
    fh = open(urls, 'r')
    alldat = fh.readlines()
    fh.close()
    jobs = []
    stack = []
    id = 0
    for i in alldat:
        if (len(stack) < len(alldat) / num_workers):
            stack.append(i[:-1])
            continue
        else:
            id = id + 1
            p = multiprocessing.Process(target=worker, args=(stack, id,))
            jobs.append(p)
            p.start()
            stack = []
    for j in jobs:
        j.join()
If I understand your problem correctly:
you have a list of 150M urls
you want to run get_url() then do_work() on each of the urls
so you have two issues:
going over the 150M urls
queuing the tasks
Regarding the main for loop in your code: yes, you could make it faster with multithreading, especially on a multicore CPU. Your master thread could read the file and pass chunks of it to sub-threads that create the celery tasks; a sketch of this follows the links below.
Check the guide and the documentation:
https://realpython.com/intro-to-python-threading/
https://docs.python.org/3/library/threading.html
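A minimal sketch of that idea, assuming the same get_url/do_work tasks, file path, and module layout as in the question; the chunk size and thread count are arbitrary:
from concurrent.futures import ThreadPoolExecutor

from celery import chain
from .task_receiver import do_work, get_url

CHUNK_SIZE = 10000  # arbitrary
N_THREADS = 8       # arbitrary

def submit_chunk(lines):
    # each sub-thread turns its chunk of lines into celery chains
    for line in lines:
        url = line.rstrip('\n')
        chain(get_url.s(url), do_work.s(url)).apply_async()

if __name__ == '__main__':
    with open('/home/ubuntu/celery_main/urls') as fh, \
         ThreadPoolExecutor(max_workers=N_THREADS) as pool:
        chunk = []
        for line in fh:
            chunk.append(line)
            if len(chunk) == CHUNK_SIZE:
                pool.submit(submit_chunk, chunk)
                chunk = []
        if chunk:
            pool.submit(submit_chunk, chunk)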
Now let's imagine you have one worker receiving these tasks. The code will generate 150M new tasks that are pushed onto the queue. Each chain is get_url() followed by do_work(), and the next chain runs only when do_work() finishes.
If get_url() takes a short time and do_work() takes a long time, it will be a series of quick task, slow task, quick task, slow task, ..., and the total time per worker is:
t_total_per_worker = (t_get_url_average + t_do_work_average) × 150M
If you have n workers:
t_total = t_total_per_worker / n
t_total = (t_get_url_average + t_do_work_average) × 150M / n
Now, if get_url() is time-critical while do_work() is not, then, if you can, you should run all 150M get_url() calls first and only then run all 150M do_work() calls, though that may require changes to your process design.
That is what I would do. Maybe others have better ideas!?
First look at the following code:
pool = multiprocessing.Pool(processes=N)

batch = []
for item in generator():
    batch.append(item)
    if len(batch) == 10:
        pool.apply_async(my_fun, args=(batch,))
        batch = []

# leftovers
pool.apply_async(my_fun, args=(batch,))
Essentially I'm retrieving data from a generator, collecting it into a list, and then spawning a process that consumes the batch of data.
This may look fine, but when the consumers (i.e. the pool processes) are slower than the producer (i.e. the generator), the memory usage of the main process grows until the generator stops or... the system runs out of memory.
How can I avoid this problem?
You might want to use a limited-size queue in this case:
q = multiprocessing.Queue(maxSize)
Used with a maximum size, the queue provides the necessary counting and blocks the thread calling q.put() when it is full, so you can never have more than a certain number of work items pending, which limits the memory needed to store them.
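A minimal sketch of that bounded-queue idea, replacing the Pool with explicit worker processes; the batch size, worker count, queue depth, and the my_fun body are all stand-ins:
import multiprocessing

def my_fun(batch):
    return sum(batch)  # stand-in for the real work

def consumer(q, results_q):
    # pull batches until the producer sends the None sentinel
    while True:
        batch = q.get()
        if batch is None:
            break
        results_q.put(my_fun(batch))

if __name__ == '__main__':
    n_workers = 4
    q = multiprocessing.Queue(20)        # bounded: put() blocks when 20 batches are pending
    results_q = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=consumer, args=(q, results_q))
               for _ in range(n_workers)]
    for w in workers:
        w.start()

    n_batches = 0
    batch = []
    for item in range(1000):             # stand-in for generator()
        batch.append(item)
        if len(batch) == 10:
            q.put(batch)                 # blocks here if the consumers fall behind
            n_batches += 1
            batch = []
    if batch:
        q.put(batch)
        n_batches += 1
    for _ in workers:
        q.put(None)                      # one sentinel per worker

    results = [results_q.get() for _ in range(n_batches)]
    for w in workers:
        w.join()
    print(sum(results))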
Alternatively, you could use a counting semaphore (e.g., multiprocessing.BoundedSemaphore(maxSize)). Acquire it each time you get a work item from the generator and release it in your work function (my_fun) once the item is processed. This way, the maximum number of work items waiting to be processed will never exceed the initial value of the semaphore.
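And a sketch of the semaphore variant that keeps the original Pool structure; here the semaphore is released in the parent via apply_async's callback rather than inside my_fun, which avoids having to share it with the workers (the limit of 20 pending batches and the my_fun body are stand-ins):
import multiprocessing

MAX_PENDING = 20  # stand-in limit on batches in flight

def my_fun(batch):
    return sum(batch)  # stand-in for the real work

if __name__ == '__main__':
    sem = multiprocessing.BoundedSemaphore(MAX_PENDING)
    results = []
    with multiprocessing.Pool(processes=4) as pool:
        batch = []
        for item in range(1000):                         # stand-in for generator()
            batch.append(item)
            if len(batch) == 10:
                sem.acquire()                            # blocks when MAX_PENDING batches are pending
                results.append(pool.apply_async(
                    my_fun, args=(batch,),
                    callback=lambda r: sem.release(),        # free a slot on success
                    error_callback=lambda e: sem.release(),  # ...or on failure
                ))
                batch = []
        if batch:                                        # leftovers
            sem.acquire()
            results.append(pool.apply_async(
                my_fun, args=(batch,),
                callback=lambda r: sem.release(),
                error_callback=lambda e: sem.release(),
            ))
        pool.close()
        pool.join()
    print(sum(r.get() for r in results))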
Use the grouper itertools recipe to chunk the data from your generator.
Use the infrastructure in concurrent futures to handle task submission and retrieval with the processes.
You could
submit a group of tasks; wait for them to finish; then submit another group, or
keep the pipeline full by submitting a new task each time one completes.
Setup (attempt to simulate your process):
import concurrent.futures
import itertools, time, collections, random
from pprint import pprint

# from itertools recipes
def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)

# generator/iterator facsimile
class G:
    '''Long-winded range(n)'''
    def __init__(self, n=108):
        self.n = n
        self.a = []
    def __iter__(self):
        return self
    def __next__(self):
        #self.a.append(time.perf_counter())
        if self.n < 0:
            raise StopIteration
        x = self.n
        self.n -= 1
        return x

def my_func(*args):
    time.sleep(random.randint(1,10))
    return sum(*args)
Wait for groups of tasks to complete
if __name__ == '__main__':
    nworkers = 4
    g = G()
    # generate data three-at-a-time
    data = grouper(g, 3, 0)
    results = []
    fs = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=nworkers) as executor:
        for args in data:
            print(f'pending:{len(executor._pending_work_items)}')
            # block submission - limit pending tasks to conserve resources (memory)
            if len(executor._pending_work_items) == nworkers:
                # wait till all complete and get the results
                futures = concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)
                #print(futures)
                results.extend(future.result() for future in futures.done)
                fs = list(futures.not_done)
            # add a new task
            fs.append(executor.submit(my_func, args))

        # data exhausted - get leftover results as they finish
        for future in concurrent.futures.as_completed(fs):
            print(f'pending:{len(executor._pending_work_items)}')
            result = future.result()
            results.append(result)

    pprint(results)
Keep the process pool full.
if __name__ == '__main__':
    nworkers = 4
    g = G()
    # generate data three-at-a-time
    data = grouper(g, 3, 0)
    results = []
    fs = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=nworkers) as executor:
        for args in data:
            print(f'pending:{len(executor._pending_work_items)}')
            # block submission - limit pending tasks to conserve resources (memory)
            if len(executor._pending_work_items) == nworkers:
                # wait till one completes and get the result
                futures = concurrent.futures.wait(fs, return_when=concurrent.futures.FIRST_COMPLETED)
                #print(futures)
                results.extend(future.result() for future in futures.done)
                fs = list(futures.not_done)
            # add a new task
            fs.append(executor.submit(my_func, args))

        # data exhausted - get leftover results as they finish
        for future in concurrent.futures.as_completed(fs):
            print(f'pending:{len(executor._pending_work_items)}')
            result = future.result()
            results.append(result)

    pprint(results)