I am creating a table that shows the running processes, with a decorator to update this information. Using the decorator as in the code below causes the GUI to hang every time the single shot fires (every second).
Why is the single shot causing the GUI to hang, and how can I improve the logic?
# First create table
data = getProcesses()
tableWidget = QTableWidget()
Layout.addWidget(tableWidget)
fillTable(data, len(data['pid']), len(data), tableWidget)
# get the processes
def getProcesses():
    allprocesses = {}
    for p in psutil.process_iter():
        try:
            if p.name().lower() in ["python.exe", "pythonw.exe"]: # console, window
                with p.oneshot():
                    allprocesses.setdefault('pid', []).append(p.pid)
                    allprocesses.setdefault('memory(MB)', []).append(p.memory_full_info().uss/(1024**2))
                    allprocesses.setdefault('memory(%)', []).append(p.memory_percent(memtype="rss"))
                    allprocesses.setdefault('cpu_times(s)', []).append(sum(p.cpu_times()[:2]))
                    allprocesses.setdefault('create_time', []).append(datetime.datetime.fromtimestamp(p.create_time()).strftime("%Y-%m-%d %H:%M:%S"))
                    allprocesses.setdefault('cpu(%)', []).append(p.cpu_percent()/psutil.cpu_count())
        except:
            continue
    del p
    return allprocesses
def updateInfo(data, table):
    try:
        table.clear()
        for p in psutil.process_iter():
            if p.pid in data['pid']:
                try:
                    with p.oneshot():
                        data['memory(MB)'][data['pid'].index(p.pid)] = p.memory_full_info().uss/(1024**2)
                        data['memory(%)'][data['pid'].index(p.pid)] = p.memory_percent(memtype="rss")
                        data['cpu_times(s)'][data['pid'].index(p.pid)] = sum(p.cpu_times()[:2])
                        data['cpu(%)'][data['pid'].index(p.pid)] = p.cpu_percent()/psutil.cpu_count()
                    self.fillTable(data, len(data['pid']), len(data), table)
                except:
                    continue
    except:
        pass
def tabledecorator(func):
    @functools.wraps(func)
    def wrapper(data, r, c, table):
        func(data, r, c, table)
        QTimer.singleShot(1000, lambda: self.updateInfo(data, table))
    return wrapper

@tabledecorator
def fillTable(data, r, c, table):
    table.setRowCount(r)
    table.setColumnCount(c)
    horHeaders = []
    for n, key in enumerate(reversed(sorted(data.keys()))):
        horHeaders.append(key)
        for m, item in enumerate(data[key]):
            newitem = QTableWidgetItem()
            newitem.setData(Qt.DisplayRole, item)
            table.setItem(m, n, newitem)
    table.setHorizontalHeaderLabels(horHeaders)
    table.resizeColumnsToContents()
    table.resizeRowsToContents()
    del horHeaders, n, key, m, item, newitem
There are various performance issues in your implementation, but the most important is that fillTable is called for every item.
Since that function is decorated with the timer, the result is that you schedule a delayed updateInfo for each row of the table, and since that function again calls the decorated fillTable, you actually have a huge recursion problem: at every new cycle, the number of function calls grows exponentially.
If you have 2 matching processes, the first time updateInfo is called it will call fillTable two times while creating two QTimers. After one second you'll have two calls to updateInfo, resulting in 4 calls (2 processes multiplied by 2 calls to updateInfo); after another second there will be 8, then 16, and so on.
Another problem with your code is that at each call to fillTable you're calling three functions that should only be executed once per cycle:
setHorizontalHeaderLabels;
resizeColumnsToContents;
resizeRowsToContents;
The first one is pointless in any case, since the column labels will certainly not change through the lifetime of your program, and when that function is called it cycles through all its header items to check if any has changed.
The other two are very demanding in terms of performance, since the view is forced to query all items and call underlying functions to compute size hints of each row and column and then adapt the sizes accordingly.
There's really no point in using a single shot timer for this purpose, as the problem is that you're relying on function arguments to update the data, while a better approach is to properly use objects and references.
Other performance issues in your implementation:
since the keys of the dictionary are known and won't change, there's no point in using setdefault();
you're constantly retrieving the same lists at each cycle;
you're constructing the search list every time (which is time and memory consuming);
some values are clearly constant, so there's no need to compute or retrieve them at every cycle;
A possible reimplementation and simplification of your code could be the following:
create a normal QTimer in __init__ that updates the process list (which might change for obvious reasons) and refreshes the table;
create an empty dictionary with all keys set as empty lists;
set the table with the predefined column count and horizontal labels;
create a function that cycles through the processes, if one already exists, update its data, otherwise create a new row and append the new data;
optimize the function to improve its execution time and memory usage;
CpuCount = psutil.cpu_count()
MemoryRatio = 1024 ** 2

class ProcView(QtWidgets.QWidget):
    def __init__(self):
        super().__init__()
        layout = QtWidgets.QVBoxLayout(self)
        self.table = QtWidgets.QTableWidget(0, 6)
        layout.addWidget(self.table)

        self.headers = 'pid', 'memory(MB)', 'memory(%)', 'cpu_times(s)', 'create_time', 'cpu(%)'
        self.updateColumns = 1, 2, 3, 5
        self.table.setHorizontalHeaderLabels(self.headers)

        self.procTimer = QtCore.QTimer(interval=1000, timeout=self.updateInfo)
        self.filter = 'python3', 'python'
        self.procs = {header: [] for header in self.headers}

        self.procTimer.start()
        self.updateInfo()

    def updateInfo(self):
        pids = self.procs['pid']
        memoryMb = self.procs['memory(MB)']
        memoryPerc = self.procs['memory(%)']
        cpu_times = self.procs['cpu_times(s)']
        create_times = self.procs['create_time']
        cpuPerc = self.procs['cpu(%)']
        for p in psutil.process_iter():
            if not p.name().lower() in self.filter:
                continue
            with p.oneshot():
                if p.pid not in pids:
                    row = len(pids)
                    self.table.insertRow(row)
                    pids.append(p.pid)
                    memoryMb.append(p.memory_full_info().uss / MemoryRatio)
                    memoryPerc.append(p.memory_percent(memtype="rss"))
                    cpu_times.append(sum(p.cpu_times()[:2]))
                    create_times.append(datetime.datetime.fromtimestamp(p.create_time()).strftime("%Y-%m-%d %H:%M:%S"))
                    cpuPerc.append(p.cpu_percent() / CpuCount)
                    for col, header in enumerate(self.headers):
                        item = QtWidgets.QTableWidgetItem()
                        item.setData(QtCore.Qt.DisplayRole, self.procs[header][row])
                        self.table.setItem(row, col, item)
                else:
                    row = pids.index(p.pid)
                    memoryMb[row] = p.memory_full_info().uss / MemoryRatio
                    memoryPerc[row] = p.memory_percent(memtype="rss")
                    cpu_times[row] = sum(p.cpu_times()[:2])
                    cpuPerc[row] = p.cpu_percent() / CpuCount
                    for col in self.updateColumns:
                        item = self.table.item(row, col)
                        item.setData(QtCore.Qt.DisplayRole, self.procs[self.headers[col]][row])
        self.table.resizeColumnsToContents()
        self.table.resizeRowsToContents()
Note that a proper implementation should possibly:
avoid a dictionary with fields as keys and lists for each field, but eventually a dictionary with pid as keys and a dictionary for each item's field (a specialized class that works as an abstraction layer for the process would be even better);
use a custom model (with a QTableView);
check whether processes have terminated;
use fixed row sizes and avoid automatic column resizing at each cycle for all columns (it's better to use the Fixed resize mode for common fixed-size columns like the CPU usage and leave the resizing to the user); a minimal sketch of this is shown below.
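For the last point, a minimal sketch of what the header setup could look like in __init__, assuming PyQt5 imports; columns 2 and 5 are the percentage columns of the headers tuple above:

# inside ProcView.__init__, after creating self.table (assumes PyQt5)
header = self.table.horizontalHeader()
# let the user resize most columns instead of recomputing sizes every cycle
header.setSectionResizeMode(QtWidgets.QHeaderView.Interactive)
# fixed width for the small numeric columns (memory(%) and cpu(%))
for col in (2, 5):
    header.setSectionResizeMode(col, QtWidgets.QHeaderView.Fixed)
    header.resizeSection(col, 80)
# fixed row heights, so no per-row size hints are queried
vheader = self.table.verticalHeader()
vheader.setSectionResizeMode(QtWidgets.QHeaderView.Fixed)
vheader.setDefaultSectionSize(self.fontMetrics().height() + 8)

With header modes set up like this, the per-cycle resizeColumnsToContents() and resizeRowsToContents() calls in updateInfo become unnecessary.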
Related
I have implemented an Evolutionary Algorithm process in Python 3.8 and am attempting to optimise/reduce its runtime. Due to the heavy constraints upon valid solutions, it can take a few minutes to generate valid chromosomes. To avoid spending hours just generating the initial population, I want to use multiprocessing to generate multiple at a time.
My code at this point in time is:
populationCount = 500

def readDistanceMatrix():
    # code removed

def generateAvailableValues():
    # code removed

def generateAvailableValuesPerColumn():
    # code removed

def generateScheduleTemplate():
    # code removed

def generateChromosome():
    # code removed

if __name__ == '__main__':
    # Data type = DataFrame
    distanceMatrix = readDistanceMatrix()

    # Data type = List of Integers
    availableValues = generateAvailableValues()

    # Data type = List containing Lists of Integers
    availableValuesPerColumn = generateAvailableValuesPerColumn(availableValues)

    # Data type = DataFrame
    scheduleTemplate = generateScheduleTemplate(distanceMatrix)

    # Data type = List containing custom class (with Integer and DataFrame)
    population = []
    while len(population) < populationCount:
        chrmSolution = generateChromosome(availableValuesPerColumn, scheduleTemplate, distanceMatrix)
        population.append(chrmSolution)
Where the population list is filled in with the while loop at the end. I would like to replace the while loop with a Multiprocessing solution that can use up to a pre-set number of cores. For example:
population = []
availableCores = 6
while len(population) < populationCount:
    while usedCores < availableCores:
        # start generating another chromosome as 'chrmSolution'
        population.append(chrmSolution)
However, after reading and watching hours' worth of tutorials, I'm unable to get a loop up and running. How should I go about doing this?
It sounds like a simple multiprocessing.Pool should do the trick, or at least be a place to start. Here's a simple example of how that might look:
from multiprocessing import Pool, cpu_count

child_globals = {}  # mutable object at the `module` level acts as container for globals (constants)

if __name__ == '__main__':
    # ...

    def init_child(availableValuesPerColumn, scheduleTemplate, distanceMatrix):
        # passing variables to the child process every time is inefficient if they're
        # constant, so instead pass them to the initialization function, and let
        # each child re-use them each time generateChromosome is called
        child_globals['availableValuesPerColumn'] = availableValuesPerColumn
        child_globals['scheduleTemplate'] = scheduleTemplate
        child_globals['distanceMatrix'] = distanceMatrix

    def child_work(i):
        # child_work simply wraps generateChromosome with inputs, and throws out dummy `i` from `range()`
        return generateChromosome(child_globals['availableValuesPerColumn'],
                                  child_globals['scheduleTemplate'],
                                  child_globals['distanceMatrix'])

    with Pool(cpu_count(),
              initializer=init_child,  # init function to stuff some constants into the child's global context
              initargs=(availableValuesPerColumn, scheduleTemplate, distanceMatrix)) as p:
        # imap_unordered doesn't make child processes wait to ensure order is preserved,
        # so it keeps the cpu busy more often. it returns a generator, so we use list()
        # to store the results into a list.
        population = list(p.imap_unordered(child_work, range(populationCount)))
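Since the question asks for a pre-set number of cores rather than all of them, a small variation (reusing the availableCores value from the question's sketch) is to cap the pool size:

    availableCores = 6  # pre-set cap, as in the question's pseudocode
    with Pool(min(availableCores, cpu_count()),
              initializer=init_child,
              initargs=(availableValuesPerColumn, scheduleTemplate, distanceMatrix)) as p:
        population = list(p.imap_unordered(child_work, range(populationCount)))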
Is there a way to run a function in parallel within an already parallelised function? I know that this is not possible with multiprocessing.Pool(), as a daemonic process cannot create child processes. I am fairly new to parallel computing and am struggling to find a workaround.
I currently have several thousand calculations that need to be run in parallel using some other commercially available quantum mechanical code I interface to. Each calculation has three subsequent calculations that need to be executed in parallel on normal termination of the parent calculation; if the parent calculation does not terminate normally, that is the end of the calculation for that point. I could always combine these three subsequent calculations into one big calculation and run normally - although I would much prefer to run them separately in parallel.
Main currently looks like this: run() is the parent calculation that is first run in parallel for a series of points, and par_nacmes() is the function that I want to run in parallel for the three child calculations following normal termination of the parent.
def par_nacmes(nacme_input_data):
    nacme_dir, nacme_input, index = nacme_input_data  # Unpack info in tuple for the calculation
    axes_index = get_axis_index(nacme_input)
    [norm_term, nacme_outf] = util.run_calculation(molpro_keys, pwd, nacme_dir, nacme_input, index)  # Submit child calculation
    if norm_term:
        data.extract_nacme(nacme_outf, molpro_keys['nacme_regex'], index, axes_index)
    else:
        with open('output.log', 'w+') as f:
            f.write('NACME Crashed for GP%s - axis %s' % (index, axes_index))


def run(grid_point):
    index, geom = grid_point
    if inputs['code'] == 'molpro':
        [spe_dir, spe_input] = molpro.setup_spe(inputs, geom, pwd, index)
        [norm_term, spe_outf] = util.run_calculation(molpro_keys, pwd, spe_dir, spe_input, index)  # Run each parent calculation
        if norm_term:  # If parent calculation terminates normally - extract data and continue with subsequent calculations for each point
            data.extract_energies(spe_dir+spe_outf, inputs['spe'], molpro_keys['energy_regex'],
                                  molpro_keys['cas_prog'], index)
            if inputs['nacme'] == 'yes':
                [nacme_dir, nacmes_inputs] = molpro.setup_nacme(inputs, geom, spe_dir, index)
                nacmes_data = [(nacme_dir, nacme_inp, index) for nacme_inp in nacmes_inputs]  # List of three tuples - each with three elements. Each tuple describes a child calculation to be run in parallel
                nacme_pool = multiprocessing.Pool()
                nacme_pool.map(par_nacmes, [nacme_input for nacme_input in nacmes_data])  # Run each calculation in list of tuples in parallel
            if inputs['grad'] == 'yes':
                pass
        else:
            with open('output.log', 'w+') as f:
                f.write('SPE crashed for GP%s' % index)
    elif inputs['code'] == 'molcas':  # TO DO
        pass


if __name__ == "__main__":
    try:
        pwd = os.getcwd()  # parent dir
        f = open(inp_geom, 'r')
        ref_geom = np.genfromtxt(f, skip_header=2, usecols=(1, 2, 3), encoding=None)
        f.close()
        geom_list = coordinate_generator(ref_geom)  # Generate nuclear coordinates
        if inputs['code'] == 'molpro':
            couplings = molpro.coupled_states(inputs['states'][-1])
        elif inputs['code'] == 'molcas':
            pass
        data = setup.global_data(ref_geom, inputs['states'][-1], couplings, len(geom_list))
        run_pool = multiprocessing.Pool()
        run_pool.map(run, [(k, v) for k, v in enumerate(geom_list)])  # Run each parent calculation for each set of coordinates
    except StopIteration:
        print('Please ensure geometry file is correct.')
Any insight on how to run these child calculations in parallel for each point would be a great help. I have seen some people suggest using multi-threading instead, or setting daemon to False, although I am unsure if this is the best way to do this.
Firstly, I don't know why you have to run par_nacmes in parallel, but if you have to you could:
(a) use threads to run them instead of processes, or
(b) use multiprocessing.Process to run run(); however, that would involve a lot of overhead, so I personally wouldn't do it (a sketch of this option follows the threading snippets below).
For (a), all you have to do is
replace
nacme_pool = multiprocessing.Pool()
nacme_pool.map(par_nacmes, [nacme_input for nacme_input in nacmes_data])
in run()
with
from threading import Thread  # at the top of the file

threads = []
for nacme_input in nacmes_data:
    t = Thread(target=par_nacmes, args=(nacme_input,)); t.start()
    threads.append(t)
for t in threads:
    t.join()
or, if you don't care whether the threads have finished or not:
for nacme_input in nacmes_data:
    t = Thread(target=par_nacmes, args=(nacme_input,)); t.start()
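For completeness, a rough sketch of option (b): replace the outer run_pool with plain multiprocessing.Process objects, which are non-daemonic by default, so each run() remains free to create its own Pool for the NACME children. The batching by availableCores is a hypothetical way to limit how many parent calculations run at once:

import multiprocessing

def run_all(grid_points, availableCores=6):
    # run the parent calculations in batches of non-daemonic processes,
    # so each run() can still spawn its own multiprocessing.Pool inside
    points = list(grid_points)
    for start in range(0, len(points), availableCores):
        batch = points[start:start + availableCores]
        procs = [multiprocessing.Process(target=run, args=(gp,)) for gp in batch]
        for p in procs:
            p.start()
        for p in procs:
            p.join()

# usage, instead of run_pool.map(...):
# run_all(enumerate(geom_list))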
Each function (func1, etc) makes a request to a different url:
def thread_map(ID):
    func_switch = \
        {
            0: func1,
            1: func2,
            2: func3,
            3: func4
        }
    with ThreadPoolExecutor(max_workers=len(func_switch)) as threads:
        futures = [threads.submit(func_switch[i], ID) for i in func_switch]
        results = [f.result() for f in as_completed(futures)]
    for df in results:
        if not df.empty and df['x'][0] != '':
            return df
    return pd.DataFrame()
This is much faster (1.75 sec) compared to a for loop (4 sec), but the results are unordered.
How can each function be executed in parallel while still allowing the results to be checked in the order the functions were submitted?
Preferably as background threads returning the corresponding dataframes, starting with func1: if the conditions for func1 are not met, check func2, and so on, given that the results have already been fetched in the background. Each dataframe is different, but they all contain the same common column x.
Any suggestions are highly appreciated, and I hope ThreadPoolExecutor is appropriate for this scenario. Thanks!
First, let's do it as you are asking:
with ThreadPoolExecutor(max_workers=len(func_switch)) as threads:
    futures = [threads.submit(func_switch[i], ID) for i in func_switch]
    results = [f.result() for f in futures]
That was simple enough.
To process the futures as they are completed and place the results in the list in the order in which the futures were created, you need to associate with each future the order in which it was created:
futures = {}  # this time a dictionary
creation_order = 0
with ThreadPoolExecutor(max_workers=len(func_switch)) as threads:
    for i in func_switch:
        future = threads.submit(func_switch[i], ID)
        futures[future] = creation_order  # map the future to this value or any other values you want, such as the arguments being passed to the function, which happens to be the creation order
        creation_order += 1
    results = [None] * creation_order  # preallocate results
    for f in as_completed(futures):
        result = f.result()
        index = futures[f]  # recover original creation_order:
        results[index] = result
Of course, if you are waiting for all the futures to complete before you do anything with them, there is no point in using the as_completed method. I just wanted to show, for the case where that isn't true, the method for associating a completed future back with its original creation order (or, perhaps more usefully, with the original arguments used in the call to the worker function that created the future). An alternative is for the worker function to return the passed arguments as part of its result, as sketched below.
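A minimal sketch of that alternative, using a small hypothetical wrapper around the functions from the question so each result carries its own index:

def run_with_index(i, ID):
    # hypothetical wrapper: return the function's index together with its result
    return i, func_switch[i](ID)

with ThreadPoolExecutor(max_workers=len(func_switch)) as threads:
    futures = [threads.submit(run_with_index, i, ID) for i in func_switch]
    results = [None] * len(futures)
    for f in as_completed(futures):
        i, df = f.result()
        results[i] = df  # results ends up ordered by function index (func1 first)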
The following for loop is part of an iterative simulation process and is the main bottleneck regarding computational time:
import numpy as np

class Simulation(object):
    def __init__(self, n_int):
        self.n_int = n_int

    def loop(self):
        for itr in range(self.n_int):
            # some preceding code which updates rows_list and diff with every itr
            cols_red_list = []
            rows_list = list(range(2500))  # row idx for diff where a negative element is known to appear
            diff = np.random.uniform(-1.323, 3.780, (2500, 300))  # np.random.uniform is just used as a toy example
            for row in rows_list:
                col = next(idx for idx, val in enumerate(diff[row, :]) if val < 0)
                cols_red_list.append(col)
            # some subsequent code which uses the cols_red_list data

sim1 = Simulation(n_int=10)
sim1.loop()
Hence, I tried to parallelize it using the multiprocessing package, in the hope of reducing the computation time:
import numpy as np
from multiprocessing import Pool, cpu_count
from functools import partial

def crossings(row, diff):
    return next(idx for idx, val in enumerate(diff[row, :]) if val < 0)

class Simulation(object):
    def __init__(self, n_int):
        self.n_int = n_int

    def loop(self):
        for itr in range(self.n_int):
            # some preceding code which updates rows_list and diff with every itr
            rows_list = list(range(2500))
            diff = np.random.uniform(-1, 1, (2500, 300))
            if __name__ == '__main__':
                num_of_workers = cpu_count()
                print('number of CPUs : ', num_of_workers)
                pool = Pool(num_of_workers)
                cols_red_list = pool.map(partial(crossings, diff=diff), rows_list)
                pool.close()
                print(len(cols_red_list))
            # some subsequent code which uses the cols_red_list data

sim1 = Simulation(n_int=10)
sim1.loop()
Unfortunately, the parallelization turns out to be much slower than the sequential piece of code.
Hence my question: did I use the multiprocessing package properly in that particular example? Are there alternative ways to parallelize the above-mentioned for loop?
Disclaimer: As you're trying to reduce the runtime of your code through parallelisation, this doesn't strictly answer your question but it might still be a good learning opportunity.
As a golden rule, before moving to multiprocessing to improve performance (execution time), one should first optimise the single-threaded case.
Your
rows_list = list(range(2500))
generates the numbers 0 to 2499 (that's the range) and stores them in memory (the list), which takes time to allocate the required memory and do the actual writes. You then use each of these predictable values only once, by reading them from memory (which also takes time), in a predictable order:
for row in rows_list:
This is particularly relevant to the runtime of your loop function as you do it repeatedly (for itr in range(n_int):).
Instead, consider generating the number only when you need it, without an intermediate store (which conceptually removes any need to access RAM):
for row in range(2500):
Secondly, on top of sharing the same issue (unnecessary accesses to memory), the following:
diff = np.random.uniform(-1, 1, (2500, 300))
# ...
col = next(idx for idx, val in enumerate(diff[row,:]) if val < 0)
seems to me to be optimisable at the level of math (or logic).
What you're trying to do is get a random variable (that col index) by defining it as "the first time I encounter a random variable in [-1;1] that is lower than 0". But notice that figuring out if a random variable with a uniform distribution over [-α;α] is negative, is the same as having a random variable over {0,1} (i.e. a bool).
Therefore, you're now working with bools instead of floats and you don't even have to do the comparison (val < 0) as you already have a bool. This potentially makes the code much faster. Using the same idea as for rows_list, you can generate that bool only when you need it, testing it until it is True (or False, choose one, it doesn't matter obviously). By doing so, you only generate as many random bools as you need, not more and not less (BTW, what happens in your code if none of the 300 elements in the row is negative? ;) ):
import itertools
import random

for _ in range(n_int):
    cols_red_list = []
    for row in range(2500):
        col = next(i for i in itertools.count() if random.getrandbits(1))
        cols_red_list.append(col)
or, with a list comprehension (assuming from itertools import count and from random import getrandbits):
cols_red_list = [next(i for i in count() if getrandbits(1))
                 for _ in range(2500)]
I'm sure that, through proper statistical analysis, you can even express that col random variable as a non-uniform variable over [0;limit[, allowing you to compute it much faster; a sketch of that idea follows.
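A minimal sketch of that idea, assuming we ignore the truncation at 300 columns: the index of the first negative draw follows a geometric distribution with p = 0.5, so the whole column list can be produced in one vectorised call:

import numpy as np

rng = np.random.default_rng()
# Generator.geometric returns the number of trials up to and including the
# first success (>= 1), so subtract 1 to get the zero-based column index
cols_red_list = rng.geometric(p=0.5, size=2500) - 1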
Please test the performance of an "optimized" version of your single-threaded implementation first. If the runtime is still not acceptable, you should then look into multithreading.
multiprocessing uses system processes (not threads!) for parallelization, which require expensive IPC (inter-process communication) to share data.
This bites you in two spots:
diff = np.random.uniform(-1, 1, (2500, 300)) creates a large matrix which is expensive to pickle/copy to another process
rows_list = list(range(2500)) creates a smaller list, but the same applies here.
To avoid this expensive IPC, you have one and a half choices:
If you're on a POSIX-compliant system, initialize your variables at the module level; that way each process gets a quick-and-dirty copy of the required data. This is not scalable, as it requires POSIX, a weird architecture (you probably don't want to put everything at the module level), and doesn't support sharing changes to that data.
Use shared memory. This mostly supports only primitive data types, but mp.Array should cover your needs.
The second problem is that setting up a pool is expensive, as num_cpu processes need to be started. Your workload is small enough to be negligible compared to this overhead. A good practice is to only create one pool and reuse it.
Here is a quick-and-dirty example of the POSIX only solution:
import numpy as np
from multiprocessing import Pool, cpu_count
from functools import partial

n_int = 10
rows_list = np.array(range(2500))
diff = np.random.uniform(-1, 1, (2500, 300))

def crossings(row, diff):
    return next(idx for idx, val in enumerate(diff[row, :]) if val < 0)

def workload(_):
    cols_red_list = [crossings(row, diff) for row in rows_list]
    print(len(cols_red_list))

class Simulation(object):
    def loop(self):
        num_of_workers = cpu_count()
        with Pool(num_of_workers) as pool:
            pool.map(workload, range(10))
            pool.close()

sim1 = Simulation()
sim1.loop()
For me (and my two cores) this is roughly twice as fast as the sequential version.
Update with shared memory:
import numpy as np
from multiprocessing import Pool, cpu_count, Array
from functools import partial

n_int = 10
ROW_COUNT = 2500

### WORKER

diff = None
result = None

def init_worker(*args):
    global diff, result
    (diff, result) = args

def crossings(i):
    result[i] = next(idx for idx, val in enumerate(diff[i*300:(i+1)*300]) if val < 0)

### MAIN

class Simulation():
    def loop(self):
        num_of_workers = cpu_count()
        diff = Array('d', range(ROW_COUNT*300), lock=False)
        result = Array('i', ROW_COUNT, lock=False)

        # Shared memory needs to be passed when workers are spawned
        pool = Pool(num_of_workers, initializer=init_worker, initargs=(diff, result))
        for i in range(n_int):
            # SLOW, I assume you use a different source of values anyway.
            diff[:] = np.random.uniform(-1, 1, ROW_COUNT*300)
            pool.map(partial(crossings), range(ROW_COUNT))
            print(len(result))
        pool.close()

sim1 = Simulation()
sim1.loop()
A few notes:
Shared memory needs to be set up at worker creation, so it's global anyway.
This still isn't faster than the sequential version, but that's mainly due to the np.random.uniform output needing to be copied entirely into shared memory. I assume these are just values for testing, and in reality you'd fill it differently anyway (a sketch of filling the shared array in place follows these notes).
I only pass indices to the worker, and use them to read and write values to the shared memory.
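As a sketch of how that in-place filling could look, assuming the lock-free Array from the example above (which exposes the buffer protocol): wrap the shared memory in a NumPy view and write straight into it instead of assigning through the ctypes slice:

import numpy as np
from multiprocessing import Array

ROW_COUNT = 2500
diff = Array('d', ROW_COUNT * 300, lock=False)

# NumPy view over the shared buffer: writes land directly in shared memory,
# avoiding the slow element-by-element ctypes slice assignment
diff_np = np.frombuffer(diff, dtype=np.float64).reshape(ROW_COUNT, 300)

rng = np.random.default_rng()
rng.random(out=diff_np)   # fill in place with uniform values in [0, 1)
diff_np *= 2
diff_np -= 1              # rescale to [-1, 1)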
I'm performing analyses of time series of simulations. Basically, it's doing the same tasks for every time step. As there is a very high number of time steps, and as the analysis of each of them is independent, I wanted to create a function that can multiprocess another function. The latter will have arguments and return a result.
Using a shared dictionary and the concurrent.futures library, I managed to write this:
import multiprocessing as mlp
import concurrent.futures as Cfut

def multiprocess_loop_grouped(function, param_list, group_size, Nworkers, *args):
    # function : function that is running in parallel
    # param_list : list of items
    # group_size : size of the groups
    # Nworkers : number of group/items running in the same time
    # **param_fixed : passing parameters
    manager = mlp.Manager()
    dic = manager.dict()

    executor = Cfut.ProcessPoolExecutor(Nworkers)
    futures = [executor.submit(function, param, dic, *args)
               for param in grouper(param_list, group_size)]
    Cfut.wait(futures)
    return [dic[i] for i in sorted(dic.keys())]
Typically, I can use it like this:
def read_file(files, dictionnary):
    for file in files:
        i = int(file[4:9])
        #print(str(i))
        if 'bz2' in file:
            os.system('bunzip2 ' + file)
            file = file[:-4]
        dictionnary[i] = np.loadtxt(file)
        os.system('bzip2 ' + file)

Map = np.array(multiprocess_loop_grouped(read_file, list_alti, Group_size, N_thread))
or like this:
def autocorr(x):
    result = np.correlate(x, x, mode='full')
    return result[result.size//2:]

def find_lambda_finger(indexes, dic, Deviation):
    for i in indexes:
        #print(str(i))
        # Beach = Deviation[i,:] - np.mean(Deviation[i,:])
        dic[i] = Anls.find_first_max(autocorr(Deviation[i,:]), valmax=True)

args = [Deviation]
Temp = Rescal.multiprocess_loop_grouped(find_lambda_finger, range(Nalti), Group_size, N_thread, *args)
Basically, it is working. But it is not working well. Sometimes it crashes. Sometimes it actually launches a number of Python processes equal to Nworkers, and sometimes there are only 2 or 3 of them running at a time even though I specified Nworkers = 15.
For example, a classic error I obtain is described in the following topic I raised: Calling matplotlib AFTER multiprocessing sometimes results in error: main thread not in main loop
What is the more Pythonic way to achieve what I want? How can I improve control over this function? How can I better control the number of running Python processes?
One of the basic concepts for Python multiprocessing is using queues. It works quite well when you have an input list that can be iterated over and which does not need to be altered by the sub-processes. It also gives you good control over all the processes: you spawn the number you want, and you can leave them idle or stop them.
It is also a lot easier to debug. Sharing data explicitly is usually an approach that is much more difficult to set up correctly.
Queues can hold almost anything (any picklable object), so you can fill them with file path strings for reading files, plain numbers for doing calculations, or even images for drawing.
In your case a layout could look like that:
import multiprocessing as mp
import numpy as np
import itertools as it


def worker1(in_queue, out_queue):
    # holds when nothing is available, stops when 'STOP' is seen
    for a in iter(in_queue.get, 'STOP'):
        # do something
        out_queue.put({a: result})  # return your result linked to the input

def worker2(in_queue, out_queue):
    for a in iter(in_queue.get, 'STOP'):
        # do something differently
        out_queue.put({a: result})  # return your result linked to the input

def multiprocess_loop_grouped(function, param_list, group_size, Nworkers, *args):
    # your final result
    result = {}

    in_queue = mp.Queue()
    out_queue = mp.Queue()

    # fill your input
    for a in param_list:
        in_queue.put(a)
    # stop command at end of input
    for n in range(Nworkers):
        in_queue.put('STOP')

    # setup your worker process doing task as specified
    process = [mp.Process(target=function,
               args=(in_queue, out_queue), daemon=True) for x in range(Nworkers)]

    # run processes
    for p in process:
        p.start()

    # wait for processes to finish
    for p in process:
        p.join()

    # collect your results from the calculations
    for a in param_list:
        result.update(out_queue.get())

    return result

temp = multiprocess_loop_grouped(worker1, param_list, group_size, Nworkers, *args)
map = multiprocess_loop_grouped(worker2, param_list, group_size, Nworkers, *args)
It can be made a bit more dynamic if you are afraid that your queues will run out of memory. Then you need to fill and empty the queues while the processes are running; a rough sketch of that idea follows.
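A rough sketch of that more dynamic variant, keeping the same worker signature as above: use bounded queues plus a feeder thread, so the input is topped up and the output drained while the workers run:

import multiprocessing as mp
import threading

def multiprocess_loop_streaming(function, param_list, Nworkers, maxsize=100):
    result = {}
    in_queue = mp.Queue(maxsize=maxsize)    # bounded: the feeder blocks instead of filling RAM
    out_queue = mp.Queue(maxsize=maxsize)

    def feeder():
        for a in param_list:
            in_queue.put(a)                 # blocks while the queue is full
        for _ in range(Nworkers):
            in_queue.put('STOP')

    threading.Thread(target=feeder, daemon=True).start()

    workers = [mp.Process(target=function, args=(in_queue, out_queue), daemon=True)
               for _ in range(Nworkers)]
    for p in workers:
        p.start()

    # drain results while the workers are still producing them
    for _ in param_list:
        result.update(out_queue.get())

    for p in workers:
        p.join()
    return result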
Final words: it is not more Pythonic, as you had asked for, but it is easier to understand for a newbie ;-)