I'm trying to solve a problem where I store the locations and counts of substrings of a given length. Since the strings can be long (genome sequences), I'm trying to use multiple processes to speed things up. While the program runs, the variables that store the results seem to lose all their information once the worker processes finish.
import numpy
import multiprocessing
from multiprocessing.managers import BaseManager, DictProxy
from collections import defaultdict, namedtuple, Counter
from functools import partial
import ctypes as c
class MyManager(BaseManager):
pass
MyManager.register('defaultdict', defaultdict, DictProxy)
def gc_count(seq):
return int(100 * ((seq.upper().count('G') + seq.upper().count('C') + 0.0) / len(seq)))
def getreads(length, table, counts, genome):
genome_len = len(genome)
for start in range(0,genome_len):
gc = gc_count(genome[start:start+length])
table[ (length, gc) ].append( (start) )
counts[length,gc] +=1
if __name__ == "__main__":
g = 'ACTACGACTACGACTACGCATCAGCACATACGCATACGCATCAACGACTACGCATACGACCATCAGATCACGACATCAGCATCAGCATCACAGCATCAGCATCAGCACTACAGCATCAGCATCAGCATCAG'
genome_len = len(g)
mgr = MyManager()
mgr.start()
m = mgr.defaultdict(list)
mp_arr = multiprocessing.Array(c.c_double, 10*101)
arr = numpy.frombuffer(mp_arr.get_obj())
count = arr.reshape(10,101)
pool = multiprocessing.Pool(9)
partial_getreads = partial(getreads, table=m, counts=count, genome=g)
pool.map(partial_getreads, range(1, 10))
pool.close()
pool.join()
for i in range(1, 10):
for j in range(0,101):
print count[i,j]
for i in range(1, 10):
for j in range(0,101):
print len(m[(i,j)])
The loops at the end will only print out 0.0 for each element in count and 0 for each list in m, so somehow I'm losing all the counts. If I print the counts within the getreads(...) function, I can see that the values are being increased. Conversely, printing len(table[ (length, gc) ]) in getreads(...) or len(m[(i,j)]) in the main body only results in 0.
You could also formulate your problem as a map-reduce problem, which would let you avoid sharing the data among multiple processes (and would likely speed up the computation). You would just need to return the resulting table and counts from the function (map) and combine the results from all the processes (reduce), as sketched below.
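For illustration, here is a minimal sketch of that map-reduce formulation (not the original code; it reuses gc_count from the question and merges the per-process results in the parent):
def getreads_mr(length, genome):
    # "map" step: build purely local structures; nothing is shared between processes
    table = defaultdict(list)
    counts = Counter()
    for start in range(len(genome)):
        gc = gc_count(genome[start:start+length])
        table[(length, gc)].append(start)
        counts[(length, gc)] += 1
    return table, counts

# "reduce" step, inside the __main__ block:
# results = pool.map(partial(getreads_mr, genome=g), range(1, 10))
# merged_table, merged_counts = defaultdict(list), Counter()
# for table, counts in results:
#     for key, positions in table.items():
#         merged_table[key].extend(positions)
#     merged_counts.update(counts)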
Going back to your original question...
At the bottom of the multiprocessing Managers documentation there is a relevant note about
modifications to mutable values or items in dict and list proxies: basically, you
need to re-assign the modified object to the container proxy.
l = table[ (length, gc) ]
l.append( (start) )
table[ (length, gc) ] = l
There is also a relevant Stack Overflow post about combining pool.map with Array.
Taking both into account you can do something like:
def getreads(length, table, genome):
genome_len = len(genome)
arr = numpy.frombuffer(mp_arr.get_obj())
counts = arr.reshape(10,101)
for start in range(0,genome_len):
gc = gc_count(genome[start:start+length])
l = table[ (length, gc) ]
l.append( (start) )
table[ (length, gc) ] = l
counts[length,gc] +=1
if __name__ == "__main__":
g = 'ACTACGACTACGACTACGCATCAGCACATACGCATACGCATCAACGACTACGCATACGACCATCAGATCACGACATCAGCATCAGCATCACAGCATCAGCATCAGCACTACAGCATCAGCATCAGCATCAG'
genome_len = len(g)
mgr = MyManager()
mgr.start()
m = mgr.defaultdict(list)
mp_arr = multiprocessing.Array(c.c_double, 10*101)
arr = numpy.frombuffer(mp_arr.get_obj())
count = arr.reshape(10,101)
pool = multiprocessing.Pool(9)
partial_getreads = partial(getreads, table=m, genome=g)
pool.map(partial_getreads, range(1, 10))
pool.close()
pool.join()
arr = numpy.frombuffer(mp_arr.get_obj())
count = arr.reshape(10,101)
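One caveat worth noting (my addition, not part of the original answer): in this version getreads reads mp_arr as a module-level global, which works when worker processes are started with fork (the default on Linux) because the children inherit it. On platforms that use spawn, such as Windows, you would need to hand the shared array to the workers explicitly, for example via a pool initializer; a hypothetical sketch:
def init_worker(shared_arr):
    # make the shared Array available as a global inside each worker process
    global mp_arr
    mp_arr = shared_arr

# pool = multiprocessing.Pool(9, initializer=init_worker, initargs=(mp_arr,))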
Related question:
Explanation of what I'm trying to accomplish:
I have a dataframe to iterate over, looking for some condition given a variable. I have a list of variables, and I iterate over this df using multiprocessing; I pop(0) every time a process starts.
Now I need to add one more level, but I can't understand how to do it.
Here is the code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import decimal
import multiprocessing
from multiprocessing import Pool, Manager
import itertools
#dataframe
columns = ['A', 'B', 'C', 'D']
data = np.array([np.random.randint(1, 10_000, size=750)]*4).T
df = pd.DataFrame(data, columns= columns)
print(df)
# Creating a list of tuples to apply a given function
a = np.arange(5,20, 1)
b = np.arange(1.01, 1.10, 0.01)
d = np.arange(0.95, 0.99, 0.01)
c = list(itertools.product(a, b, d))
list_of_tuples = []
dic = {}
for x in c:
dic[(x)] = x
for key, value in dic.items():
uno, due, tre = value[0], value[1], value[2]
list_of_tuples.append((uno, due, tre))
print(len(dic)) #checking size of dictionary
print(len(list_of_tuples), len(df)) #checking if size match
maximum = max(dic, key=dic.get) #maximum key inside dictionary
print(maximum, dic[maximum])
new_dic = {}
i = 1
#look_back_period = (len(df) // 10)
#print(look_back_period)
c = 0
"""chunks is the only way where I could use pool.map, it should be a list of list"""
chunks = [list_of_tuples[i::len(list_of_tuples)] for i in range(len(list_of_tuples))]
print(len(chunks[0]))
#this manager is needed to have every process append to the same Dict the result of the
# function that is given below
manager = Manager()
new_dic = manager.dict()
def multi_prova(list_of_tuples):
list_results = []
given1, given2, given3 = list_of_tuples.pop(0)
#sliding_window = df.iloc[0 : c + look_back_period, : ]
for row in df.itertuples():
result = (given1 / row.A).round(2)
list_results.append(result)
new_dic[str(given1)+', ' + str(given2)+', ' + str(given2)] = result
time1 = time.time()
if __name__ == "__main__":
try:
pool = Pool() # Make the Pool of workers
results = pool.map(multi_prova, chunks) # run multi_prova on each chunk in its own worker process
pool.close() #close the pool and wait for the work to finish
pool.join()
except:
print('error')
time2 = time.time()
print(time2 - time1)
#On my original code len(new_dic) matched len(dic), here is 750 vs 150, don't know why?!?!?!
print(new_dic)
print(len(new_dic))
Shouldn't len(new_dic) be equal to len(dic)?
There are 750 rows, and a result for every row should be 'appended' to the dictionary.
So there are two problems:
why len(new_dic) is not 750;
and, on top of that, I would like a sliding window that iterates over a slice of the dataframe and builds a dictionary of lists of lists with the results of every slice of the df while c + look_back_period < len(df) (roughly as sketched below).
Hope I was clear enough.
A big hug to anyone who can contribute.
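For what it's worth, a rough, hypothetical sketch of the sliding-window part described above (given1, the window size and the step are assumptions, not taken from the original code):
given1 = 5                            # example value, assumed fixed for the sketch
look_back_period = len(df) // 10      # assumed window size
sliding_results = {}
c = 0
while c + look_back_period < len(df):
    window = df.iloc[c : c + look_back_period, :]
    sliding_results[c] = [round(given1 / row.A, 2) for row in window.itertuples()]
    c += 1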
I am trying to write a process which does some computation on an Array filled with strings using the multiprocessing module. However, I am not able to get back the results. This is just a minimalist code example:
from multiprocessing import Process, Value, Array
from ctypes import c_char_p
# Process
def f(n, a):
for i in range(0,10):
a[i] = "test2".encode('latin-1')
if __name__ == '__main__':
# Set up array
arr = Array(c_char_p, range(10))
# Fill it with values
for i in range(0,10):
arr[i] = "test".encode('latin-1')
x = []
for i in range(0,10):
num = Value('d', float(i))
p = Process(target=f, args=(num, arr,))
x.append(p)
p.start()
for p in x:
p.join()
# This works
print(num.value)
# This will not give out anything
print(arr[0])
The last line won't print out anything, despite the array having been filled and altered.
What concerns me most is that when I change the code to simply use integers, it works:
from multiprocessing import Process, Value, Array
from ctypes import c_char_p
def f(n, a):
for i in range(0,10):
a[i] = 5
if __name__ == '__main__':
arr = Array('i',range(10))
for i in range(0,10):
arr[i] = 10
x = []
for i in range(0,10):
num = Value('d', float(i))
p = Process(target=f, args=(num, arr,))
x.append(p)
p.start()
for p in x:
p.join()
print(num.value)
print(arr[0])
My best guess is that this has something to do with the fact that the string array is actually filled with char arrays while an integer is just one value, but I do not know how to fix this.
This might answer your question. Basically, the string array arr holds an array of character pointers (c_char_p). When a process invokes the function f, the character pointers it stores are created in the context of that process but not in the other processes' contexts, so when the other processes (or the parent) try to access arr, they read invalid addresses.
In my case the following seems to be working fine:
from multiprocessing import Process, Value, Array
from ctypes import c_char_p
values = [b'test2438']*10
# Process
def f(n, a):
for i,s in enumerate(values):
a[i] = s
if __name__ == '__main__':
# Set up array
arr = Array(c_char_p, 10)
for i in range(0,10):
arr[i] = b'test'
# Fill it with values
x = []
for i in range(0,10):
num = Value('d', float(i))
p = Process(target=f, args=(num, arr,))
x.append(p)
p.start()
for p in x:
p.join()
# Now this prints the updated values
print(arr[:])
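If the goal is simply to share a small, fixed number of strings, a Manager-backed list sidesteps the pointer problem entirely. This is a minimal sketch of that alternative, not the original answer's approach:
from multiprocessing import Process, Manager

def fill(shared):
    for i in range(len(shared)):
        shared[i] = "test2"   # values are sent to the manager process, no raw pointers involved

if __name__ == '__main__':
    with Manager() as manager:
        shared = manager.list(["test"] * 10)
        p = Process(target=fill, args=(shared,))
        p.start()
        p.join()
        print(list(shared))   # prints ['test2', 'test2', ...]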
I know there are a lot of topics around similar problems (like How do I make processes able to write in an array of the main program?, Multiprocessing - Shared Array or Multiprocessing a loop of a function that writes to an array in python), but I just don't get it... so sorry for asking again.
I need to do some stuff with a huge array and want to speed up things by splitting it into blocks and running my function on those blocks, with each block being run in its own process. Problem is: the blocks are "cut" from one array and the result shall then be written into a new, common array. This is what I did so far (minimum working example; don't mind the array-shaping, this is necessary for my real-world case):
import numpy as np
import multiprocessing as mp
import time
def calcArray(array, blocksize, n_cores=1):
in_shape = (array.shape[0] * array.shape[1], array.shape[2])
input_array = array[:, :, :array.shape[2]].reshape(in_shape)
result_array = np.zeros(array.shape)
# blockwise loop
pix_count = array.size
for position in range(0, pix_count, blocksize):
if position + blocksize < array.shape[0] * array.shape[1]:
num = blocksize
else:
num = pix_count - position
result_part = input_array[position:position + num, :] * 2
result_array[position:position + num] = result_part
# finalize result
final_result = result_array.reshape(array.shape)
return final_result
if __name__ == '__main__':
start = time.time()
img = np.ones((4000, 4000, 4))
result = calcArray(img, blocksize=100, n_cores=4)
print 'Input:\n', img
print '\nOutput:\n', result
How can I now implement multiprocessing in a way that I set a number of cores and calcArray then assigns a process to each block until n_cores is reached?
With the much appreciated help of @Blownhither Ma, the code now looks like this:
import time, datetime
import numpy as np
from multiprocessing import Pool
def calculate(array):
return array * 2
if __name__ == '__main__':
start = time.time()
CORES = 4
BLOCKSIZE = 100
ARRAY = np.ones((4000, 4000, 4))
pool = Pool(processes=CORES)
in_shape = (ARRAY.shape[0] * ARRAY.shape[1], ARRAY.shape[2])
input_array = ARRAY[:, :, :ARRAY.shape[2]].reshape(in_shape)
result_array = np.zeros(input_array.shape)
# do it
pix_count = ARRAY.size
handles = []
for position in range(0, pix_count, BLOCKSIZE):
if position + BLOCKSIZE < ARRAY.shape[0] * ARRAY.shape[1]:
num = BLOCKSIZE
else:
num = pix_count - position
### OLD APPROACH WITH NO PARALLELIZATION ###
# part = calculate(input_array[position:position + num, :])
# result_array[position:position + num] = part
### NEW APPROACH WITH PARALLELIZATION ###
handle = pool.apply_async(func=calculate, args=(input_array[position:position + num, :],))
handles.append(handle)
# finalize result
### OLD APPROACH WITH NO PARALLELIZATION ###
# final_result = result_array.reshape(ARRAY.shape)
### NEW APPROACH WITH PARALLELIZATION ###
final_result = [h.get() for h in handles]
final_result = np.concatenate(final_result, axis=0)
print 'Done!\nDuration (hh:mm:ss): {duration}'.format(duration=datetime.timedelta(seconds=time.time() - start))
The code runs and really does start the number of processes I assigned, but it takes much, much longer than the old approach of just using the loop "as-is" (about 1 minute compared to 3 seconds). There must be something missing here.
The core functions here are pool.apply_async and handle.get().
I have recently been working on the same kind of function and found it useful to write standard utility functions. balanced_parallel applies a function fn to a matrix a in parallel, silently. assigned_parallel explicitly applies the function to each element of a list.
i. I split the array with np.array_split. You may use a block scheme instead.
ii. I use concatenation rather than assignment to an empty matrix when collecting results, so there is no shared memory.
from multiprocessing import cpu_count, Pool
def balanced_parallel(fn, a, processes=None, timeout=None):
""" apply fn on slice of a, return concatenated result """
if processes is None:
processes = cpu_count()
print('Parallel:\tstarting {} processes on input with shape {}'.format(processes, a.shape))
results = assigned_parallel(fn, np.array_split(a, processes), timeout=timeout, verbose=False)
return np.concatenate(results, 0)
def assigned_parallel(fn, l, processes=None, timeout=None, verbose=True):
""" apply fn on each element of l, return list of results """
if processes is None:
processes = min(cpu_count(), len(l))
pool = Pool(processes=processes)
if verbose:
print('Parallel:\tstarting {} processes on {} elements'.format(processes, len(l)))
# add jobs to the pool
handler = [pool.apply_async(fn, args=x if isinstance(x, tuple) else (x, )) for x in l]
# pool running, join all results
results = [handler[i].get(timeout=timeout) for i in range(len(handler))]
pool.close()
return results
In your case, fn would be
def _fn(matrix_part): return matrix_part * 2
result = balanced_parallel(_fn, img)
Follow-up:
Your loop should look like this to make parallelization happen.
handles = []
for position in range(0, pix_count, BLOCKSIZE):
if position + BLOCKSIZE < ARRAY.shape[0] * ARRAY.shape[1]:
num = BLOCKSIZE
else:
num = pix_count - position
handle = pool.apply_async(func=calculate, args=(input_array[position:position + num, :], ))
handles.append(handle)
# multiple handlers exist at this moment!! Don't `.get()` yet
results = [h.get() for h in handles]
results = np.concatenate(results, axis=0)
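A likely reason the parallel version is slower (my observation, not part of the original answer): with BLOCKSIZE=100 every apply_async call ships only a tiny slice, so pickling and inter-process communication dominate the cheap doubling. Fewer, larger chunks, e.g. one per worker as balanced_parallel does above, usually perform much better. A minimal sketch:
# one chunk per worker instead of thousands of tiny blocks
handles = [pool.apply_async(calculate, (chunk,)) for chunk in np.array_split(input_array, CORES)]
results = np.concatenate([h.get() for h in handles], axis=0)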
I created the following code:
import numpy as np
import scipy as sp

M=20000
sample_all = np.load('sample.npy')
sd = np.zeros(M)
chi_arr = np.zeros((M,4))
sigma_e = np.zeros((M,41632))
mean_sigma = np.zeros(M)
max_sigma = np.zeros(M)
min_sigma = np.zeros(M)
z = np.load('z_array.npy')
prof = np.load('profile_at_sources.npy')
L = np.load('luminosities.npy')
for k in range(M):
sd[k]=np.array(sp.std(sample_all[k,:]))
arr = np.genfromtxt('samples_fin1.txt').T[2:6]
arr_T = arr.T
chi_arr[k,:] = arr_T[k,:]
sigma_e[k,:]=np.sqrt(calc(z,prof,chi_arr[k,:], L))
mean_sigma[k] = np.array(sp.mean(sigma_e[k,:]))
max_sigma[k] = np.array(sigma_e[k,:].max())
min_sigma[k] = np.array(sigma_e[k,:].min())
where calc(...) is a function that calculates some stuff (it is not important for my question).
This loop takes, for M=20000, about 27 hours on my machine. That's too long... Is there a way to optimize it, maybe with vectorized operations instead of a for loop?
For me it's really simple to write loops; my head thinks in loops for this kind of code... It's my limitation... Could you help me? Thanks.
It seems to me that each of the k-th rows created in your various arrays is independent of every other iteration of your for loop and only dependent on rows of sigma_e... so you could parallelize it over many workers. Not sure if the code is 100% kosher, but you didn't provide a working example.
Note this only works if each k-th iteration is COMPLETELY independent of the (k-1)-th iteration.
import threading

M=20000
sample_all = np.load('sample.npy')
sd = np.zeros(M)
chi_arr = np.zeros((M,4))
sigma_e = np.zeros((M,41632))
mean_sigma = np.zeros(M)
max_sigma = np.zeros(M)
min_sigma = np.zeros(M)
z = np.load('z_array.npy')
prof = np.load('profile_at_sources.npy')
L = np.load('luminosities.npy')
workers = 100
arr = np.genfromtxt('samples_fin1.txt').T[2:6] # only works if this is really what you're doing to set arr.
def worker(k_start, k_end):
for k in range(k_start, k_end + 1):
sd[k]=np.array(sp.std(sample_all[k,:]))
arr_T = arr.T
chi_arr[k,:] = arr_T[k,:]
sigma_e[k,:]=np.sqrt(calc(z,prof,chi_arr[k,:], L))
mean_sigma[k] = np.array(sp.mean(sigma_e[k,:]))
max_sigma[k] = np.array(sigma_e[k,:].max())
min_sigma[k] = np.array(sigma_e[k,:].min())
threads = []
kstart = 0
for k in range(0, workers):
T = threading.Thread(target=worker, args=[k * M // workers, (1 + k) * M // workers - 1])
threads.append(T)
T.start()
for t in threads:
t.join()
Edited following comments:
It seems the culprit is CPython's Global Interpreter Lock, a mutex that prevents threads from executing Python bytecode in parallel. Use IronPython or Jython to step around this. Also, you can move the file read outside the loop if you're really just deserializing the same array from samples_fin1.txt.
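For completeness, a hedged sketch of how the same per-k work could instead be farmed out to separate processes with multiprocessing.Pool, which sidesteps the GIL. It assumes calc, sample_all, z, prof, L, arr and sp are available in the workers (e.g. inherited via fork) and that only per-row summaries are collected rather than the full sigma_e rows:
from multiprocessing import Pool

def compute_row(k):
    # all heavy work happens in the worker; only small summaries are returned
    row_sd = sp.std(sample_all[k, :])
    sigma_row = np.sqrt(calc(z, prof, arr.T[k, :], L))
    return k, row_sd, sigma_row.mean(), sigma_row.max(), sigma_row.min()

if __name__ == '__main__':
    with Pool() as pool:
        for k, row_sd, s_mean, s_max, s_min in pool.map(compute_row, range(M)):
            sd[k], mean_sigma[k], max_sigma[k], min_sigma[k] = row_sd, s_mean, s_max, s_min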
I wanted to parallelize df.corr() using the multiprocessing module in Python. I take one column and compute correlation values with all the remaining columns in one process, the second column with the remaining columns in another process, and so on, filling the upper triangle of the correlation matrix by stacking up the result rows from all the processes.
I took sample data of shape (678461, 210), tried both my parallelized method and df.corr(), and got running times of 214.40 s and 42.64 s respectively. So my parallelized method is taking more time.
Is there a way to improve this?
import multiprocessing as mp
import pandas as pd
import numpy as np
from time import *
def _correlation(args):
i, mat, mask = args
ac = mat[i]
arr = []
for j in range(len(mat)):
if i > j:
continue
bc = mat[j]
valid = mask[i] & mask[j]
if valid.sum() < 1:
c = np.nan
elif i == j:
c = 1.
elif not valid.all():
c = np.corrcoef(ac[valid], bc[valid])[0, 1]
else:
c = np.corrcoef(ac, bc)[0, 1]
arr.append((j, c))
return arr
def correlation_multi(df):
numeric_df = df._get_numeric_data()
cols = numeric_df.columns
mat = numeric_df.values
mat = pd.core.common._ensure_float64(mat).T
K = len(cols)
correl = np.empty((K, K), dtype=float)
mask = np.isfinite(mat)
pool = mp.Pool(processes=4)
ret_list = pool.map(_correlation, [(i, mat, mask) for i in range(len(mat))])
for i, arr in enumerate(ret_list):
for l in arr:
j = l[0]
c = l[1]
correl[i, j] = c
correl[j, i] = c
return pd.DataFrame(correl, index = cols, columns = cols)
if __name__ == '__main__':
noise = pd.DataFrame(np.random.randint(0,100,size=(100000, 50)))
noise2 = pd.DataFrame(np.random.randint(100,200,size=(100000, 50)))
df = pd.concat([noise, noise2], axis=1)
#Single process correlation
start = time()
s = df.corr()
print('Time taken: ',time()-start)
#Multi process correlation
start = time()
s1 = correlation_multi(df)
print('Time taken: ',time()-start)
The results from _correlation have to be moved from the worker processes to the process running the Pool via interprocess communication.
This means that the return data is pickled, sent to the other process, unpickled and added to the result list.
This takes time and is by nature a sequential process.
And map processes the returns in the order they were sent, IIRC. So if one iteration takes relatively long, other results might be stuck waiting. You could try using imap_unordered, which yields results as soon as they arrive (sketched below).
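For example, a minimal sketch of swapping map for imap_unordered; since results no longer arrive in submission order, _correlation is assumed here to return (i, arr) instead of just arr (a change to the original code):
# inside correlation_multi, replacing the pool.map call:
args = [(i, mat, mask) for i in range(len(mat))]
for i, row in pool.imap_unordered(_correlation, args):
    for j, c in row:
        correl[i, j] = c
        correl[j, i] = c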