Python multiprocessing on function with return value

I am trying to do principal component analysis on each element of an array. Each element of the array is a matrix, I have 500 such arrays, and each array contains around 50 matrices. Overall this is taking too much time, so I want to speed up the process using multiprocessing.
I've written the following code:
from multiprocessing import Pool
import numpy
from numpy import mean, cov, dot, linalg
from sklearn.preprocessing import normalize   # normalize() in func1 below presumably comes from scikit-learn

def princomp(A):
    A = A.T
    # computing eigenvalues and eigenvectors of covariance matrix
    M = (A - mean(A.T, axis=1)).T          # subtract the mean (along columns)
    [latent, coeff] = linalg.eig(cov(M))   # attention: not always sorted
    score = dot(coeff.T, M)                # projection of the data in the new space
    top_eigen = [i[0] for i in sorted(enumerate(latent), key=lambda x: x[1])]
    #output = random.rand(990,3)
    score = score.real
    output = []
    for a in range(0, 128):
        output += [score[top_eigen[a]]]
    return numpy.array(output)
def func1(...):
    ...
    ...
    for i in range(0, nframes):   # nframes is around 50
        fc_array[i] = normalize(fc_array[i], axis=1, norm='l1')   # fc_array[i] is a 2D numpy array
        pool = Pool()
        #ans_array[i] = princomp(fc_array[i])   # normal principal component analysis
        result = pool.apply_async(princomp, [fc_array[i]])   # principal component analysis using multiprocessing
        print "Frame No.: ", i
        ans_array[i] = result.get()
        pool.close()
        pool.join()
    return ans_array
It gets stuck inside the for loop of func1: the print statement is never executed and the program just hangs. Am I even doing multiprocessing correctly?
I hope I'll get some help on this.
Thanks in advance.
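For reference, the usual pattern is to create the Pool once, submit all the frames, and only call get() after everything has been submitted; calling result.get() immediately after each apply_async waits for that frame to finish, so the frames end up running one after another. A minimal sketch using the names from the question (princomp, normalize, fc_array, ans_array, nframes):
from multiprocessing import Pool

def func1_parallel(fc_array, ans_array, nframes):
    pool = Pool()
    results = []
    for i in range(nframes):
        fc_array[i] = normalize(fc_array[i], axis=1, norm='l1')
        results.append(pool.apply_async(princomp, [fc_array[i]]))   # submit, don't wait yet
    pool.close()
    pool.join()                                                     # wait for all frames at once
    for i in range(nframes):
        ans_array[i] = results[i].get()                             # results are ready now
    return ans_array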

Related

Python Multiprocessing cannot Join for Large Data Set

I am trying to compute the cosine similarity of a 260774x96 data matrix. I was using the full-size DataFrame to compute the cosine similarity with this code:
similarities = cdist(Dataframe, Dataframe, metric='cosine')
However, the Jupyter Notebook ran out of memory (64 GB) and crashed the kernel. So I decided to compute a single row against the DataFrame at a time, and then sort, trim and save the result in each loop. The issue is that it takes too much time, about 12 hours.
So I applied multiprocessing to speed it up. For small data it works (a single row computed against a 500-row DataFrame). However, when computing a single row against the full-size DataFrame, the processes run and put data into the queue, but join() just does not return. The main function keeps running even though the child processes ended with no error. If I print the queue in another cell, it has all the data.
Another problem is that the input queue cannot take the full-size DataFrame; it shows a Full error.
I don't know why the program gets stuck on the join() step, or whether there is a more practical way to do this computation in parallel.
from multiprocess import Lock, Process, Queue, current_process, Array
from time import time
import queue  # imported for using the queue.Empty exception
import numpy as np
from scipy.spatial.distance import cdist
from scipy.spatial import distance
# from cos_process import cos_process
from sklearn.metrics import pairwise_distances

def cos_process(idx, result_queue, index_queue, df_data_only_numpy):
    print(f"process start {current_process().name}")
    # shrink the data size so that the multiprocessing join can work:
    # df_data_only_numpy = df_data_only_numpy[:500]
    while True:
        try:
            task_index_list = index_queue.get_nowait()
        except queue.Empty:
            break
        else:
            cosine_dict = {}
            for i in task_index_list:
                similarities = cdist([df_data_only_numpy[i]], df_data_only_numpy, metric='cosine')
                sorted_save_data = np.sort(similarities[0])[:20]    # the 20 smallest cosine distances, sorted
                sorted_save_key = np.argsort(similarities[0])[:20]  # the indices of those distances
                # make a dictionary {index: {index: cosine, index: cosine}, ...}
                cosine_dict[i] = {int(k): float(d) if d == d else d
                                  for k, d in zip(sorted_save_key, sorted_save_data)}
            # print(cosine_dict)
            try:
                # put the dictionary data on the result queue
                result_queue.put_nowait(cosine_dict)
            except queue.Full:
                print("queue full in process")
    print(f"end process {current_process().name}")
    return True

def main():
    number_of_processes = 10     # create 10 worker processes
    process_compute_time = 1000  # each chunk covers 1000 rows
    df_data_only_numpy = df_data_only.to_numpy()  # df_data_only is the DataFrame defined elsewhere
    index_queue = Queue()   # row indices of the DataFrame
    result_queue = Queue()  # stores the similarity output
    processes = []
    # for idx in df_data_only.T:
    # only test 1000 rows in this case; the DataFrame has 260774 rows and 96 columns
    for idx in range(0, 1000, process_compute_time):
        try:
            index_queue.put_nowait(list(range(idx, idx + process_compute_time)))
        except queue.Full:
            print("full")
    print("creating process")
    # creating processes
    for w in range(number_of_processes):
        p = Process(target=cos_process, args=(w, result_queue, index_queue, df_data_only_numpy))
        processes.append(p)
        p.start()
    # completing processes
    for p in processes:
        print("before join")
        p.join()
        print("finish join")
    # print the output
    while not result_queue.empty():
        print(result_queue.get())
    return True

if __name__ == '__main__':
    main()
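A known pitfall applies here: a process that has put items on a multiprocessing Queue will not terminate until all of its buffered data has been flushed to the underlying pipe and consumed (see "Joining processes that use queues" in the Python documentation), so with a full-size DataFrame the parent has to drain result_queue before calling join(). A sketch of how the end of main() above could be rearranged, assuming each index chunk produces exactly one put on the result queue as in the code shown:
    # drain the result queue first; each chunk put on index_queue yields one dict
    n_chunks = len(range(0, 1000, process_compute_time))
    results = {}
    for _ in range(n_chunks):
        results.update(result_queue.get())   # blocks until a worker delivers its chunk
    # only now can every worker flush its queue buffer and exit, so join() returns
    for p in processes:
        p.join()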

Speed up this for loop in python for custom function on array

I have a for loop that calculates the sum of the outputs of a custom function called calculate_some, which takes tuples as input and returns a single value. I want to speed up this code as it goes through 1000+ values.
Can vectorization speed this up? What are my options?
sum_calculate = 0
for i in range(0, len(GT_ndarray)):
    sum_calculate = sum_calculate + calculate_some(Candidates[i][0], Candidates[i][1])
print(sum_calculate)
The code for calculate_some is this
def calculate_some(arr1, arr2):
    some = arr1[0]*arr2[0] + arr1[1] + arr2[1] + arr1[2]*arr2[2]
    return some
You can use multiprocessing.Pool, for example:
import multiprocessing as mp

def worker(i):
    return calculate_some(Candidates[i][0], Candidates[i][1])

pool = mp.Pool(mp.cpu_count() - 1)
sum_calculate = sum(pool.map(worker, range(len(GT_ndarray))))
pool.close()
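Since the question also asks about vectorization: if Candidates can be stacked into a regular NumPy array, the loop disappears entirely. A sketch, assuming each Candidates[i][0] and Candidates[i][1] is a length-3 numeric sequence:
import numpy as np

# stack the first len(GT_ndarray) candidate pairs into an (N, 2, 3) float array
C = np.asarray(Candidates[:len(GT_ndarray)], dtype=float)
a, b = C[:, 0, :], C[:, 1, :]
sum_calculate = np.sum(a[:, 0]*b[:, 0] + a[:, 1] + b[:, 1] + a[:, 2]*b[:, 2])
print(sum_calculate)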

Python, multiprocessing.pool took about the same amount of time as a for loop

I am trying to use python to process some large data sets from several data stations. My idea is to use multiprocessing.pool to assign each CPU the data from a single station, since the data from each station are independent from each other.
However, it seems that my calculation time does not really go down compared to a single for loop.
Here is part of my code:
# function calculating the square of each data point and taking the cumulative sum
def get_cumdd(data):
    #if not isinstance(data, list):
    #    data = [data]
    dd = np.zeros((len(data), 1))
    cum_dd = np.zeros((len(data), 1))
    for i in range(len(data)):
        dd[i] = data[i]**2
    cum_dd = np.cumsum(dd)
    return cum_dd

# parallelization between stations
if __name__ == '__main__':
    n_proc = np.min([mp.cpu_count(), nstation])  # nstation = 10
    p = mp.Pool(processes=int(n_proc))
    result = p.map(get_cumdd, data)
    p.close()
    p.join()
    cum_dd = np.zeros((nstation, len(data[0])))
    for i in range(nstation):
        cum_dd[i] = result[i].T
I do not use chunksize because cum_dd takes the summation of all the previous data^2. I am essentially dividing my data into 10 equal pieces because there is no communication between processes. I wonder if I missed anything here.
My data has 2 million points per station per day, and I need to process years of data.
This doesn't address your multiprocessing question directly, but (as Ugur MULUK and Iguananaut mention) I think your get_cumdd function is inefficient. Numpy provides np.cumsum. Reimplementing your function I get more than 1000x speedup for an array with 10k elements. With 100k elements it's about 7000x faster. With 2M elements I didn't bother to let it finish.
import numpy as np

# your function
def cum_dd(data):
    #if not isinstance(data, list):
    #    data = [data]
    dd = np.zeros((len(data), 1))
    cum_dd = np.zeros((len(data), 1))
    for i in range(len(data)):
        dd[i] = data[i]**2
        cum_dd[i] = np.sum(dd[0:i])
    return cum_dd

# numpy implementation
def cum_dd2(data):
    # adding an axis to match the shape of the output of your cum_dd function
    return np.cumsum(data**2)[:, np.newaxis]
For 2e6 points this implementation takes ~11ms on my computer. I think that's about 30 seconds for 10 years of data for a single station.
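A rough way to reproduce that comparison with timeit (a sketch; the exact numbers depend on the machine):
import numpy as np
from timeit import timeit

data = np.random.rand(10000)
print(timeit(lambda: cum_dd(data), number=1))            # loop version, seconds per call
print(timeit(lambda: cum_dd2(data), number=100) / 100)   # cumsum version, seconds per call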
NumPy already does efficient array processing on the CPU: its inner loops run in compiled code and can use Single Instruction Multiple Data (SIMD) instructions.
By pooling the computations manually you are adding overhead; you will usually gain more by vectorizing the explicit for loop.
See the video below for more information about vectorization.
https://www.youtube.com/watch?v=qsIrQi0fzbY
If you are having difficulties, I will be around for updates or help. Good luck!
Thanks a lot for all the comments and answers! After applying vectorization and pooling, I reduced the calculation time from one hour to 3 seconds (10 × 1.7 million data points). Here is my code in case anyone is interested:
def get_cumdd(data):
    #if not isinstance(data, list):
    #    data = [data]
    dd = np.zeros((len(data), 1))
    for i in range(len(data)):
        dd[i] = data[i]**2
    cum_dd = np.cumsum(dd)
    return dd, cum_dd

if __name__ == '__main__':
    n_proc = np.min([mp.cpu_count(), nstation])
    p = mp.Pool(processes=int(n_proc))
    result = p.map(CC.get_cumdd, d)
    p.close()
    p.join()
I'm not using a shared-memory Queue because all my processes are independent of each other.
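For completeness, the remaining per-element loop in the final get_cumdd can also be dropped; a sketch that returns the same dd, cum_dd pair:
def get_cumdd_vec(data):
    # square all points at once, then take the cumulative sum
    dd = np.asarray(data, dtype=float).reshape(-1, 1)**2
    return dd, np.cumsum(dd)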

How do I vectorize the following loop in Numpy?

"""Some simulations to predict the future portfolio value based on past distribution. x is
a numpy array that contains past returns.The interpolated_returns are the returns
generated from the cdf of the past returns to simulate future returns. The portfolio
starts with a value of 100. portfolio_value is filled up progressively as
the program goes through every loop. The value is multiplied by the returns in that
period and a dollar is removed."""
portfolio_final = []
for i in range(10000):
    portfolio_value = [100]
    rand_values = np.random.rand(600)
    interpolated_returns = np.interp(rand_values, cdf_values, x)
    interpolated_returns = np.add(interpolated_returns, 1)
    for j in range(1, len(interpolated_returns)+1):
        portfolio_value.append(interpolated_returns[j-1]*portfolio_value[j-1])
        portfolio_value[j] = portfolio_value[j]-1
    portfolio_final.append(portfolio_value[-1])
print(np.mean(portfolio_final))
I couldn't find a way to write this code using numpy. I was having a look at iterations using nditer but I was unable to move ahead with that.
I guess the easiest way to figure out how to vectorize this is to look at the equations that govern the evolution and see how your portfolio actually iterates, finding patterns that can be vectorized, instead of trying to vectorize the code you already have. You would notice that cumprod appears quite often in the iterations.
Nevertheless you can find the semi-vectorized code below. I included your code as well such that you can compare the results. I also included a simple loop version of your code which is much easier to read and translatable into mathematical equations. So if you share this code with somebody else I would definitely use the simple loop option. If you want some fancy-pants vectorizing you can use the vector version. In case you need to keep track of your single steps you can also add an array to the simple loop option and append the pv at every step.
Hope that helps.
Edit: I have not tested anything for speed. That's something you can easily do yourself with timeit.
import numpy as np
from scipy.special import erf

# Prepare a simple return model - normally distributed with mu & sigma = 0.01
x = np.linspace(-10, 10, 100)
cdf_values = 0.5*(1 + erf((x - 0.01)/(0.01*np.sqrt(2))))

# Prepare the setup such that every code snippet uses the same number of steps
# and the same random numbers
nSteps = 600
nIterations = 1
rnd = np.random.rand(nSteps)

# Your code - gives the (supposedly) correct results
portfolio_final = []
for i in range(nIterations):
    portfolio_value = [100]
    rand_values = rnd
    interpolated_returns = np.interp(rand_values, cdf_values, x)
    interpolated_returns = np.add(interpolated_returns, 1)
    for j in range(1, len(interpolated_returns)+1):
        portfolio_value.append(interpolated_returns[j-1]*portfolio_value[j-1])
        portfolio_value[j] = portfolio_value[j]-1
    portfolio_final.append(portfolio_value[-1])
print(np.mean(portfolio_final))

# Using vectors
portfolio_final = []
for i in range(nIterations):
    portfolio_values = np.ones(nSteps)*100.0
    rcp = np.cumprod(np.interp(rnd, cdf_values, x) + 1)
    portfolio_values = rcp * (portfolio_values - np.cumsum(1.0/rcp))
    portfolio_final.append(portfolio_values[-1])
print(np.mean(portfolio_final))

# Simple loop
portfolio_final = []
for i in range(nIterations):
    pv = 100
    rets = np.interp(rnd, cdf_values, x) + 1
    for i in range(nSteps):
        pv = pv * rets[i] - 1
    portfolio_final.append(pv)
print(np.mean(portfolio_final))
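For reference, the recurrence all three snippets implement, with p_0 = 100 and r_j the interpolated returns plus one, and the closed form the vector version exploits, is
$$p_j = r_j\,p_{j-1} - 1 \quad\Longrightarrow\quad p_n = \Big(\prod_{k=1}^{n} r_k\Big)\Big(p_0 - \sum_{j=1}^{n}\frac{1}{\prod_{k=1}^{j} r_k}\Big),$$
which is exactly what rcp * (portfolio_values - np.cumsum(1.0/rcp)) computes, with rcp holding the cumulative products of the returns.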
Forget about np.nditer. It does not improve the speed of iterations. Only use it if you intend to go on and use the C version (via cython).
I'm puzzled about that inner loop. What is it supposed to be doing special? Why the loop?
In tests with simulated values these 2 blocks of code produce the same thing:
interpolated_returns = np.add(interpolated_returns, 1)
for j in range(1, len(interpolated_returns)+1):
    portfolio_value.append(interpolated_returns[j-1]*portfolio[j-1])
    portfolio_value[j] = portfolio_value[j]-1
interpolated_returns = (interpolated_returns + 1)*portfolio - 1
portfolio_value = portfolio_value + interpolated_returns.tolist()
I am assuming that interpolated_returns and portfolio are 1d arrays of the same length.
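A quick way to check that claim with made-up 1d arrays:
import numpy as np

interpolated_returns = np.random.rand(5)
portfolio = np.random.rand(5)

# loop version: append the product, then subtract 1 from the appended element
looped = []
shifted = np.add(interpolated_returns, 1)
for j in range(1, len(shifted) + 1):
    looped.append(shifted[j-1]*portfolio[j-1])
    looped[j-1] = looped[j-1] - 1

# vector version
vectorised = (interpolated_returns + 1)*portfolio - 1

print(np.allclose(looped, vectorised))   # True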

Speed up Python eval when reading and evaluating list of equations from file

I have put together a simple Python script which reads a large list of algebraic expressions from a text file on separate lines, evaluates the mathematics on each line and puts it into a numpy array. The eigenvalues of this matrix are then found. The parameters A,B,C will then be changed and the program run again, hence a function is used to achieve this.
Some of these text files will have millions of lines of equations, so after profiling the code I found that the eval command accounts for approximately 99% of the execution time. I am aware of the dangers of using eval but this code will only ever be used by myself. All other parts of the code are fast, except the call to eval.
Here is the code where mat_size is set to 500 which represents a 500*500 array meaning 250,000 lines of equations are being read in from the file. I cannot provide the file as it is ~ 0.5GB in size, but have provided an example of what it looks like below and it only uses basic mathematical operations.
import numpy as np
from numpy import *
from scipy.linalg import eigvalsh

mat_size = 500

# Read the file line by line
with open("test_file.txt", 'r') as f:
    lines = f.readlines()

# Function to evaluate the maths and build the numpy array
def my_func(A, B, C):
    lst = []
    for i in lines:
        # Strip the \n
        new = eval(i.rstrip())
        lst.append(new)
    # Build the numpy array
    AA = np.array(lst, dtype=np.float64)
    # Resize it to mat_size
    matt = np.resize(AA, (mat_size, mat_size))
    return matt

# Function to find eigenvalues of the matrix
def optimise(x):
    A, B, C = x
    test = my_func(A, B, C)
    ev = -1*eigvalsh(test)
    return ev[-(1)]

# Define what A,B,C are; this can be changed each time the program is run
x0 = [7.65, 5.38, 4.00]

# Print result
print(optimise(x0))
A few lines of an example input text file: (mat_size can be changed to 2 to run this file)
.5/A**3*B**5+C
35.5/A**3*B**5+3*C
.8/C**3*A**5+C**9
.5/A*3+B**5-C/45
I am aware that eval is usually bad practice and slow, so I looked for other means of achieving a speed-up. I tried the methods outlined here but none of them appeared to work. I also tried applying sympy to the problem but that caused a massive slowdown. What is a better way of going about this problem?
EDIT
From the suggestion to use numexpr instead, I have come across an issue where it grinds to a halt compared to the standard eval. For some instances the matrix elements contain quite a lot of algebraic expressions. Here is an example of just one matrix element, i.e one of the equations in the file (it contains a few more terms not defined in the code above, but can be easily defined at top of the code):
-71*A**3/(A+B)**7-61*B**3/(A+B)**7-3/2/B**2/C**2*A**6/(A+B)**7-7/4/B**3/m3*A**6/(A+B)**7-49/4/B**2/C*A**6/(A+B)**7+363/C*A**3/(A+B)**7*z3+451*B**3/C/(A+B)**7*z3-3/2*B**5/C/A**2/(A+B)**7-3/4*B**7/C/A**3/(A+B)**7-1/B/C**3*A**6/(A+B)**7-3/2/B**2/C*A**5/(A+B)**7-107/2/C/m3*A**4/(A+B)**7-21/2/B/C*A**4/(A+B)**7-25/2*B/C*A**2/(A+B)**7-153/2*B**2/C*A/(A+B)**7-5/2*B**4/C/m3/(A+B)**7-B**6/C**3/A/(A+B)**7-21/2*B**4/C/A/(A+B)**7-7/4/B**3/C*A**7/(A+B)**7+86/C**2*A**4/(A+B)**7*z3+90*B**4/C**2/(A+B)**7*z3-1/4*B**6/m3/A**3/(A+B)**7-149/4/B/C*A**5/(A+B)**7-65*B**2/C**3*A**4/(A+B)**7-241/2*B/C**2*A**4/(A+B)**7-38*B**3/C**3*A**3/(A+B)**7+19*B**2/C**2*A**3/(A+B)**7-181*B/C*A**3/(A+B)**7-47*B**4/C**3*A**2/(A+B)**7+19*B**3/C**2*A**2/(A+B)**7+362*B**2/C*A**2/(A+B)**7-43*B**5/C**3*A/(A+B)**7-241/2*B**4/C**2*A/(A+B)**7-272*B**3/C*A/(A+B)**7-25/4*B**6/C**2/A/(A+B)**7-77/4*B**5/C/A/(A+B)**7-3/4*B**7/C**2/A**2/(A+B)**7-23/4*B**6/C/A**2/(A+B)**7-11/B/C**2*A**5/(A+B)**7-13/B**2/m3*A**5/(A+B)**7-25*B/C**3*A**4/(A+B)**7-169/4/B/m3*A**4/(A+B)**7-27*B**2/C**3*A**3/(A+B)**7-47*B/C**2*A**3/(A+B)**7-27*B**3/C**3*A**2/(A+B)**7-38*B**2/C**2*A**2/(A+B)**7-131/4*B/m3*A**2/(A+B)**7-25*B**4/C**3*A/(A+B)**7-65*B**3/C**2*A/(A+B)**7-303/4*B**2/m3*A/(A+B)**7-5*B**5/C**2/A/(A+B)**7-49/4*B**4/m3/A/(A+B)**7-1/2*B**6/C**2/A**2/(A+B)**7-5/2*B**5/m3/A**2/(A+B)**7-1/2/B/C**3*A**7/(A+B)**7-3/4/B**2/C**2*A**7/(A+B)**7-25/4/B/C**2*A**6/(A+B)**7-45*B/C**3*A**5/(A+B)**7-3/2*B**7/C**3/A/(A+B)**7-123/2/C*A**4/(A+B)**7-37/B*A**4/(A+B)**7-53/2*B*A**2/(A+B)**7-75/2*B**2*A/(A+B)**7-11*B**6/C**3/(A+B)**7-39/2*B**5/C**2/(A+B)**7-53/2*B**4/C/(A+B)**7-7*B**4/A/(A+B)**7-7/4*B**5/A**2/(A+B)**7-1/4*B**6/A**3/(A+B)**7-11/C**3*A**5/(A+B)**7-43/C**2*A**4/(A+B)**7-363/4/m3*A**3/(A+B)**7-11*B**5/C**3/(A+B)**7-45*B**4/C**2/(A+B)**7-451/4*B**3/m3/(A+B)**7-5/C**3*A**6/(A+B)**7-39/2/C**2*A**5/(A+B)**7-49/4/B**2*A**5/(A+B)**7-7/4/B**3*A**6/(A+B)**7-79/2/C*A**3/(A+B)**7-207/2*B**3/C/(A+B)**7+22/B/C**2*A**5/(A+B)**7*z3+94*B/C**2*A**3/(A+B)**7*z3+76*B**2/C**2*A**2/(A+B)**7*z3+130*B**3/C**2*A/(A+B)**7*z3+10*B**5/C**2/A/(A+B)**7*z3+B**6/C**2/A**2/(A+B)**7*z3+3/B**2/C**2*A**6/(A+B)**7*z3+7/B**3/C*A**6/(A+B)**7*z3+52/B**2/C*A**5/(A+B)**7*z3+169/B/C*A**4/(A+B)**7*z3+131*B/C*A**2/(A+B)**7*z3+303*B**2/C*A/(A+B)**7*z3+49*B**4/C/A/(A+B)**7*z3+10*B**5/C/A**2/(A+B)**7*z3+B**6/C/A**3/(A+B)**7*z3-3/4*B**7/C/m3/A**3/(A+B)**7-7/4/B**3/C/m3*A**7/(A+B)**7-49/4/B**2/C/m3*A**6/(A+B)**7-149/4/B/C/m3*A**5/(A+B)**7-293*B/C/m3*A**3/(A+B)**7+778*B**2/C/m3*A**2/(A+B)**7-480*B**3/C/m3*A/(A+B)**7-77/4*B**5/C/m3/A/(A+B)**7-23/4*B**6/C/m3/A**2/(A+B)**7
numexpr completely chokes when the matrix elements are of this form, whereas eval evaluates them instantaneously. For just a 10*10 matrix (100 equations in the file), numexpr takes about 78 seconds to process the file, whereas eval takes 0.01 seconds. Profiling the code that uses numexpr reveals that numexpr's getExprNames and precompile functions are the cause of the issue, with precompile taking 73.5 seconds of the total time and getExprNames taking 3.5 seconds. Why would precompile cause such a bottleneck in this particular calculation, along with getExprNames? Is this module just not well suited to long algebraic expressions?
I found a way to speed eval() up in this particular instance by making use of the multiprocessing library. I read the file in as usual, but then break the list into equal-sized sub-lists which can be processed separately on different CPUs, and the evaluated sub-lists are recombined at the end. This offers a nice speedup over the original method. I am sure the code below can be simplified/optimised, but for now it works (for instance, what if there is a prime number of list elements? That would mean unequal lists). Some rough benchmarks show it is ~3 times faster using the 4 CPUs of my laptop. Here is the code:
from multiprocessing import Process, Queue

with open("test.txt", 'r') as h:
    linesHH = h.readlines()

# Get the number of list elements
size = len(linesHH)

# Break apart the list into the desired number of chunks
chunk_size = size/4
chunks = [linesHH[x:x+chunk_size] for x in xrange(0, len(linesHH), chunk_size)]

# Declare variables
A = 0.1
B = 2
C = 2.1
m3 = 1
z3 = 2

# Declare all the functions that process the sub-lists
def my_funcHH1(A, B, C, que):  # add an argument to the function for assigning a queue to each chunk function
    lstHH1 = []
    for i in chunks[0]:
        HH1 = eval(i)
        lstHH1.append(HH1)
    que.put(lstHH1)

def my_funcHH2(A, B, C, que):
    lstHH2 = []
    for i in chunks[1]:
        HH2 = eval(i)
        lstHH2.append(HH2)
    que.put(lstHH2)

def my_funcHH3(A, B, C, que):
    lstHH3 = []
    for i in chunks[2]:
        HH3 = eval(i)
        lstHH3.append(HH3)
    que.put(lstHH3)

def my_funcHH4(A, B, C, que):
    lstHH4 = []
    for i in chunks[3]:
        HH4 = eval(i)
        lstHH4.append(HH4)
    que.put(lstHH4)

queue1 = Queue()
queue2 = Queue()
queue3 = Queue()
queue4 = Queue()

# Declare the processes
p1 = Process(target=my_funcHH1, args=(A, B, C, queue1))
p2 = Process(target=my_funcHH2, args=(A, B, C, queue2))
p3 = Process(target=my_funcHH3, args=(A, B, C, queue3))
p4 = Process(target=my_funcHH4, args=(A, B, C, queue4))

# Start them
p1.start()
p2.start()
p3.start()
p4.start()

# Get the results from each queue before joining, then join
HH1 = queue1.get()
HH2 = queue2.get()
HH3 = queue3.get()
HH4 = queue4.get()
p1.join()
p2.join()
p3.join()
p4.join()

# Obtain the final result by combining the lists again
mergedlist = HH1 + HH2 + HH3 + HH4
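One way the four near-identical worker functions could be collapsed is with a Pool and a single chunking expression. This is only a sketch in Python 3 syntax, assuming the A, B, C, m3, z3 definitions above and a fork-based process start (so the workers inherit them); the ceiling division also handles the "prime number of list elements" case by simply making the last chunk shorter:
from multiprocessing import Pool

def eval_chunk(chunk):
    # A, B, C, m3, z3 are the module-level values defined above,
    # inherited by the worker processes on fork
    return [eval(expr) for expr in chunk]

if __name__ == '__main__':
    with open("test.txt", 'r') as h:
        linesHH = h.readlines()
    n_workers = 4
    chunk_size = -(-len(linesHH) // n_workers)   # ceiling division; last chunk may be shorter
    chunks = [linesHH[x:x + chunk_size] for x in range(0, len(linesHH), chunk_size)]
    pool = Pool(n_workers)
    parts = pool.map(eval_chunk, chunks)         # one list of evaluated values per chunk
    pool.close()
    pool.join()
    mergedlist = [value for part in parts for value in part]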
