The script below does the following:
1) makes a data frame with 200 rows
2) sorts the df into a list of objects, multiprocessing so that each core does a quater of the df into their own list
3)sticks the lists together into a big list and prints
problem = the list is empty it's almost like the get_car_terms function wasn't called in each process with no error message
import random
import psutil
import pandas as pd
import multiprocessing as mp
class car_term(): #object to go into list
def __init__(self, capcode, miles,months , cmprice, fmprice ):
self.capcode = capcode
self.months = months
self.miles = miles
self.cmprice = cmprice
self.fmprice = fmprice
df_final = pd.DataFrame({'capcode':[],'months':[],'mileage':[],'cm':[],'fm':[]})
for i in range (200): # making dataframe to get data from
df_final.append(pd.DataFrame({'capcode':[i],'months':[random.randint(1, 12)],'mileage':[random.randint(0, 10000)],'cm':[random.randint(5, 700)],'fm':[random.randint(15, 710)]}))
all_deals=[] # this is the list i want to put my objects into
def get_car_terms(data,mdb1,all_deals1):
all_deals1.append(car_term(mdb1['capcode'][data],mdb1['mileage'][data],mdb1['months'][data],mdb1['cm'][data],mdb1['fm'][data])) # i make the objects with the dataframe like this
all_deals1a=[] # individual lists for each proccessor
all_deals2a=[]
all_deals3a=[]
all_deals4a=[]
print("yo1")
if __name__ == "__main__":
n_cpus = psutil.cpu_count() # number of cpus
print(n_cpus) # i have 4 cpus
if df_final.shape[0]%n_cpus == 0:
for i in range(int(df_final.shape[0]/n_cpus)):
############# the problem is the get_car_terms function doesnt run below
p1 = mp.Proccess(target = get_car_terms,args = (i+((df_final.shape[0]/n_cpus)*1), df_final,all_deals1a)) # each cpu sorts a quater of the dataframe into my objects list
p2 = mp.Proccess(target = get_car_terms,args = (i+((df_final.shape[0]/n_cpus)*2), df_final,all_deals2a))
p3 = mp.Proccess(target = get_car_terms,args = (i+((df_final.shape[0]/n_cpus)*3), df_final,all_deals3a))
p4 = mp.Proccess(target = get_car_terms,args = (i+((df_final.shape[0]/n_cpus)*4), df_final,all_deals4a))
p1.start()
p2.start()
p3.start()
p4.start()
p1.end()
p2.end()
p3.end()
p4.end()
all_deals.append(all_deals1a) # group lists together
all_deals.append(all_deals2a)
all_deals.append(all_deals3a)
all_deals.append(all_deals4a)
print("we did it")
print(len(all_deals)) # this should have 200 of my objects in it... it doesnt
for i in all_deals:
print(i.capcode)
You called .end() right after .start(), so the multiprocessings did not get the time they need to work. I would recommend running time.sleep(1) between the starts and ends to give them the time they need.
Related
I am trying to figure out how to run a large problem on multiple cores. I am struggling with splitting a dataframe to the different processes.
I have a class as follows:
class Pergroup():
def __init__(self, groupid):
...
def process_datapoint(self, df_in, group):
...
My data is a time-series, and contains events that can be grouped using the groupid column. I create an instance of the class for each group as so:
for groupname in df_in['groupid'].unique():
instance_names.append(groupname)
holder = {name: Pergroup(name) for name in instance_names}
Now, for each timestamp in the dataframe, I want to call the corresponding instance (based on the group), and pass to it the dataframe at that timestamp.
I have tried the following, which does not seem to parallelize as I expect:
for val in range(0, len(df_in)):
current_group = df_in['groupid'][val]
current_df = df_in.ix[val]
with concurrent.futures.ProcessPoolExecutor() as executor:
executor.map(holder[current_group].process_datapoint, current_df, current_group)
I have also tried using this, which splits the df into its columns, when calling the instances:
Parallel(n_jobs=-1)(map(delayed(holder[current_group].process_datapoint), current_df, current_group))
How should I break up the dataframe such that I can still call the right instance with the right data? Basically, I am attempting to run a loop as below, with the last line running in parallel:
for val in range(0, len(df_in)):
current_group = df_in['groupid'][val]
current_df = df_in.ix[val]
holder[current_group].process_datapoint(current_df, current_group) #This call should be initiated in as many cores as possible.
Slightly different approach using pool
import pandas as pd
from multiprocessing import Pool
# First make sure each process has its own data
groups = df_in['groupid'].unique().values
data = [(group_id, holder[group_id], df_in.ix[group_id])
for group for groups]
# Prepare a function that can take this data as input
def help_func(current_group, holder, current_df):
return holder.process_datapoint(current_df, current_group)
# Run in parallel
with Pool(processes=4) as p:
p.map(help_func, data)
I had at some point a similar problem; as I can completely adapt to your question, I hope you can transpose and make this fit to your problem:
import multiprocessing
from joblib import Parallel, delayed
maxbatchsize = 10000 #limit the amount of data dispatched to each core
ncores = -1 #number of cores to use
data = pandas.DataFrame() #<<<- your dataframe
class DFconvoluter():
def __init__(self, myparam):
self.myparam = myparam
def __call__(self, df):
return df.apply(lamda row: row['somecolumn']*self.myparam)
nbatches = max(math.ceil(len(df)/maxbatchsize), ncores)
g = GenStrategicGroups( data['Key'].values, nbatches ) #a vector telling which row should be dispatched to which batch
#-- parallel part
def applyParallel(dfGrouped, func):
retLst = Parallel(n_jobs=ncores)(delayed(func)(group) for _, group in dfGrouped)
return pd.concat(retLst)
out = applyParallel(data.groupby(g), Dfconvoluter(42)))'
what is left is to write, how you'd like to group the batches together, for me this had to be done in a fashion so that rows, where values in the 'keys'-column where similar had to stay together:
def GenStrategicGroups(stratify, ngroups):
''' Generate a list of integers in a grouped sequence,
where grouped levels in stratifiy are preserved.
'''
g = []
nelpg = float(len(stratify)) / ngroups
prev_ = None
grouped_idx = 0
for i,s in enumerate(stratify):
if i > (grouped_idx+1)*nelpg:
if s != prev_:
grouped_idx += 1
g.append(grouped_idx)
prev_ = s
return g
I have the a script similar to this:
import random
import pandas as pd
FA = []
FB = []
Value = []
df = pd.DataFrame()
df_save = pd.DataFrame(index=['min','max'])
days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
numbers = list(range(24)) # FA.unique()
mix = '(pairwise combination of days and numbers, i.e. 0Monday,0Tuesday,...1Monday,1Tuesday,....)' 'I dont know how to do this combination btw'
def Calculus():
global min,max
min = df['Value'][boolean].min()
max = df['Value'][boolean].max()
for i in range(1000):
FA.append(random.randrange(0,23,1))
FB.append(random.choice(days))
Value.append(random.random())
df['FA'] = FA
df['FB'] = FB
df['FAB'] = df['FA'].astype(str) + df['FB'].astype(str)
df['Value'] = Value
mix_factor = df['FA'].astype(str) + df['FB'].astype(str)
for i in numbers:
boolean = df['FA'] == i
Calculus()
df_save[str(i)] = [min,max]
for i in days:
boolean = df['FB'] == i
Calculus()
df_save[str(i)] = [min,max]
for i in mix_factor.unique():
boolean = df['FAB'] == i
Calculus() #
df_save[str(i)] = [min,max]
My question is: there is another way to do the same but more time efficiently? My real data (df in this case) is a csv with millions of rows and this three loops are taking forever.
Maybe using 'apply' but I never have worked with it before.
Any insight will be very appreciate, thanks.
You could put all three loops into one, depending on what your exact code is. Is there a parameter for calculus? If not, putting them into one would allow you to have to run Calculus() less
The code below simulates a problem with multiprocessing I am facing.
There are two functions - f1 and f2 - which return (pandas) dataframes with n rows to a calling function run_fns(n). The two functions are to be run in parallel.
The code works fine for smaller vales of n (eg n <= 700), but freezes for larger values of n (say n >= 7000).
I have tried calling Queue using Queue([maxsize]) with various maxsize values including the default, 0, -1 and many other numbers small and large with no change in this behaviour.
Any solutions, workarounds or alternate approaches would be very welcome. And I have a secondary question : Do I really need to include
if __name__ == "__main__":
somewhere? If so where?
The code:
f1 returns n rows and 3 columns, f2 returns n rows and 5 columns. The dataframes are built with randomly generated integers.
import numpy as np
import pandas as pd
from multiprocessing import Process, Queue
def run_fns(n):
"""Run p1 and p2 in parallel, and get the returned dataframes."""
q1 = Queue()
q2 = Queue()
p1 = Process(target=f1, args=(n, q1))
p2 = Process(target=f2, args=(n, q2))
p1.start()
p2.start()
p1.join()
p2.join()
df1 = q1.get()
df2 = q2.get()
return df1, df2
def f1(n, q):
"""Create a dataframe with n rows and 3 columns."""
df = pd.DataFrame(np.random.randint(n * 3, size=(n, 3)))
q.put(df)
def f2(n, q):
"""Create a dataframe with n rows and 5 columns."""
df = pd.DataFrame(np.random.randint(n * 5, size=(n, 5)))
q.put(df)
You are facing a typical issue which is documented in the multiprocessing programming guidelines.
Bear in mind that a process that has put items in a queue will wait before terminating until all the buffered items are fed by the “feeder” thread to the underlying pipe. (The child process can call the Queue.cancel_join_thread method of the queue to avoid this behaviour.)
This means that whenever you use a queue you need to make sure that all items which have been put on the queue will eventually be removed before the process is joined. Otherwise you cannot be sure that processes which have put items on the queue will terminate.
You need to make sure you get the data before joining the processes.
# start the processes
p1.start()
p2.start()
# drain the queues
df1 = q1.get()
df2 = q2.get()
# then join the queues
p1.join()
p2.join()
return df1, df2
Manager Code..
import pandas as pd
import multiprocessing
import time
import MyDF
import WORKER
class Manager():
'Common base class for all Manager'
def __init__(self,Name):
print('Hello Manager..')
self.MDF=MyDF.MYDF(Name);
self.Arg=self.MDF.display();
self.WK=WORKER.Worker(self.Arg); MGR=Manager('event_wise_count') if __name__ == '__main__':
jobs = []
x=5;
for i in range(5):
x=10*i
print('Manager : ',i)
p = multiprocessing.Process(target=MGR.WK.DISPLAY)
jobs.append(p)
p.start()
time.sleep(x);
worker code...
import pandas as pd
import time
class Worker():
'Common base class for all Workers'
empCount = 0
def __init__(self,DF):
self.DF=DF;
print('Hello worker..',self.DF.count())
def DISPLAY(self):
self.DF=self.DF.head(10);
return self.DF
Hi I am trying to do multiprocessing. and i want to share a Data Frame address with all sub-processes.
So in above from Manager Class I am spawning 5 process , where each sub-process required to use Data Frame of worker class , expecting that each sub process will share reference of worker Data Frame. But unfortunately It is not happening..
Any Answer welcome..
Thanks In Advance,,.. please :)..
This answer suggests using Namespaces to share large objects between processes by reference.
Here's an example of an application where 4 different processes can read from the same DataFrame. (Note: you can't run this on an interactive console -- save this as a program.py and run it.)
import pandas as pd
from multiprocessing import Manager, Pool
def get_slice(namespace, column, rows):
'''Return the first `rows` rows from column `column in namespace.data'''
return namespace.data[column].head(rows)
if __name__ == '__main__':
# Create a namespace to place our DataFrame in it
manager = Manager()
namespace = manager.Namespace()
namespace.data = pd.DataFrame(pd.np.random.rand(1000, 10))
# Create 4 processes
pool = Pool(processes=2)
for column in namespace.data.columns:
# Each pool can access the same DataFrame object
result = pool.apply_async(get_slice, [namespace, column, 5])
print result._job, column, result.get().tolist()
While reading from the DataFrame is perfectly fine, it gets a little tricky if you want to write back to it. It's better to just stick to immutable objects unless you really need large write-able objects.
Sorry about the necromancy.
The issue is that the workers must have unique DataFrame instances. Almost all attempts to slice, or chunk, a Pandas DataFrame will result in aliases to the original DataFrame. These aliases will still result in resource contention between workers.
There a two things that should improve performance. The first would be to make sure that you are working with Pandas. Iterating row by row, with iloc or iterrows, fights against the design of DataFrames. Using a new-style class object and the apply a method is one option.
def get_example_df():
return pd.DataFrame(pd.np.random.randint(10, 100, size=(5,5)))
class Math(object):
def __init__(self):
self.summation = 0
def operation(self, row):
row_result = 0
for elem in row:
if elem % 2:
row_result += elem
else:
row_result += 1
self.summation += row_result
if row_result % 2:
return row_result
else:
return 1
def get_summation(self):
return self.summation
Custom = Math()
df = get_example_df()
df['new_col'] = df.apply(Custom.operation)
print Custom.get_summation()
The second option would be to read in, or generate, each DataFrame for each worker. Then recombine if desired.
workers = 5
df_list = [ get_example_df() ]*workers
...
# worker code
...
aggregated = pd.concat(df_list, axis=0)
However, multiprocessing will not be necessary in most cases. I've processed more than 6 million rows of data without multiprocessing in a reasonable amount of time (on a laptop).
Note: I did not time the above code and there is probably room for improvement.
I am reading in hundreds of HDF files and processing the data of each HDF seperately. However, this takes an awful amount of time, since it is working on one HDF file at a time. I just stumbled upon http://docs.python.org/library/multiprocessing.html and am now wondering how I can speed things up using multiprocessing.
So far, I came up with this:
import numpy as np
from multiprocessing import Pool
def myhdf(date):
ii = dates.index(date)
year = date[0:4]
month = date[4:6]
day = date[6:8]
rootdir = 'data/mydata/'
filename = 'no2track'+year+month+day
records = read_my_hdf(rootdir,filename)
if records.size:
results[ii] = np.mean(records)
dates = ['20080105','20080106','20080107','20080108','20080109']
results = np.zeros(len(dates))
pool = Pool(len(dates))
pool.map(myhdf,dates)
However, this is obviously not correct. Can you follow my chain of thought what I want to do? What do I need to change?
Try joblib for a friendlier multiprocessing wrapper:
from joblib import Parallel, delayed
def myhdf(date):
# do work
return np.mean(records)
results = Parallel(n_jobs=-1)(delayed(myhdf)(d) for d in dates)
The Pool classes map function is like the standard python libraries map function, you're guaranteed to get your results back in the order that you put them in. Knowing that, the only other trick is that you need to return results in a consistant manner, and the filter them afterwards.
import numpy as np
from multiprocessing import Pool
def myhdf(date):
year = date[0:4]
month = date[4:6]
day = date[6:8]
rootdir = 'data/mydata/'
filename = 'no2track'+year+month+day
records = read_my_hdf(rootdir,filename)
if records.size:
return np.mean(records)
dates = ['20080105','20080106','20080107','20080108','20080109']
pool = Pool(len(dates))
results = pool.map(myhdf,dates)
results = [ result for result in results if result ]
results = np.array(results)
If you really do want results as soon as they are available you can use imap_unordered