How to optimize this pandas iterable - python

I have the following method in which I am eliminating overlapping intervals in a dataframe based on a set of hierarchical rules:
def disambiguate(arg):
    arg['length'] = (arg.end - arg.begin).abs()
    df = arg[['begin', 'end', 'note_id', 'score', 'length']].copy()
    data = []
    out = pd.DataFrame()
    for row in df.itertuples():
        test = df[df['note_id'] == row.note_id].copy()
        # get overlapping intervals:
        # https://stackoverflow.com/questions/58192068/is-it-possible-to-use-pandas-overlap-in-a-dataframe
        iix = pd.IntervalIndex.from_arrays(test.begin.apply(pd.to_numeric), test.end.apply(pd.to_numeric), closed='neither')
        span_range = pd.Interval(row.begin, row.end)
        fx = test[iix.overlaps(span_range)].copy()
        maxLength = fx['length'].max()
        minLength = fx['length'].min()
        maxScore = abs(float(fx['score'].max()))
        minScore = abs(float(fx['score'].min()))
        # filter out overlapping rows via hierarchy
        if maxScore > minScore:
            fx = fx[fx['score'] == maxScore]
        elif maxLength > minLength:
            fx = fx[fx['length'] == maxLength]
        data.append(fx)
    out = pd.concat(data, axis=0)
    # randomly reindex to keep random row when dropping remaining duplicates: https://gist.github.com/cadrev/6b91985a1660f26c2742
    out.reset_index(inplace=True)
    out = out.reindex(np.random.permutation(out.index))
    return out.drop_duplicates(subset=['begin', 'end', 'note_id'])
This works fine, except that the dataframes I am iterating over each have well over 100K rows, so this takes forever to complete. I timed the various methods using %prun in Jupyter, and the method that seems to eat up the most processing time is series.py:3719(apply). NB: I tried using modin.pandas, but that caused more problems (I kept getting an error about Interval needing a value where left was less than right, which I couldn't figure out; I may file a GitHub issue there).
I am looking for a way to optimize this, such as using vectorization, but honestly I don't have the slightest clue how to convert this to a vectorized form.
Here is a sample of my data:
begin,end,note_id,score
0,9,0365,1
10,14,0365,1
25,37,0365,0.7
28,37,0365,1
38,42,0365,1
53,69,0365,0.7857142857142857
56,60,0365,1
56,69,0365,1
64,69,0365,1
83,86,0365,1
91,98,0365,0.8333333333333334
101,108,0365,1
101,127,0365,1
112,119,0365,1
112,127,0365,0.8571428571428571
120,127,0365,1
163,167,0365,1
196,203,0365,1
208,216,0365,1
208,223,0365,1
208,231,0365,1
208,240,0365,0.6896551724137931
217,223,0365,1
217,231,0365,1
224,231,0365,1
246,274,0365,0.7692307692307693
252,274,0365,1
263,274,0365,0.8888888888888888
296,316,0365,0.7222222222222222
301,307,0365,1
301,316,0365,1
301,330,0365,0.7307692307692307
301,336,0365,0.78125
308,316,0365,1
308,323,0365,1
308,330,0365,1
308,336,0365,1
317,323,0365,1
317,336,0365,1
324,330,0365,1
324,336,0365,1
361,418,0365,0.7368421052631579
370,404,0365,0.7111111111111111
370,418,0365,0.875
383,418,0365,0.8285714285714286
396,404,0365,1
396,418,0365,0.8095238095238095
405,418,0365,0.8333333333333334
432,453,0365,0.7647058823529411
438,453,0365,1
438,458,0365,0.7222222222222222

I think I know what the issue was: I did my filtering on note_id incorrectly, and was thus iterating over the entire dataframe.
It should have been:
cases = set(df['note_id'].tolist())
for case in cases:
    test = df[df['note_id'] == case].copy()
    for row in test.itertuples():
        # get overlapping intervals:
        # https://stackoverflow.com/questions/58192068/is-it-possible-to-use-pandas-overlap-in-a-dataframe
        iix = pd.IntervalIndex.from_arrays(test.begin, test.end, closed='neither')
        span_range = pd.Interval(row.begin, row.end)
        fx = test[iix.overlaps(span_range)].copy()
        maxLength = fx['length'].max()
        minLength = fx['length'].min()
        maxScore = abs(float(fx['score'].max()))
        minScore = abs(float(fx['score'].min()))
        if maxScore > minScore:
            fx = fx[fx['score'] == maxScore]
        elif maxLength > minLength:
            fx = fx[fx['length'] == maxLength]
        data.append(fx)
out = pd.concat(data, axis=0)
When testing on one note, it was taking over 16 minutes while I was still iterating over the entire, unfiltered dataframe. Now it's down to 28 seconds!
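If more speed is needed, a further step would be to drop the per-row IntervalIndex/overlaps call entirely and compute one pairwise overlap matrix per note with NumPy broadcasting. The sketch below is only a rough outline of that idea under a few assumptions (begin/end already numeric, the abs() on the scores ignored); the tie-breaking semantics should be checked against the original before trusting it:

import numpy as np
import pandas as pd

def disambiguate_broadcast(df):
    df = df.copy()
    df['length'] = (df['end'] - df['begin']).abs()
    pieces = []
    for _, grp in df.groupby('note_id'):
        begin = grp['begin'].to_numpy(dtype=float)
        end = grp['end'].to_numpy(dtype=float)
        score = grp['score'].to_numpy(dtype=float)
        length = grp['length'].to_numpy(dtype=float)
        # strict pairwise overlap matrix, equivalent to closed='neither'
        overlaps = (begin[:, None] < end[None, :]) & (begin[None, :] < end[:, None])
        for i in range(len(grp)):
            mask = overlaps[i]
            if not mask.any():
                continue  # degenerate interval that overlaps nothing, skip it
            fx = grp[mask]
            s, l = score[mask], length[mask]
            if s.max() > s.min():
                fx = fx[s == s.max()]
            elif l.max() > l.min():
                fx = fx[l == l.max()]
            pieces.append(fx)
    out = pd.concat(pieces)
    return out.drop_duplicates(subset=['begin', 'end', 'note_id'])

The n-by-n boolean matrix trades memory for speed, so this only pays off while each note stays reasonably small.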

Related

Calculation of the removal percentage for chemical parameters (faster code)

I have to calculate the removal percentages of chemical/biological parameters (e.g. after an oxidation process) in a wastewater treatment plant.
My code works so far and does exactly what it should do, but it is really slow.
On my laptop the calculation for the original dataset took about 10 s, and on my PC 4 s, for a 15x80 DataFrame. That is too long, especially if I have to deal with more rows.
What the code does:
The formula for the single removal is defined as: 1 - n(i)/n(i-1)
and for the total removal: 1 - n(i)/n(0)
Every measuring point has its own ID. The code searches for the IDs, performs the calculation, and saves the result in the data frame.
Here is an example (I can't post the original data):
import pandas as pd
import numpy as np

data = {"ID": ["X1_P0001", "X2_P0001", "X3_P0001", "X1_P0002", "X2_P0002", "X3_P0002", "X4_P0002", "X5_P0002", "X1_P0003", "X2_P0003", "X3_P0003"],
        "Measurement": [100, 80, 60, 120, 90, 70, 50, 25, 85, 65, 35]}
df = pd.DataFrame(data)  # construct the dataframe before adding the result columns
df["S_removal"] = np.nan
df["T_removal"] = np.nan
Data Frame before calculation
this is my function for the calculation:
def removal_TEST(Rem1, Measure, Rem2):
    lst = [i.split("_")[1] for i in df["ID"]]  # takes relevant ID information
    y = np.unique(lst)  # stores unique ID values to loop over them
    for ID in y:
        id_list = []
        for i in range(0, len(df["ID"])):
            if ID in df["ID"][i]:
                id_list.append(i)
            else:  # this stores only the relevant id in a new list
                id_list.append(np.nan)
        indexlist = pd.Series(id_list)
        first_index = indexlist.first_valid_index()  # gets the first and last index of the id list
        last_index = indexlist.last_valid_index()
        col_indizes = []
        for i in range(first_index, last_index+1):
            col_indizes.append(i)
        for i in col_indizes:
            if i == 0:
                continue  # for i=0 there is no i-1 element, so i=0 should be skipped
            else:
                Rem1[i] = 1 - (Measure[i]/Measure[i-1])
        Rem1[first_index] = np.nan  # first entry of an ID must be NaN value
        for i in range(first_index, last_index+1):
            col_indizes.append(i)
        for i in range(len(Rem2)):
            for i in col_indizes:
                Rem2[i] = 1 - (Measure[i]/Measure[first_index])
        Rem2[first_index] = np.nan
this is the result:
Final Data Frame
I am new to Python and to stackoverflow (so sorry if my code and question are not so good to read). Are there any good libraries to speed up my code, or do you have some suggestions?
Thank you :)
Your use of Pandas seems to be getting in the way of solving the problem. The only relevant state seems to be when the group changes and the first and previous measurement values for each row.
I'd be tempted to solve this just using Python primitives, but you could solve this in other ways if you had lots of data (i.e. millions of rows).
import pandas as pd

df = pd.DataFrame({
    "ID": ["X1_P0001", "X2_P0001", "X3_P0001", "X1_P0002", "X2_P0002", "X3_P0002", "X4_P0002", "X5_P0002", "X1_P0003", "X2_P0003", "X3_P0003"],
    "Measurement": [100, 80, 60, 120, 90, 70, 50, 25, 85, 65, 35],
    "S_removal": float('nan'),
    "T_removal": float('nan'),
})

# somewhere keep track of the last group identifier
last = None

# iterate over rows
for idx, ID, meas in zip(df.index, df['ID'], df['Measurement']):
    # what's the current group name
    _, grp = ID.split('_', 1)
    # see if we're in a new group
    if grp != last:
        last = grp
        # track the group's measurement
        grp_meas = meas
    else:
        # calculate things
        df.loc[idx, 'S_removal'] = 1 - meas / last_meas
        df.loc[idx, 'T_removal'] = 1 - meas / grp_meas
    # keep track of the last measurement
    last_meas = meas
I've commented the code in the hopes it makes sense. This takes ~2 seconds for 1000 copies of your example data, so 11000 rows.
Given that OP has said this needs to be done for a wide dataset, here's another version that reduces runtime to ~30ms for 11000 rows and 2 columns:
import numpy as np
import pandas as pd

data = {
    "ID": ["X1_P0001", "X2_P0001", "X3_P0001", "X1_P0002", "X2_P0002", "X3_P0002", "X4_P0002", "X5_P0002", "X1_P0003", "X2_P0003", "X3_P0003"],
    "M1": [100, 80, 60, 120, 90, 70, 50, 25, 85, 65, 35],
    "M2": [100, 80, 60, 120, 90, 70, 50, 25, 85, 65, 35],
}
# reset_index() because code below assumes they are unique
df = pd.concat([pd.DataFrame(data)]*1000).reset_index()

# column names
measurement_col_names = ['M1', 'M2']
single_output_names = ['S1', 'S2']
total_output_names = ['T1', 'T2']

# somewhere keep track of the last group identifier
last = None
# somewhere to store intermediate state
vals_idx = []
meas_vals = []
last_vals = []
grp_vals = []

# iterate over rows
for idx, ID, meas in zip(df.index, df['ID'], df.loc[:, measurement_col_names].values):
    # what's the current group name
    _, grp = ID.split('_', 1)
    # we're in a new group
    if grp != last:
        last = grp
        # track the group's measurement
        grp_meas = meas
    else:
        # track values and which rows they apply to
        vals_idx.append(idx)
        meas_vals.append(meas)
        last_vals.append(last_meas)
        grp_vals.append(grp_meas)
    # keep track of the last measurement
    last_meas = meas

# convert to numpy array so it vectorises nicely
meas_vals = np.array(meas_vals)

# perform calculation using fast numpy operations
df.loc[vals_idx, single_output_names] = 1 - (meas_vals / last_vals)
df.loc[vals_idx, total_output_names] = 1 - (meas_vals / grp_vals)
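For completeness, the same two formulas can also be expressed with pandas' groupby machinery, which removes the explicit row loop altogether. This is a hedged sketch for a single measurement column under the data layout above, not a benchmarked replacement:

import pandas as pd

df = pd.DataFrame(data)                           # 'data' as defined above
key = df['ID'].str.split('_').str[1]              # group label, e.g. 'P0001'
g = df.groupby(key)['M1']
df['S1'] = 1 - df['M1'] / g.shift(1)              # single removal: 1 - n(i)/n(i-1)
df['T1'] = 1 - df['M1'] / g.transform('first')    # total removal:  1 - n(i)/n(0)
# repeat (or loop) the last three lines for the other measurement columns, e.g. 'M2'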

I am trying to write a function which divides a column into 3 parts

I am trying to write a function which takes a column as input, divides it into 3 parts (short, medium, long), and returns them as lists.
I tried to do it with the loc function, but it returns a dataframe rather than a list.
def DivideColumns(df, col):
    mean = df[col].mean()
    maxi = df[col].max()
    mini = df[col].min()
    less = mean - (maxi-mini)/3
    more = mean + (maxi-mini)/3
    short = df.loc[df[col] < less]
    average = df.loc[df[col].between(less, more)]
    long = df.loc[df[col] > more]
    return short, average, long
What I expected was to get 3 different lists, but unfortunately I got 3 different dataframes.
Since you are using pandas, you can use the concept of binning. With the pandas cut function you can divide the column into whatever ranges you like, and it makes your code easier to read. More info here
def DivideColumns(df, col):
    mean = df[col].mean()
    maxi = df[col].max()
    mini = df[col].min()
    less = mean - (maxi-mini)/3
    more = mean + (maxi-mini)/3
    # binning
    bins_values = [mini, less, more, maxi]
    group_names = ['short', 'average', 'long']
    bins = pd.cut(df[col], bins_values, labels=group_names, include_lowest=True)
    short = (df[col][bins == 'short']).tolist()
    average = (df[col][bins == 'average']).tolist()
    long = (df[col][bins == 'long']).tolist()
    return short, average, long
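A short, hedged usage sketch on made-up data (the column name 'duration' and its values are invented for illustration; note that pd.cut uses half-open bins, so edge values may land slightly differently than with strict </> comparisons):

import pandas as pd

df = pd.DataFrame({'duration': [1, 2, 3, 10, 11, 12, 25, 26, 30]})
short, average, long = DivideColumns(df, 'duration')
print(short)    # values below mean - (max-min)/3
print(average)  # values in the middle band
print(long)     # values above mean + (max-min)/3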
Use the tolist() function to transform a pandas dataframe into a list.
short = df.loc[df[col] < less].values.tolist()
average = df.loc[df[col].between(less, more)].values.tolist()
long = df.loc[df[col] > more].values.tolist()

How to vectorize this peak finding for loop in Python?

Basically I'm writing a peak finding function that needs to be able to beat scipy.argrelextrema in benchmarking. Here is a link to the data I'm using, and the code:
https://drive.google.com/open?id=1U-_xQRWPoyUXhQUhFgnM3ByGw-1VImKB
If this link expires, the data can be found at dukascopy bank's online historical data downloader.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('EUR_USD.csv')
data.columns = ['Date', 'open', 'high', 'low', 'close', 'volume']
data.Date = pd.to_datetime(data.Date, format='%d.%m.%Y %H:%M:%S.%f')
data = data.set_index(data.Date)
data = data[['open', 'high', 'low', 'close']]
data = data.drop_duplicates(keep=False)
price = data.close.values

def fft_detect(price, p=0.4):
    trans = np.fft.rfft(price)
    trans[round(p*len(trans)):] = 0
    inv = np.fft.irfft(trans)
    dy = np.gradient(inv)
    peaks_idx = np.where(np.diff(np.sign(dy)) == -2)[0] + 1
    valleys_idx = np.where(np.diff(np.sign(dy)) == 2)[0] + 1
    patt_idx = list(peaks_idx) + list(valleys_idx)
    patt_idx.sort()
    label = [x for x in np.diff(np.sign(dy)) if x != 0]
    # Look for Better Peaks
    l = 2
    new_inds = []
    for i in range(0, len(patt_idx[:-1])):
        search = np.arange(patt_idx[i]-(l+1), patt_idx[i]+(l+1))
        if label[i] == -2:
            idx = price[search].argmax()
        elif label[i] == 2:
            idx = price[search].argmin()
        new_max = search[idx]
        new_inds.append(new_max)
    plt.plot(price)
    plt.plot(inv)
    plt.scatter(patt_idx, price[patt_idx])
    plt.scatter(new_inds, price[new_inds], c='g')
    plt.show()
    return peaks_idx, price[peaks_idx]
It basically smoothes the data using a fast Fourier transform (FFT), then takes the derivative to find the minimum and maximum indices of the smoothed data, and then finds the corresponding peaks in the unsmoothed data. Sometimes the peaks it finds are not ideal due to smoothing effects, so I run this for loop to search for higher or lower points around each index, within the bounds specified by l. I need help vectorizing this for loop! I have no idea how to do it. Without the for loop, my code is about 50% faster than scipy.argrelextrema, but the for loop slows it down. So if I can find a way to vectorize it, it'd be a very quick and very effective alternative to scipy.argrelextrema. These two images represent the data without and with the for loop, respectively.
This may do it. It's not perfect but hopefully it obtains what you want and shows you a bit how to vectorize. Happy to hear any improvements you think up
label = np.array(label[:-1]) # not sure why this is 1 unit longer than search.shape[0]?
# the idea is to make the index matrix you're for looping over row by row all in one go.
# This part is sloppy and you can improve this generation.
search = np.vstack([np.arange(patt_idx[i]-(l+1), patt_idx[i]+(l+1)) for i in range(0, len(patt_idx[:-1]))])  # you can refine this.
# then you can make the price matrix
price = price[search]
# and you can swap the sign of elements so you only need to do argmin instead of both argmin and argmax
price[label==-2] = - price[label==-2]
# now find the indices of the minimum price on each row
idx = np.argmin(price,axis=1)
# and then extract the refined indices from the search matrix
new_inds = search[np.arange(idx.shape[0]),idx] # this too can be cleaner.
# not sure what's going on here so that search[:,idx] doesn't work for me
# probably just a misunderstanding
I find that this reproduces your result but I did not time it. I suspect the search generation is quite slow but probably still faster than your for loop.
Edit:
Here's a better way to produce search:
patt_idx = np.array(patt_idx)
starts = patt_idx[:-1]-(l+1)
stops = patt_idx[:-1]+(l+1)
ds = stops-starts
s0 = stops.shape[0]
s1 = ds[0]
search = np.reshape(np.repeat(stops - ds.cumsum(), ds) + np.arange(ds.sum()),(s0,s1))
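For what it's worth, a hedged, arguably simpler way to build the same search matrix is plain broadcasting, assuming (as above) that every window has the same width 2*(l+1):

patt = np.array(patt_idx[:-1])
search = patt[:, None] + np.arange(-(l + 1), l + 1)[None, :]

This skips the repeat/cumsum bookkeeping, though it has not been benchmarked against the reshape version.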
Here is an alternative... it uses list comprehension which is generally faster than for-loops
l = 2
# Define the bounds beforehand, it's marginally faster than doing it in the loop
upper = np.array(patt_idx) + l + 1
lower = np.array(patt_idx) - l - 1
# List comprehension...
new_inds = [price[low:hi].argmax() + low if lab == -2 else
            price[low:hi].argmin() + low
            for low, hi, lab in zip(lower, upper, label)]
# Find maximum within each interval
new_max = price[new_inds]
new_global_max = np.max(new_max)

Python Pandas Merge Causing Memory Overflow

I'm new to Pandas and am trying to merge a few subsets of data. I'm giving a specific case where this happens, but the question is general: How/why is it happening and how can I work around it?
The data I load is around 85 MB or so, but I often watch my Python session run up to close to 10 GB of memory usage and then give a memory error.
I have no idea why this happens, but it's killing me as I can't even get started looking at the data the way I want to.
Here's what I've done:
Importing the Main data
import requests, zipfile, StringIO
import numpy as np
import pandas as pd
STAR2013url="http://www3.cde.ca.gov/starresearchfiles/2013/p3/ca2013_all_csv_v3.zip"
STAR2013fileName = 'ca2013_all_csv_v3.txt'
r = requests.get(STAR2013url)
z = zipfile.ZipFile(StringIO.StringIO(r.content))
STAR2013=pd.read_csv(z.open(STAR2013fileName))
Importing some Cross-Referencing Tables
STARentityList2013url = "http://www3.cde.ca.gov/starresearchfiles/2013/p3/ca2013entities_csv.zip"
STARentityList2013fileName = "ca2013entities_csv.txt"
r = requests.get(STARentityList2013url)
z = zipfile.ZipFile(StringIO.StringIO(r.content))
STARentityList2013=pd.read_csv(z.open(STARentityList2013fileName))
STARlookUpTestID2013url = "http://www3.cde.ca.gov/starresearchfiles/2013/p3/tests.zip"
STARlookUpTestID2013fileName = "Tests.txt"
r = requests.get(STARlookUpTestID2013url)
z = zipfile.ZipFile(StringIO.StringIO(r.content))
STARlookUpTestID2013=pd.read_csv(z.open(STARlookUpTestID2013fileName))
STARlookUpSubgroupID2013url = "http://www3.cde.ca.gov/starresearchfiles/2013/p3/subgroups.zip"
STARlookUpSubgroupID2013fileName = "Subgroups.txt"
r = requests.get(STARlookUpSubgroupID2013url)
z = zipfile.ZipFile(StringIO.StringIO(r.content))
STARlookUpSubgroupID2013=pd.read_csv(z.open(STARlookUpSubgroupID2013fileName))
Renaming a Column ID to Allow for Merge
STARlookUpSubgroupID2013 = STARlookUpSubgroupID2013.rename(columns={'001':'Subgroup ID'})
STARlookUpSubgroupID2013
Successful Merge
merged = pd.merge(STAR2013,STARlookUpSubgroupID2013, on='Subgroup ID')
Try a second merge. This is where the Memory Overflow Happens
merged=pd.merge(merged, STARentityList2013, on='School Code')
I did all of this in an IPython notebook, but I don't think that changes anything.
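A quick, hedged way to see whether that second merge is exploding combinatorially is to check how many duplicates the join key has on each side, since a many-to-many merge produces the product of the per-key counts:

# counts of repeated join-key values on each side (many-to-many keys multiply)
print(merged['School Code'].value_counts().head())
print(STARentityList2013['School Code'].value_counts().head())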
Although this is an old question, I recently came across the same problem.
In my instance, duplicate keys are required in both dataframes, and I needed a method which could tell if a merge will fit into memory ahead of computation, and if not, change the computation method.
The method I came up with is as follows:
Calculate merge size:
def merge_size(left_frame, right_frame, group_by, how='inner'):
    left_groups = left_frame.groupby(group_by).size()
    right_groups = right_frame.groupby(group_by).size()
    left_keys = set(left_groups.index)
    right_keys = set(right_groups.index)
    intersection = right_keys & left_keys
    left_diff = left_keys - intersection
    right_diff = right_keys - intersection

    left_nan = len(left_frame[left_frame[group_by] != left_frame[group_by]])
    right_nan = len(right_frame[right_frame[group_by] != right_frame[group_by]])
    left_nan = 1 if left_nan == 0 and right_nan != 0 else left_nan
    right_nan = 1 if right_nan == 0 and left_nan != 0 else right_nan

    sizes = [(left_groups[group_name] * right_groups[group_name]) for group_name in intersection]
    sizes += [left_nan * right_nan]

    left_size = [left_groups[group_name] for group_name in left_diff]
    right_size = [right_groups[group_name] for group_name in right_diff]

    if how == 'inner':
        return sum(sizes)
    elif how == 'left':
        return sum(sizes + left_size)
    elif how == 'right':
        return sum(sizes + right_size)
    return sum(sizes + left_size + right_size)
Note:
At present with this method, the key can only be a label, not a list. Using a list for group_by currently returns a sum of merge sizes for each label in the list. This will result in a merge size far larger than the actual merge size.
If you are using a list of labels for the group_by, the final row size is:
min([merge_size(df1, df2, label, how) for label in group_by])
Check if this fits in memory
The merge_size function defined here returns the number of rows which will be created by merging two dataframes together.
By multiplying this with the count of columns from both dataframes, then multiplying by the size of np.float[32/64], you can get a rough idea of how large the resulting dataframe will be in memory. This can then be compared against psutil.virtual_memory().available to see if your system can calculate the full merge.
def mem_fit(df1, df2, key, how='inner'):
    rows = merge_size(df1, df2, key, how)
    cols = len(df1.columns) + (len(df2.columns) - 1)
    required_memory = (rows * cols) * np.dtype(np.float64).itemsize
    return required_memory <= psutil.virtual_memory().available
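A small, hedged usage sketch (the frames, column names, and values below are invented for illustration; numpy and psutil need to be imported for mem_fit):

import numpy as np
import pandas as pd
import psutil

df1 = pd.DataFrame({'School Code': [1, 1, 2], 'x': [10, 20, 30]})
df2 = pd.DataFrame({'School Code': [1, 2, 2], 'y': ['a', 'b', 'c']})

print(merge_size(df1, df2, 'School Code'))   # 4 rows expected from an inner merge
if mem_fit(df1, df2, 'School Code'):
    merged = pd.merge(df1, df2, on='School Code')
else:
    print('merge would not fit in memory; consider chunking the larger frame')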
The merge_size method has been proposed as an extension of pandas in this issue. https://github.com/pandas-dev/pandas/issues/15068.

pandas: setting last N rows of multi-index to Nan for speeding up groupby with shift

I am trying to speed up my groupby.apply + shift and
thanks to this previous question and answer: How to speed up Pandas multilevel dataframe shift by group? I can prove that it does indeed speed things up when you have many groups.
From that question I now have the following code to set the first entry in each multi-index to Nan. And now I can do my shift globally rather than per group.
df.iloc[df.groupby(level=0).size().cumsum()[:-1]] = np.nan
but I want to look forward, not backwards, and need to do calculations across N rows. So I am trying to use some similar code to set the last N entries to NaN, but obviously I am missing some important indexing knowledge as I just can't figure it out.
I figure I want to convert this so that every entry is a range rather than a single integer. How would I do that?
# the start of each group, ignoring the first entry
df.groupby(level=0).size().cumsum()[1:]
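As a hedged sketch of the indexing in question (N below stands for the assumed look-ahead window), the group ends from the cumsum can be expanded into ranges of positional indices covering the last N rows of each group:

# positions one past the last row of each group
ends = df.groupby(level=0).size().cumsum().values
# the last N positional indices of every group, concatenated
rows = np.concatenate([np.arange(e - N, e) for e in ends])
df.iloc[rows] = np.nan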
Test setup (for backwards shift) if you want to try it:
length = 5
groups = 3
rng1 = pd.date_range('1/1/1990', periods=length, freq='D')
frames = []
for x in xrange(0, groups):
    tmpdf = pd.DataFrame({'date': rng1, 'category': int(10000000*abs(np.random.randn())), 'colA': np.random.randn(length), 'colB': np.random.randn(length)})
    frames.append(tmpdf)
df = pd.concat(frames)
df.sort(columns=['category','date'],inplace=True)
df.set_index(['category','date'],inplace=True,drop=True)
df['tmpShift'] = df['colB'].shift(1)
df.iloc[df.groupby(level=0).size().cumsum()[:-1]] = np.nan
# Yay this is so much faster.
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop('tmpShift', 1, inplace=True)
Thanks!
I ended up doing it using a groupby apply as follows (and coded to work forwards or backwards):
def replace_tail(grp, col, N, value):
    if N > 0:
        grp[col][:N] = value
    else:
        grp[col][N:] = value
    return grp

df = df.groupby(level=0).apply(replace_tail, 'tmpShift', 2, np.nan)
So the final code is:
def replace_tail(grp, col, N, value):
    if N > 0:
        grp[col][:N] = value
    else:
        grp[col][N:] = value
    return grp

length = 5
groups = 3
rng1 = pd.date_range('1/1/1990', periods=length, freq='D')
frames = []
for x in xrange(0, groups):
    tmpdf = pd.DataFrame({'date': rng1, 'category': int(10000000*abs(np.random.randn())), 'colA': np.random.randn(length), 'colB': np.random.randn(length)})
    frames.append(tmpdf)
df = pd.concat(frames)
df.sort(columns=['category', 'date'], inplace=True)
df.set_index(['category', 'date'], inplace=True, drop=True)

shiftBy = -1
df['tmpShift'] = df['colB'].shift(shiftBy)
df = df.groupby(level=0).apply(replace_tail, 'tmpShift', shiftBy, np.nan)

# Yay this is so much faster.
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop('tmpShift', 1, inplace=True)
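As a footnote, recent pandas versions implement the group-wise shift itself in fast compiled code, so (as a hedged alternative sketch, not the answer's method) the same forward shift can be written without the apply, using the df and shiftBy built above:

df['tmpShift'] = df.groupby(level=0)['colB'].shift(shiftBy)  # NaN on the last row of each group
df['newColumn'] = df['tmpShift'] / df['colA']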
