I am implementing my own function for calculating taxes. Below you can see the data:
df = pd.DataFrame({"id_n":["1","2","3","4","5"],
"sales1":[0,115000,440000,500000,740000],
"sales2":[0,115000,460000,520000,760000],
"tax":[0,8050,57500,69500,69500]
})
Now I want to introduce a tax function that produces the same results as the tax column. Below you can see an estimation of that function:
# Thresholds
min_threeshold = 500000
max_threeshold = 1020000
# Maximum taxes
max_cap = 69500
# Rates
rate_1 = 0.035
rate_2 = 0.1
# Total sales
total_sale = df['sales1'] + df['sales2']
tax = df['tax']

# Function for estimation
def tax_fun(total_sale, tax, min_threeshold, max_threeshold, max_cap, rate_1, rate_2):
    if (total_sale > 0 and tax == 0):
        calc_tax = 0
    elif (total_sale < min_threeshold):
        calc_tax = total_sale * rate_1
    elif (total_sale >= min_threeshold) & (total_sale <= max_threeshold):
        calc_tax = total_sale * rate_2
    elif (total_sale > max_threeshold):
        calc_tax = max_cap
    return calc_tax
So far so good. The next step is the execution of the above function. Below you can see the command:
df['new_tax'] = tax_fun(total_sale, tax, min_threeshold, max_threeshold, max_cap, rate_1, rate_2)
After executing this command, I received this error:
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Can anybody help me solve this problem?
You want to use df.apply(), you're almost there!
def someFunc(value):
    # do something with the row value here
    return result

def main():
    # setup df
    new_col = 'col_name'
    df[new_col] = df.apply(lambda x: someFunc(x['some_col']), axis=1)
This is powerful because with axis=1, x gives you access to each row, so you can pass data from the row to your custom function and have it applied, using optimized pandas, to every row.
I forget the exact object structure of x (it is a Series representing the row), so you may want to look into that to access what you need. Also, x can be named anything.
Docs here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
Hope this helps!
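For the tax question above, a minimal sketch of what that could look like (a hedged example, keeping tax_fun as in the question but feeding it scalar values per row, so the plain if/elif comparisons are no longer ambiguous):

def tax_fun(total_sale, tax, min_threeshold, max_threeshold, max_cap, rate_1, rate_2):
    # operates on scalars from a single row, so `if` comparisons are unambiguous
    if total_sale > 0 and tax == 0:
        return 0
    elif total_sale < min_threeshold:
        return total_sale * rate_1
    elif min_threeshold <= total_sale <= max_threeshold:
        return total_sale * rate_2
    else:  # total_sale > max_threeshold
        return max_cap

df['new_tax'] = df.apply(
    lambda row: tax_fun(row['sales1'] + row['sales2'], row['tax'],
                        min_threeshold, max_threeshold, max_cap, rate_1, rate_2),
    axis=1)

With axis=1 each row is passed to the lambda as a Series, so the scalar values can be pulled out of it and handed to the function.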
Hi, I'm trying to add new columns to a time-series pandas DataFrame that is essentially tracking the charging and discharging of a battery. I can make it work with iterrows, but as you might expect it's very slow on a large time series. From some internet searching I'm thinking apply is the way to go (or not; I'm hoping you'll point me in the right direction), but I'm having trouble accessing values from the previous time step. I've created this very simplified piece of code that tries to capture what I'm attempting to do. Basically I cannot figure out how to pass the 'end' value that I calculate on the previous row to the 'start' value on the next row.
import numpy as np
import pandas as pd

df = pd.DataFrame(data={'NetPosition': [-10, -5, 10], 'row_no': [0, 1, 2]})
df['start'] = 0
df['end'] = 0
df['dispatch'] = 0
starting_value = 20
max_rating = 4

def f(x):
    prev_index = max(0, int(x.row_no - 1))
    if x.row_no == 0:
        start = starting_value
    else:
        start = df['end'].iloc[prev_index]
        # this is the part that doesn't work - I'm attempting to pull the end
        # value from the previous row into the next row
    if x['NetPosition'] < 0:
        dispatch = min(np.abs(x['NetPosition']), max_rating, start)
        end = start - dispatch
    else:
        dispatch = 0
        end = start
    return pd.Series([start, end, dispatch])

df[['start', 'end', 'dispatch']] = df.apply(lambda x: f(x), axis=1)
df
Use .shift(1) to pull the previous row's value down, and .shift(-1) to pull the next row's value up. Use np.where, which works like the =IF function in Excel.
import pandas as pd
import numpy as np

df = pd.DataFrame(data={'NetPosition': [-10, -5, 10], 'row_no': [0, 1, 2]})
df['start'] = 0
df['end'] = 0
df['dispatch'] = 0
starting_value = 20
max_rating = 4

# Answer
df.dispatch = np.where(df.NetPosition < 0,
                       np.minimum(max_rating, df['NetPosition'].abs()),
                       0)
df.start = df.end.shift(1)
df.start = df.start.fillna(starting_value)
df.end = np.where(df.NetPosition < 0, df.start - df.dispatch, df.start)
df
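One caveat, hedged: the shift-based version above cannot fully reproduce the logic in the question, because each row's start depends on the previous row's computed end (a recursive dependency), and end has not been computed yet when shift(1) runs. A minimal sketch of one way to carry that state forward, using a plain loop over NumPy arrays (typically much faster than iterrows or a row-wise apply) and reusing the names from the question:

net = df['NetPosition'].to_numpy()
start = np.zeros(len(df))
end = np.zeros(len(df))
dispatch = np.zeros(len(df))

state = starting_value                     # carried from one row to the next
for i in range(len(df)):
    start[i] = state
    if net[i] < 0:
        dispatch[i] = min(abs(net[i]), max_rating, state)
    end[i] = start[i] - dispatch[i]
    state = end[i]                         # next row's start is this row's end

df['start'], df['end'], df['dispatch'] = start, end, dispatch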
I have a function which takes a column of a DataFrame as a parameter. I want to apply it to all the columns of a DataFrame without looping.
I've already looked into the function apply, but I don't understand how to use it in my context.
Here is my function:
def shortlong_oneasset(asset):
    ret = pd.DataFrame()
    n_horizons = len(horizon)
    for i in range(n_horizons):
        r = compute_yields(asset, horizon[i], "horizon")
        r.loc[r[asset.name] < short_threshold, asset.name] = int(-1)
        r.loc[r[asset.name] > long_threshold, asset.name] = int(1)
        r.loc[(r[asset.name] != -1) & (r[asset.name] != 1), asset.name] = 0
        ret = pd.concat([ret, r], axis=1)
        ret.rename(columns={asset.name: horizon[i]}, inplace=True)
    ret = ret.iloc[:, ~ret.columns.duplicated()]
    #ret = pd.merge_asof(dates.to_frame(), ret, left_on="Dates", right_on="Dates")
    return ret
The "asset" parameter is a column of a DataFrame, for example:
shortlong_oneasset(data['column 1'])
Here my DataFrame is named "bonds". I tried to do:
test = bonds.apply(lambda x: shortlong_oneasset(x), axis=1)
or
bonds.apply(shortlong_oneasset)
Can anyone help me with this issue?
Thanks
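For reference, a sketch of the relevant distinction (untested against the real bonds data): with the default axis=0, DataFrame.apply passes each column to the function as a Series, so axis=1 is not what is wanted here. And because shortlong_oneasset returns a whole DataFrame for each column rather than a scalar or Series, collecting the per-column results explicitly may behave more predictably than apply:

# default axis=0: each column of bonds is passed to the function as a Series
# (this is what bonds.apply(shortlong_oneasset) does; no axis=1 needed)

# since shortlong_oneasset returns a DataFrame per column, gather the results
# into a dict keyed by column name instead of relying on apply to combine them
results = {col: shortlong_oneasset(bonds[col]) for col in bonds.columns}

# e.g. results['column 1'] is the signal DataFrame for that asset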
I have the following method in which I am eliminating overlapping intervals in a dataframe based on a set of hierarchical rules:
def disambiguate(arg):
    arg['length'] = (arg.end - arg.begin).abs()
    df = arg[['begin', 'end', 'note_id', 'score', 'length']].copy()
    data = []
    out = pd.DataFrame()
    for row in df.itertuples():
        test = df[df['note_id'] == row.note_id].copy()
        # get overlapping intervals:
        # https://stackoverflow.com/questions/58192068/is-it-possible-to-use-pandas-overlap-in-a-dataframe
        iix = pd.IntervalIndex.from_arrays(test.begin.apply(pd.to_numeric), test.end.apply(pd.to_numeric), closed='neither')
        span_range = pd.Interval(row.begin, row.end)
        fx = test[iix.overlaps(span_range)].copy()
        maxLength = fx['length'].max()
        minLength = fx['length'].min()
        maxScore = abs(float(fx['score'].max()))
        minScore = abs(float(fx['score'].min()))
        # filter out overlapping rows via hierarchy
        if maxScore > minScore:
            fx = fx[fx['score'] == maxScore]
        elif maxLength > minLength:
            fx = fx[fx['length'] == maxLength]
        data.append(fx)
    out = pd.concat(data, axis=0)
    # randomly reindex to keep a random row when dropping remaining duplicates: https://gist.github.com/cadrev/6b91985a1660f26c2742
    out.reset_index(inplace=True)
    out = out.reindex(np.random.permutation(out.index))
    return out.drop_duplicates(subset=['begin', 'end', 'note_id'])
This works fine, except that the dataframes I am iterating over have well over 100K rows each, so it takes forever to complete. I timed the various methods with %prun in Jupyter, and the call that seems to eat up processing time is series.py:3719(apply). NB: I tried using modin.pandas, but that caused more problems (I kept getting an error about an Interval needing left to be less than right, which I couldn't figure out; I may file a GitHub issue there).
I am looking for a way to optimize this, such as vectorization, but honestly I don't have the slightest clue how to convert this to a vectorized form.
Here is a sample of my data:
begin,end,note_id,score
0,9,0365,1
10,14,0365,1
25,37,0365,0.7
28,37,0365,1
38,42,0365,1
53,69,0365,0.7857142857142857
56,60,0365,1
56,69,0365,1
64,69,0365,1
83,86,0365,1
91,98,0365,0.8333333333333334
101,108,0365,1
101,127,0365,1
112,119,0365,1
112,127,0365,0.8571428571428571
120,127,0365,1
163,167,0365,1
196,203,0365,1
208,216,0365,1
208,223,0365,1
208,231,0365,1
208,240,0365,0.6896551724137931
217,223,0365,1
217,231,0365,1
224,231,0365,1
246,274,0365,0.7692307692307693
252,274,0365,1
263,274,0365,0.8888888888888888
296,316,0365,0.7222222222222222
301,307,0365,1
301,316,0365,1
301,330,0365,0.7307692307692307
301,336,0365,0.78125
308,316,0365,1
308,323,0365,1
308,330,0365,1
308,336,0365,1
317,323,0365,1
317,336,0365,1
324,330,0365,1
324,336,0365,1
361,418,0365,0.7368421052631579
370,404,0365,0.7111111111111111
370,418,0365,0.875
383,418,0365,0.8285714285714286
396,404,0365,1
396,418,0365,0.8095238095238095
405,418,0365,0.8333333333333334
432,453,0365,0.7647058823529411
438,453,0365,1
438,458,0365,0.7222222222222222
I think I know what the issue was: I did my filtering on note_id incorrectly, and was thus iterating over the entire dataframe.
It should have been:
cases = set(df['note_id'].tolist())
data = []
for case in cases:
    test = df[df['note_id'] == case].copy()
    for row in test.itertuples():
        # get overlapping intervals:
        # https://stackoverflow.com/questions/58192068/is-it-possible-to-use-pandas-overlap-in-a-dataframe
        iix = pd.IntervalIndex.from_arrays(test.begin, test.end, closed='neither')
        span_range = pd.Interval(row.begin, row.end)
        fx = test[iix.overlaps(span_range)].copy()
        maxLength = fx['length'].max()
        minLength = fx['length'].min()
        maxScore = abs(float(fx['score'].max()))
        minScore = abs(float(fx['score'].min()))
        if maxScore > minScore:
            fx = fx[fx['score'] == maxScore]
        elif maxLength > minLength:
            fx = fx[fx['length'] == maxLength]
        data.append(fx)
out = pd.concat(data, axis=0)
Testing on one note, it was taking over 16 minutes when I was iterating over the entire, non-filtered dataframe. Now it's at 28 seconds!
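A further possible refinement, offered as an untested sketch: since iix depends only on test, the IntervalIndex can be built once per note instead of once per row, moving that construction out of the inner loop:

data = []
for case in set(df['note_id']):
    test = df[df['note_id'] == case].copy()
    # build the IntervalIndex once per note rather than once per row
    iix = pd.IntervalIndex.from_arrays(test.begin, test.end, closed='neither')
    for row in test.itertuples():
        fx = test[iix.overlaps(pd.Interval(row.begin, row.end))].copy()
        maxScore, minScore = fx['score'].max(), fx['score'].min()
        maxLength, minLength = fx['length'].max(), fx['length'].min()
        if maxScore > minScore:
            fx = fx[fx['score'] == maxScore]
        elif maxLength > minLength:
            fx = fx[fx['length'] == maxLength]
        data.append(fx)
out = pd.concat(data, axis=0)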
I am trying to speed up my groupby.apply + shift, and thanks to this previous question and answer (How to speed up Pandas multilevel dataframe shift by group?) I can prove that it does indeed speed things up when you have many groups.
From that question I now have the following code to set the first entry in each multi-index group to NaN. Now I can do my shift globally rather than per group.
df.iloc[df.groupby(level=0).size().cumsum()[:-1]] = np.nan
But I want to look forward, not backwards, and need to do calculations across N rows. So I am trying to use similar code to set the last N entries in each group to NaN, but obviously I am missing some important indexing knowledge, as I just can't figure it out.
I figure I want to convert this so that every entry is a range rather than a single integer. How would I do that?
# the start of each group, ignoring the first entry
df.groupby(level=0).size().cumsum()[1:]
Test setup (for backwards shift) if you want to try it:
import numpy as np
import pandas as pd

length = 5
groups = 3
rng1 = pd.date_range('1/1/1990', periods=length, freq='D')
frames = []
for x in range(0, groups):
    tmpdf = pd.DataFrame({'date': rng1,
                          'category': int(10000000 * abs(np.random.randn())),
                          'colA': np.random.randn(length),
                          'colB': np.random.randn(length)})
    frames.append(tmpdf)
df = pd.concat(frames)
df.sort_values(by=['category', 'date'], inplace=True)
df.set_index(['category', 'date'], inplace=True, drop=True)

df['tmpShift'] = df['colB'].shift(1)
df.iloc[df.groupby(level=0).size().cumsum()[:-1]] = np.nan
# Yay this is so much faster.
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop('tmpShift', axis=1, inplace=True)
Thanks!
I ended up doing it using a groupby apply as follows (and coded to work forwards or backwards):
def replace_tail(grp, col, N, value):
    if N > 0:
        grp[col][:N] = value
    else:
        grp[col][N:] = value
    return grp

df = df.groupby(level=0).apply(replace_tail, 'tmpShift', 2, np.nan)
So the final code is:
def replace_tail(grp, col, N, value):
    if N > 0:
        grp[col][:N] = value
    else:
        grp[col][N:] = value
    return grp

length = 5
groups = 3
rng1 = pd.date_range('1/1/1990', periods=length, freq='D')
frames = []
for x in range(0, groups):
    tmpdf = pd.DataFrame({'date': rng1,
                          'category': int(10000000 * abs(np.random.randn())),
                          'colA': np.random.randn(length),
                          'colB': np.random.randn(length)})
    frames.append(tmpdf)
df = pd.concat(frames)
df.sort_values(by=['category', 'date'], inplace=True)
df.set_index(['category', 'date'], inplace=True, drop=True)

shiftBy = -1
df['tmpShift'] = df['colB'].shift(shiftBy)
df = df.groupby(level=0).apply(replace_tail, 'tmpShift', shiftBy, np.nan)
# Yay this is so much faster.
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop('tmpShift', axis=1, inplace=True)
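For completeness, here is a sketch of the purely positional approach the question was originally after (every group boundary expanded into a range of positions rather than a single integer), which would replace the groupby.apply(replace_tail, ...) step. Untested, and it assumes N is positive and smaller than every group's size:

N = abs(shiftBy)                            # number of rows to blank at the end of each group

sizes = df.groupby(level=0).size()
ends = sizes.cumsum().to_numpy()            # one-past-the-end position of each group
tail_starts = ends - N                      # first invalid position in each group

# expand each (tail_start, end) pair into the positions it covers
positions = np.concatenate([np.arange(s, e) for s, e in zip(tail_starts, ends)])
df.iloc[positions, df.columns.get_loc('tmpShift')] = np.nan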