pandas multiindex shift on filtered values - python

I want to get the time differences between rows of interest.
rng = pd.date_range('1/1/2000', periods=6, freq='D')
d = pd.DataFrame({'sid': ['a']*3 + ['b']*3,
                  'src': ['m']*3 + ['t']*3,
                  'alert_v': [1, 0, 0, 0, 1, 1]}, index=rng)
I want to get the time difference between rows where alert_v == 1.
I've tried shifting, but are there other ways to take the difference between two rows in a column?
I have tried simple lambdas and more complex .loc approaches:
def deltat(g):
    g['d1'] = g[g['alert_v'] == 1]['timeindex'].shift(1)
    g['d0'] = g[g['alert_v'] == 1]['timeindex']
    g['td'] = g['d1'] - g['d0']
    return g

d = d.groupby(['src', 'sid']).apply(lambda x: deltat(x))
def indx(g):
    d0 = g.loc[g['alert_v'] == 1]
    d1[0] = d0[0]
    d1.append(d0[:-1])
    g['tavg'] = g.apply(g.ix[d1, 'timeindex'] - g.ix[d0, 'timeindex'])
    return g
After trying a bunch of approaches, I can't seem to get past either the multi-group or the filtering issues.
What's the best way to do this?
Edit:
diff(1) produces this error:
raise TypeError('incompatible index of inserted column '
TypeError: incompatible index of inserted column with frame index
while shift(1) produces this error:
ZeroDivisionError: integer division or modulo by zero
An attempt to clean the data did not help:
if any(pd.isnull(g['timeindex'])):
    print '## timeindex not null'
    g['timeindex'] = g['timeindex'].fillna(method='ffill')

For the multi-index group / select rows / diff / insert-new-column paradigm, this is how I got it to work with clean output.
Some groups have 0 relevant rows, which throws an exception.
shift throws a KeyError, so I am just sticking with diff().
# -- get the interarrival time
def deltat(g):
    try:
        g['tavg'] = g[g['alert_v'] == 1]['timeindex'].diff(1)
        return g
    except:
        pass

d.sort_index(axis=0, inplace=True)
d = d.groupby(['source', 'subject_id', 'alert_t', 'variable'],
              as_index=False, group_keys=False).apply(lambda x: deltat(x))
print d[d['alert_v'] == 1][['timeindex', 'tavg']]
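For reference, a minimal sketch of the same filter-then-diff idea on the toy frame above; the alerts/ts/td names are mine, not from the question, and it assumes the event time lives in the DatetimeIndex rather than in a separate timeindex column.
import pandas as pd

rng = pd.date_range('1/1/2000', periods=6, freq='D')
d = pd.DataFrame({'sid': ['a']*3 + ['b']*3,
                  'src': ['m']*3 + ['t']*3,
                  'alert_v': [1, 0, 0, 0, 1, 1]}, index=rng)

# keep only the alert rows, then diff the timestamps within each (src, sid) group
alerts = d[d['alert_v'] == 1].copy()
alerts['ts'] = alerts.index                      # expose the datetime index as a column
alerts['td'] = alerts.groupby(['src', 'sid'])['ts'].diff()
print(alerts[['src', 'sid', 'td']])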

Related

Dividing each column in a pandas df by a value from another df

I have a DataFrame of size (44, 44) and another one of size (44,).
I need to divide each item in a column 'EOFx' by the number in column 'PCx'
(e.g. all values in 'EOF1' by 'PC1').
I've been trying string and numeric loops, but either nothing works at all (I get an error) or I get NaNs.
Last thing I tried was
for k in eof_df.keys():
    for m in pc_df.keys():
        eof_df[k].divide(pc_df[m])
The desired end result is a modified eof_df.
What did work for one column outside the loop is this:
eof_df.iloc[:, 0].divide(std_df.iloc[0]).head()
Thank you!
upd1. In response to MoRe:
for eof_df it will be:
{'EOF1': {'8410140.nc': -0.09481700372712784,
'8418150.nc': -0.11842440098461708,
'8443970.nc': -0.1275311990493338,
'8447930.nc': -0.1321116945944401,
'8449130.nc': -0.11649753033608201,
'8452660.nc': -0.14776686151828214,
'8454000.nc': -0.1451132595405897,
'8461490.nc': -0.17032364516557338,
'8467150.nc': -0.20725618455428937,
'8518750.nc': -0.2249648853806308},
'EOF2': {'8410140.nc': 0.051213689088367806,
'8418150.nc': 0.0858110390036938,
'8443970.nc': 0.09029173023479754,
'8447930.nc': 0.05526955432871537,
'8449130.nc': 0.05136680082838883,
'8452660.nc': 0.06105351220962777,
'8454000.nc': 0.052112043784544135,
'8461490.nc': 0.08652511173850089,
'8467150.nc': 0.1137754089944319,
'8518750.nc': 0.10461193696203},
and it goes to EOF44.
For pc_df it will be
{'PC1': 0.5734671652560537,
'PC2': 0.29256502033278076,
'PC3': 0.23586098119374838,
'PC4': 0.227069130368915,
'PC5': 0.1642170373016029,
'PC6': 0.14131097046499339,
'PC7': 0.09837935104899741,
'PC8': 0.0869056762311067,
'PC9': 0.08183389338415169,
'PC10': 0.07467191608481094}
output = pd.DataFrame(index=eof_df.index, data=eof_df.values / pc_df.values)
output.columns = eof_df.columns
data = pd.DataFrame(eof_df.values.T / pc_df.values.T).T
data.columns = ["divided" + str(i + 1) for i in data.columns.to_list()]
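For comparison, a hedged sketch of the same broadcasting idea via DataFrame.div, assuming column EOFi should be divided by value PCi by position (the miniature frame here is made up, not the data above).
import pandas as pd

# made-up miniature of the question's shapes: an EOF frame and a matching PC series
eof_df = pd.DataFrame({'EOF1': [-0.094, -0.118], 'EOF2': [0.051, 0.085]},
                      index=['8410140.nc', '8418150.nc'])
pc_df = pd.Series({'PC1': 0.5734, 'PC2': 0.2925})

# positional pairing: EOF1 / PC1, EOF2 / PC2, ... applied to every row
divided = eof_df.div(pc_df.values, axis=1)
divided.columns = ['divided' + str(i + 1) for i in range(divided.shape[1])]
print(divided)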

Counting the repeated values in one column based on another column

Using pandas, I am dealing with the following CSV data:
f,f,f,f,f,t,f,f,f,t,f,t,g,f,n,f,f,t,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,t,t,nowin
t,f,f,f,f,f,f,f,f,f,t,f,g,f,b,f,f,t,f,f,f,f,f,t,f,t,f,f,f,f,f,f,f,t,f,n,won
t,f,f,f,t,f,f,f,t,f,t,f,g,f,b,f,f,t,f,f,f,t,f,t,f,t,f,f,f,f,f,f,f,t,f,n,won
f,f,f,f,f,f,f,f,f,f,t,f,g,f,b,f,f,t,f,f,f,f,f,t,f,t,f,f,f,f,f,f,f,t,f,n,nowin
t,f,f,f,t,f,f,f,t,f,t,f,g,f,b,f,f,t,f,f,f,t,f,t,f,t,f,f,f,f,f,f,f,t,f,n,won
f,f,f,f,f,f,f,f,f,f,t,f,g,f,b,f,f,t,f,f,f,f,f,t,f,t,f,f,f,f,f,f,f,t,f,n,win
For this part of the raw data, I was trying to return something like:
Column1_name -- t -- count of nowin = 0
Column1_name -- t -- count of won = 3
Column1_name -- f -- count of nowin = 2
Column1_name -- f -- count of win = 1
Based on this idea, get dataframe row count based on conditions, I was thinking of doing something like this:
print(df[df.target == 'won'].count())
However, this always returns the same count of "won" rows based only on the last column, without taking into consideration whether the first column is an "f" or a "t". In other words, I was hoping to use something from the pandas DataFrame API that would reproduce the idea of a SQL "group by", grouping on, for example, the 1st and the last column.
Should I keep pursuing this idea or should I simply start using for loops?
If you need it, here is the rest of my code:
import pandas as pd
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.data"
df = pd.read_csv(url,names=[
'bkblk','bknwy','bkon8','bkona','bkspr','bkxbq','bkxcr','bkxwp','blxwp','bxqsq','cntxt','dsopp','dwipd',
'hdchk','katri','mulch','qxmsq','r2ar8','reskd','reskr','rimmx','rkxwp','rxmsq','simpl','skach','skewr',
'skrxp','spcop','stlmt','thrsk','wkcti','wkna8','wknck','wkovl','wkpos','wtoeg','target'
])
features = ['bkblk','bknwy','bkon8','bkona','bkspr','bkxbq','bkxcr','bkxwp','blxwp','bxqsq','cntxt','dsopp','dwipd',
'hdchk','katri','mulch','qxmsq','r2ar8','reskd','reskr','rimmx','rkxwp','rxmsq','simpl','skach','skewr',
'skrxp','spcop','stlmt','thrsk','wkcti','wkna8','wknck','wkovl','wkpos','wtoeg','target']
# number of lines
#tot_of_records = np.size(my_data,0)
#tot_of_records = np.unique(my_data[:,1])
#for item in my_data:
# item[:,0]
num_of_won=0
num_of_nowin=0
for item in df.target:
    if item == 'won':
        num_of_won = num_of_won + 1
    else:
        num_of_nowin = num_of_nowin + 1
print(num_of_won)
print(num_of_nowin)
print(df[df.target == 'won'].count())
#print(df[:1])
#print(df.bkblk.to_string(index=False))
#print(df.target.unique())
#ini_entropy = (() + ())
This could work -
outdf = df.apply(lambda x: pd.crosstab(index=df.target,columns=x).to_dict())
Basically we go over each feature column and build a crosstab against the target column.
Hope this helps! :)
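To make the crosstab idea concrete, here is a small sketch on a made-up frame (not the real chess data) showing the per-feature-value, per-target counts.
import pandas as pd

# made-up stand-in for one feature column plus the target
df = pd.DataFrame({'bkblk':  ['f', 't', 't', 'f', 't', 'f'],
                   'target': ['nowin', 'won', 'won', 'nowin', 'won', 'win']})

# rows are feature values, columns are target outcomes, cells are counts
print(pd.crosstab(df['bkblk'], df['target']))

# the equivalent "group by" formulation
print(df.groupby(['bkblk', 'target']).size())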

Conditional sum in Python between multiple columns

I have the following script, from a larger analysis of securities data,
returns_columns = []
df_merged[ticker + '_returns'] = df_merged[ticker + '_close'].pct_change(periods=1)
returns_columns.append(ticker + '_returns')
df_merged['applicable_returns_sum'] = (df_merged[returns_columns] > df_merged['return_threshold']).sum(axis=1)
'return_threshold' is a complete series of float numbers.
I've been able to successfully sum each row across the returns_columns, but cannot figure out how to conditionally sum only the numbers in the returns_columns that are greater than the 'return_threshold' in that row.
This seems similar to the problem shown here, Python Pandas counting and summing specific conditions, but I'm trying to sum based on a condition that changes from row to row.
Any help would be much appreciated, thanks as always!
EDIT: ANOTHER APPROACH
This is another approach I tried. The script below has an error associated with the ticker argument, even though I think it's necessary, and produces an error:
def compute_applicable_returns(row, ticker):
    if row[ticker + '_returns'] >= row['top_return']:
        return row[ticker + '_returns']
    else:
        return 0

df_merged['applicable_top_returns'] = df_merged[returns_columns].apply(compute_applicable_returns, axis=1)
The [] operator for a dataframe should allow you to filter by an expression df > threshold and return a dataframe. You can then call .sum() on this df.
df[df > threshold].sum()
The question was eventually answered like this:
def compute_applicable_returns(row, ticker):
    if row[ticker + '_returns'] >= row['return_threshold']:
        return row[ticker + '_returns']
    else:
        return 0

for ticker in tickers:
    df_merged[ticker + '_applicable_returns'] = df_merged.apply(compute_applicable_returns, args=(ticker,), axis=1)
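A vectorised alternative, as a sketch under the assumption that the goal is to sum only the returns that beat that row's threshold; the ticker names and values below are placeholders.
import pandas as pd

# placeholder frame: two return columns and a per-row threshold
df_merged = pd.DataFrame({'AAA_returns': [0.010, -0.020, 0.030],
                          'BBB_returns': [0.000, 0.050, -0.010],
                          'return_threshold': [0.005, 0.000, 0.020]})
returns_columns = ['AAA_returns', 'BBB_returns']

# compare every return column against its own row's threshold...
above = df_merged[returns_columns].ge(df_merged['return_threshold'], axis=0)
# ...then zero out everything below the threshold and sum across columns
df_merged['applicable_returns_sum'] = df_merged[returns_columns].where(above, 0.0).sum(axis=1)
print(df_merged)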

How to get the row index for pandas apply function on a Series

I have a DataFrame that I split into column Series (col_series in the snippet below) and I apply tests to each value in each Series. I would like to report which row in the Series is affected when I detect an error.
...
col_series.apply(self.testdatelimits,
                 args=(datetime.strptime('2018-01-01', '%Y-%m-%d'), key))

def testlimits(self, row_id, x, lowerlimit, col_name):
    low_error = None
    d = float(x)
    if lowerlimit != 'NA' and d < float(lowerlimit):
        low_error = 'Following record has column ' + col_name + ' lower than range check'
    if low_error is not None:
        self.set_error(col_index, row_id, low_error)
Of course the above fails because x is a str and does not have the name property. I am thinking that maybe I can pass in the row index in the Series, but am not clear on how to do that?
Edit:
I switched to building the arguments as columns and using list(map(...)) to solve this issue rather than the pandas apply. It is significantly faster too:
col_series = col_series.apply(pd.to_datetime, errors='ignore')
dfwithrow = pd.DataFrame(col_series)
dfwithrow.insert(0, 'rowid', range(0, len(dfwithrow)))
dfwithrow['lowerlimit'] = lowlimit
dfwithrow['colname'] = 'fred'

list(map(self.testdatelimits, dfwithrow['rowid'], dfwithrow[colvalue[0]],
         dfwithrow['lowerlimit'], dfwithrow['colname']))
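If you stay with a plain Series instead of the helper DataFrame, one hedged option is Series.items(), which yields (index label, value) pairs that a plain apply hides; the names and data below are illustrative only.
import pandas as pd

col_series = pd.Series([1.0, 5.0, -3.0], index=[101, 102, 103])  # illustrative data
lowerlimit = 0.0

# items() exposes the row label alongside each value
bad_rows = [(row_id, x) for row_id, x in col_series.items()
            if float(x) < float(lowerlimit)]
print(bad_rows)  # [(103, -3.0)]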

pandas: setting last N rows of multi-index to NaN for speeding up groupby with shift

I am trying to speed up my groupby.apply + shift and,
thanks to this previous question and answer, How to speed up Pandas multilevel dataframe shift by group?, I can prove that it does indeed speed things up when you have many groups.
From that question I now have the following code to set the first entry of each multi-index group to NaN, so that I can do my shift globally rather than per group:
df.iloc[df.groupby(level=0).size().cumsum()[:-1]] = np.nan
But I want to look forward, not backwards, and need to do calculations across N rows. So I am trying to use some similar code to set the last N entries of each group to NaN, but obviously I am missing some important indexing knowledge because I just can't figure it out.
I figure I want to convert this so that every entry is a range rather than a single integer. How would I do that?
# the start of each group, ignoring the first entry
df.groupby(level=0).size().cumsum()[1:]
Test setup (for backwards shift) if you want to try it:
length = 5
groups = 3
rng1 = pd.date_range('1/1/1990', periods=length, freq='D')
frames = []
for x in xrange(0, groups):
    tmpdf = pd.DataFrame({'date': rng1,
                          'category': int(10000000*abs(np.random.randn())),
                          'colA': np.random.randn(length),
                          'colB': np.random.randn(length)})
    frames.append(tmpdf)
df = pd.concat(frames)
df.sort(columns=['category', 'date'], inplace=True)
df.set_index(['category', 'date'], inplace=True, drop=True)
df['tmpShift'] = df['colB'].shift(1)
df.iloc[df.groupby(level=0).size().cumsum()[:-1]] = np.nan
# Yay this is so much faster.
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop('tmpShift', 1, inplace=True)
Thanks!
I ended up doing it using a groupby apply as follows (and coded to work forwards or backwards):
def replace_tail(grp, col, N, value):
    if N > 0:
        grp[col][:N] = value
    else:
        grp[col][N:] = value
    return grp

df = df.groupby(level=0).apply(replace_tail, 'tmpShift', 2, np.nan)
So the final code is:
def replace_tail(grp, col, N, value):
    if N > 0:
        grp[col][:N] = value
    else:
        grp[col][N:] = value
    return grp

length = 5
groups = 3
rng1 = pd.date_range('1/1/1990', periods=length, freq='D')
frames = []
for x in xrange(0, groups):
    tmpdf = pd.DataFrame({'date': rng1,
                          'category': int(10000000*abs(np.random.randn())),
                          'colA': np.random.randn(length),
                          'colB': np.random.randn(length)})
    frames.append(tmpdf)
df = pd.concat(frames)
df.sort(columns=['category', 'date'], inplace=True)
df.set_index(['category', 'date'], inplace=True, drop=True)

shiftBy = -1
df['tmpShift'] = df['colB'].shift(shiftBy)
df = df.groupby(level=0).apply(replace_tail, 'tmpShift', shiftBy, np.nan)
# Yay this is so much faster.
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop('tmpShift', 1, inplace=True)
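For the original question of NaN-ing the last N rows of every group without a groupby.apply, here is a sketch of the "turn each cumulative size into a range" idea; it assumes df is the sorted multi-index frame from the setup above and that only the 'tmpShift' column needs blanking.
import numpy as np

N = 2  # how many trailing rows of each group to blank out
sizes = df.groupby(level=0).size().values
ends = sizes.cumsum()                          # one-past-the-end row position of each group
starts = np.maximum(ends - N, ends - sizes)    # never reach back past a group's first row
last_n = np.concatenate([np.arange(s, e) for s, e in zip(starts, ends)])
df.iloc[last_n, df.columns.get_loc('tmpShift')] = np.nan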
