Take the following dataframe:
import pandas as pd
df = pd.DataFrame({'group_name': ['A','A','A','B','B','B'],
'timestamp': [4,6,1000,5,8,100],
'condition': [True,True,False,True,False,True]})
I want to add two columns:
The row's order within its group
The rolling sum of the condition column within each group
I know I can do it with a custom apply, but I'm wondering if anyone has any fun ideas? (Also this is slow when there are many groups.) Here's one solution:
def range_within_group(input_df):
    df_to_return = input_df.copy()
    df_to_return = df_to_return.sort_values('timestamp')
    df_to_return['order_within_group'] = range(len(df_to_return))
    df_to_return['rolling_sum_of_condition'] = df_to_return.condition.cumsum()
    return df_to_return
df.groupby('group_name').apply(range_within_group).reset_index(drop=True)
GroupBy.cumcount does:
Number each item in each group from 0 to the length of that group - 1.
so simply:
>>> gr = df.sort_values('timestamp').groupby('group_name')
>>> df['order_within_group'] = gr.cumcount()
>>> df['rolling_sum_of_condition'] = gr['condition'].cumsum()
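Note that cumcount and cumsum here return Series aligned on the original index of df, so the assignments land on the right rows even though df itself is never re-sorted. A minimal sketch to check this, using the sample frame from the question (the expected values in the comments are my own working, not from the original post):
import pandas as pd

df = pd.DataFrame({'group_name': ['A','A','A','B','B','B'],
                   'timestamp': [4,6,1000,5,8,100],
                   'condition': [True,True,False,True,False,True]})

# group on the timestamp-sorted frame; the results keep df's original index
gr = df.sort_values('timestamp').groupby('group_name')
df['order_within_group'] = gr.cumcount()                    # [0, 1, 2, 0, 1, 2]
df['rolling_sum_of_condition'] = gr['condition'].cumsum()   # [1, 2, 2, 1, 1, 2]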
I have a dataframe of daily sales:
import pandas as pd
date = ['28-01-2017','29-01-2017','30-01-2017','31-01-2017','01-02-2017','02-02-2017']
sales = [1,2,3,4,1,2]
ym = [201701,201701,201701,201701,201702,201702]
prev_1_ym = [201612,201612,201612,201612,201701,201701]
prev_2_ym = [201611,201611,201611,201611,201612,201612]
df_test = pd.DataFrame({'date': date,'ym':ym,'prev_1_ym':prev_1_ym,'prev_2_ym':prev_2_ym,'sales':sales})
df_test['date'] = pd.to_datetime(df_test['date'],format = '%d-%m-%Y')
I am trying to find the total sales in the previous 1m, previous 2m, etc.
My current approach is to use a list comprehension:
df_test['prev_1m_sales'] = [ sum(df_test.loc[df_test['ym'] == x].sales) for x in df_test['prev_1_ym'] ]
However, this proves to be very slow.
Is there a way to speed it up by using .groupby()?
You can use the date column to group your data; first change its data type to pandas Timestamps:
df_test['date'] = pd.to_datetime(df_test['date'], format='%d-%m-%Y')
Then you can use it directly in the grouping, for example:
df_test.groupby(df_test.date.dt.month).sales.sum().cumsum()
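That said, the per-row filtering in the question's list comprehension can be avoided entirely by building the monthly totals once and mapping each row's previous-month key onto them. A hedged sketch (the map-based approach is my suggestion, not part of the answer above; column names follow the question):
# total sales per year-month, computed once
monthly_totals = df_test.groupby('ym')['sales'].sum()

# map each row's previous-month key onto that lookup; months not present
# in the data (e.g. 201612) come back as NaN, handled here with fillna(0)
df_test['prev_1m_sales'] = df_test['prev_1_ym'].map(monthly_totals).fillna(0)
df_test['prev_2m_sales'] = df_test['prev_2_ym'].map(monthly_totals).fillna(0)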
I have two dataframes measuring two properties from an instrument, where the depths are offset by a certain dz. Note that the example below is extremely simplified.
df1 = pd.DataFrame({'depth_1': [0.936250, 0.959990, 0.978864, 0.991288, 1.023876, 1.045801, 1.062768, 1.077090, 1.101248, 1.129754, 1.147458, 1.160193, 1.191206, 1.218595, 1.256964] })
df2 = pd.DataFrame({'depth_2': [0.620250, 0.643990, 0.662864, 0.675288, 0.707876, 0.729801, 0.746768, 0.761090, 0.785248, 0.813754, 0.831458, 0.844193, 0.875206, 0.902595, 0.940964 ] })
How do I get the index of the df2.depth_2 value that is closest to the first element of df1.depth_1?
Using reindex with method='nearest':
df2.reset_index().set_index('depth_2').reindex(df1.depth_1,method = 'nearest')['index'].unique()
Out[265]: array([14], dtype=int64)
You can use the pandas merge_asof function (you will need to sort your data first if it isn't already sorted in real life):
df1 = df1.sort_values(by='depth_1')
df2 = df2.sort_values(by='depth_2')
pd.merge_asof(df1, df2.reset_index(), left_on="depth_1", right_on="depth_2", direction="nearest")
If you just want that for the first value in df1, you can do the join on the top row only:
df2 = df2.sort_values(by='depth_2')
pd.merge_asof(df1.head(1), df2.reset_index(), left_on="depth_1", right_on="depth_2", direction="nearest")
Get the absolute difference between all elements of df2 and the first element of df1, then take the index of the minimum:
import pandas as pd
import numpy as np
def get_closest(df1, df2, idx):
    abs_diff = np.array([abs(df1['depth_1'][idx] - item) for item in df2['depth_2']])
    return abs_diff.argmin()
df1 = pd.DataFrame({'depth_1': [0.936250, 0.959990, 0.978864, 0.991288, 1.023876, 1.045801, 1.062768, 1.077090, 1.101248, 1.129754, 1.147458, 1.160193, 1.191206, 1.218595, 1.256964] })
df2 = pd.DataFrame({'depth_2': [0.620250, 0.643990, 0.662864, 0.675288, 0.707876, 0.729801, 0.746768, 0.761090, 0.785248, 0.813754, 0.831458, 0.844193, 0.875206, 0.902595, 0.940964 ] })
get_closest(df1,df2,0)
Output:
14
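The same idea can be written as a vectorized one-liner (a sketch of my own, not part of the answer above): subtract the scalar from the whole column, take the absolute value, and let idxmin return the label of the smallest difference.
closest_idx = (df2['depth_2'] - df1['depth_1'].iloc[0]).abs().idxmin()
print(closest_idx)  # 14 for the data above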
I have the following example of my dataframe:
df = pd.DataFrame({'first_date': ['01-07-2017', '01-07-2017', '01-08-2017'],
'end_date': ['01-08-2017', '01-08-2017', '15-08-2017'],
'second_date': ['01-09-2017', '01-08-2017', '15-07-2017'],
'cust_num': [1, 2, 1],
'Title': ['philips', 'samsung', 'philips']})
I want to compare each row with the other rows and check whether:
The cust_num is equal for both rows
The Title is equal for both rows in the dataframe
The second_date in one row is <= the end_date in another row
If all these requirements are met, the value True should be written to a new column in the original row.
Because I'm working with a big dataset I'm looking for an efficient way to do this.
In this case only the first record should get a True value.
I have looked at apply with a lambda and the groupby function in Python, but couldn't find a way to make these work.
Try this (off the top of my head I cannot come up with a faster method):
import pandas as pd
import numpy as np
df["second_date"]=pd.to_datetime(df["second_date"], format='%d-%m-%Y')
df["end_date"]=pd.to_datetime(df["end_date"], format='%d-%m-%Y')
df["new col"] = False
for cust in set(df["cust_num"]):
    indices = df.index[df["cust_num"] == cust].tolist()
    if len(indices) > 1:
        sub_df = df.loc[indices]
        for title in set(df.loc[indices]["Title"]):
            indices_title = sub_df.index[sub_df["Title"] == title]
            if len(indices_title) > 1:
                for i in indices_title:
                    if sub_df.loc[i, "second_date"] <= sub_df.loc[i, "end_date"]:
                        df.loc[i, "new col"] = True
                        break
First you need to make all the date columns comparable with each other by casting them to datetime. Then create the additional column you want.
Now create a set of all unique customer numbers and iterate through it. For each customer number, get a list of all row indices with that customer number. If this list is longer than 1, there are several rows with the same customer number, so build a sub-DataFrame containing just those rows. Then iterate through the set of titles in that sub-DataFrame. For each title, check whether it occurs more than once (len > 1). If it does, iterate through those rows and write True into the additional column of the first row where the date condition is met.
This should work. Also, while reading the comments, I am assuming that all cust_num values are unique.
import pandas as pd
df = pd.DataFrame({'first_date': ['01-07-2017', '01-07-2017', '01-08-2017'],
'end_date': ['01-08-2017', '01-08-2017', '15-08-2017'],
'second_date': ['01-09-2017', '01-08-2017', '15-07-2017'],
'cust_num': [1, 2, 1],
'Title': ['philips', 'samsung', 'philips']})
df["second_date"]=pd.to_datetime(df["second_date"])
df["end_date"]=pd.to_datetime(df["end_date"])
df['Value'] = False
for i in range(len(df)):
    for j in range(len(df)):
        if (i != j):
            if (df.loc[j,'end_date'] >= df.loc[i,'second_date']) == True:
                if (df.loc[i,'cust_num'] == df.loc[j,'cust_num']) == True:
                    if (df.loc[i,'Title'] == df.loc[j,'Title']) == True:
                        df.loc[i,'Value'] = True
Let me know if this code works, and report any errors!
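Both answers above loop in Python over row pairs, which scales poorly on a big dataset. Here is a vectorized sketch of the same comparison via a self-merge on cust_num and Title (my own suggestion, not from either answer; it flags a row when some other matching row's second_date falls on or before that row's end_date, which reproduces the expected output of only the first record being True):
import pandas as pd

df = pd.DataFrame({'first_date': ['01-07-2017', '01-07-2017', '01-08-2017'],
                   'end_date': ['01-08-2017', '01-08-2017', '15-08-2017'],
                   'second_date': ['01-09-2017', '01-08-2017', '15-07-2017'],
                   'cust_num': [1, 2, 1],
                   'Title': ['philips', 'samsung', 'philips']})
df['second_date'] = pd.to_datetime(df['second_date'], format='%d-%m-%Y')
df['end_date'] = pd.to_datetime(df['end_date'], format='%d-%m-%Y')

# pair every row with every other row sharing cust_num and Title
pairs = df.reset_index().merge(df.reset_index(), on=['cust_num', 'Title'], suffixes=('', '_other'))
pairs = pairs[pairs['index'] != pairs['index_other']]  # drop self-pairs

# a row is flagged when some matching row's second_date <= its own end_date
flagged = pairs.loc[pairs['second_date_other'] <= pairs['end_date'], 'index'].unique()
df['Value'] = df.index.isin(flagged)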
I am trying to speed up my groupby.apply + shift, and
thanks to this previous question and answer: How to speed up Pandas multilevel dataframe shift by group? I can confirm that it does indeed speed things up when you have many groups.
From that question I now have the following code to set the first entry in each multi-index to NaN, so that I can do my shift globally rather than per group.
df.iloc[df.groupby(level=0).size().cumsum()[:-1]] = np.nan
but I want to look forward, not backwards, and need to do calculations across N rows. So I am trying to use some similar code to set the last N entries to NaN, but obviously I am missing some important indexing knowledge as I just can't figure it out.
I figure I want to convert this so that every entry is a range rather than a single integer. How would I do that?
# the start of each group, ignoring the first entry
df.groupby(level=0).size().cumsum()[1:]
Test setup (for backwards shift) if you want to try it:
import pandas as pd
import numpy as np

length = 5
groups = 3
rng1 = pd.date_range('1/1/1990', periods=length, freq='D')
frames = []
for x in range(groups):
    tmpdf = pd.DataFrame({'date':rng1,'category':int(10000000*abs(np.random.randn())),'colA':np.random.randn(length),'colB':np.random.randn(length)})
    frames.append(tmpdf)
df = pd.concat(frames)
df.sort_values(['category','date'], inplace=True)
df.set_index(['category','date'], inplace=True, drop=True)
df['tmpShift'] = df['colB'].shift(1)
df.iloc[df.groupby(level=0).size().cumsum()[:-1]] = np.nan
# Yay this is so much faster.
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop(columns='tmpShift', inplace=True)
Thanks!
I ended up doing it using a groupby apply as follows (and coded to work forwards or backwards):
def replace_tail(grp, col, N, value):
    if (N > 0):
        grp[col][:N] = value
    else:
        grp[col][N:] = value
    return grp

df = df.groupby(level=0).apply(replace_tail, 'tmpShift', 2, np.nan)
So the final code is:
import pandas as pd
import numpy as np

def replace_tail(grp, col, N, value):
    if (N > 0):
        grp[col][:N] = value
    else:
        grp[col][N:] = value
    return grp

length = 5
groups = 3
rng1 = pd.date_range('1/1/1990', periods=length, freq='D')
frames = []
for x in range(groups):
    tmpdf = pd.DataFrame({'date':rng1,'category':int(10000000*abs(np.random.randn())),'colA':np.random.randn(length),'colB':np.random.randn(length)})
    frames.append(tmpdf)
df = pd.concat(frames)
df.sort_values(['category','date'], inplace=True)
df.set_index(['category','date'], inplace=True, drop=True)
shiftBy=-1
df['tmpShift'] = df['colB'].shift(shiftBy)
df = df.groupby(level=0).apply(replace_tail,'tmpShift',shiftBy,np.nan)
# Yay this is so much faster.
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop(columns='tmpShift', inplace=True)
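For completeness, the position-based version the question originally asked about (turning each cumsum entry into a range covering the last N rows of its group) can also be written without groupby.apply. The following is only a sketch of my own, assuming it replaces the replace_tail step above and runs while tmpShift still exists:
N = 1  # trailing rows per group to blank; matches shiftBy=-1 above
sizes = df.groupby(level=0).size()
ends = sizes.cumsum().to_numpy()                         # one past each group's last row
starts = np.maximum(ends - N, ends - sizes.to_numpy())   # never reach before a group's start
positions = np.concatenate([np.arange(s, e) for s, e in zip(starts, ends)])
df.iloc[positions, df.columns.get_loc('tmpShift')] = np.nan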
I have a dataframe and I use groupby to group it by Season. One of the columns of the original df is named Check and consists of True and False values. My aim is to count the True values for each group and put the result in the new dataframe.
import pandas as pd
df = ....
df['Check'] = df['Actual'] == df['Prediction']
grouped_per_year = df.groupby('Season')
df_2= pd.DataFrame()
df_2['Seasons'] = total_matches_per_year.keys()
df_2['Successes'] = ''
df_2['Total_Matches'] = list(grouped_per_year.size())
df_2['SR'] = df_2['Successes'] / df_2['Total_Matches']
df_2['Money_In'] = list(grouped_per_year['Money_In'].apply(sum))
df_2['Profit (%)'] = (df_profit['Money_In'] - df_profit['Total_Matches']) / df_profit['Total_Matches'] * 100.
I have tried:
successes_per_year = grouped_per_year['Pred_Check'].value_counts()
but I don't know how to get only the True count.
For counting True, you can also use sum (as True=1 and False=0 when doing a numerical operation):
grouped_per_year['Pred_Check'].sum()
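To wire that back into df_2 in the setup above, a minimal sketch (assuming Pred_Check refers to the same boolean column created as Check, and that df_2's rows follow the groupby's Season order, as they do when both come from grouped_per_year):
df_2['Successes'] = grouped_per_year['Check'].sum().values  # count of True per Season
df_2['SR'] = df_2['Successes'] / df_2['Total_Matches']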