how to iterate with groupby in pandas - python

I have a function minmax that iterates over a dataframe of transactions. I want to compute a set of values per account id; accountstart and accountend are the two calculated fields. The intention is to make these calculations by month and account.
So when I do:
df1 = df.loc[df['accountNo']==10]
minmax(df1) it works.
What I can't do is:
df.groupby('accountNo').apply(minmax)
When I do:
grouped = df.groupby('accountNo')
for i,j in grouped:
print(minmax(j))
It does the computation and prints the result, but without print it complains about KeyError: -1, which appears to come from itertools. So awkward.
How to tackle that in Pandas?
def minmax(x):
    """Build a month-by-month running balance for one account.

    Parameters
    ----------
    x : pandas.DataFrame
        Transactions with the columns ``accountNo``, ``monthStart``,
        ``cumsum`` and ``startbalance``. Meant to hold the rows of a single
        account, e.g. one group of ``df.groupby('accountNo')``.

    Returns
    -------
    pandas.DataFrame
        One row per ``monthStart`` with the columns ``monthStart``,
        ``startbalance``, ``endbalance`` and ``accountNo`` (as a string).
        An empty frame with those columns is returned for empty input.
    """
    columns = ['monthStart', 'startbalance', 'endbalance', 'accountNo']
    if x.empty:
        return pd.DataFrame(columns=columns)

    # Net movement per month, back as a regular column layout.
    monthly = x.groupby('monthStart')['cumsum'].sum().reset_index()

    # The first month opens at the largest recorded start balance; every
    # later month opens at the previous month's closing balance. Carrying
    # the balance in a variable (instead of looking up row idx-1) avoids the
    # KeyError: -1 that occurred when the opening balance itself was 0.
    start_balances = []
    end_balances = []
    balance = x['startbalance'].max()
    for movement in monthly['cumsum']:
        start_balances.append(balance)
        balance = movement + balance
        end_balances.append(balance)
    monthly['startbalance'] = start_balances
    monthly['endbalance'] = end_balances

    # Take the account number straight from the data. Stripping characters
    # from repr(set(...)) also removed trailing zeros ("{10.0}" -> "1").
    account = x['accountNo'].iloc[0]
    if isinstance(account, float) and account.is_integer():
        account = int(account)
    monthly['accountNo'] = str(account)

    return monthly[columns]

.apply() takes pandas Series as inputs, not DataFrames. Using .agg, as in df.groupby('accountNo').agg(yourfunction) should yield better results. Be sure to check out the documentation for details on implementation.

Related

How to parallelize for loops in Pyspark?

I am trying to convert some Pandas code to Pyspark, which will run on an EMR cluster. This is my first time working with Pyspark, and I am not sure what is the optimal way to code the objective. The job is trying to achieve the following:
There is a base dataframe with schema like so:
institution_id, user_id, st_date
For every unique institution_id, get all users
For every user for the institution_id, take all unique st_dates in sorted order, get the difference between pairs of consecutive st_dates and output a dictionary
Here is what the code looks like as of now:
def process_user(current_user, inst_cycles):
    """Count the gaps between a user's consecutive distinct ``st_date`` values.

    For each pair of neighbouring dates (in sorted order) the counter at
    ``inst_cycles[month][gap_in_days]`` is incremented, where ``month`` is
    the month of the later date. ``inst_cycles`` is updated in place and
    also returned.
    """
    ordered_stamps = np.sort(current_user.st_date.unique())
    if ordered_stamps.size <= 1:
        # Fewer than two distinct dates: there is no gap to record.
        return inst_cycles

    days = [pd.to_datetime(stamp).date() for stamp in ordered_stamps]
    for earlier, later in zip(days, days[1:]):
        gap_in_days = (later - earlier).days
        inst_cycles[later.month][gap_in_days] += 1
    return inst_cycles
def get_inst_monthly_distribution(current_inst):
    """Accumulate the date-gap counts of every user of one institution.

    Collects the distinct ``user_id`` values of ``current_inst``, filters
    the frame down to each user in turn and feeds that subset to
    ``process_user``. Returns the nested ``month -> gap days -> count``
    mapping.
    """
    inst_cycles = defaultdict(lambda: defaultdict(int))
    distinct_users = current_inst.select('user_id').distinct().collect()
    for user_row in distinct_users:
        # Each collected row carries the user id in its first field.
        rows_of_user = current_inst.filter(current_inst.user_id == user_row[0])
        inst_cycles = process_user(rows_of_user, inst_cycles)
    return inst_cycles
def get_monthly_distributions(inst_ids, df):
    """Map every key of ``inst_ids`` to that institution's monthly distribution.

    ``df`` is filtered on ``inst_id`` for each key and the subset is passed
    to ``get_inst_monthly_distribution``.
    """
    return {
        inst_key: get_inst_monthly_distribution(df.filter(df.inst_id == inst_key))
        for inst_key in inst_ids.keys()
    }
def execute():
    """Load the data and compute the monthly distributions per institution."""
    spark_df = load_data()  # df is a Spark dataframe
    institutions = get_inst_names(spark_df)
    get_monthly_distributions(institutions, spark_df)
I think this code is not taking advantage of the parallelism of Spark, and can be coded in a much better way without the for loops. Is that correct?

How to loop over multiple subsets, perform operations and take the results to the original dataframe in python?

I have a dataframe with millions of rows, and about 100k unique ID numbers. I want to perform operations per unique ID. For now I generate a subset per unique ID and perform some operations accordingly. This loop works. But how do I efficiently combine the subsets into one dataframe?
Maybe there is a more efficient way to perform operations per subset of unique IDs.
Thanks
# Per-ID loop from the question: slice df_fin by each distinct ID and fill
# the 'list_stock' cell of rows from the deque `sl`.
# NOTE(review): the indentation of this listing was lost, so the nesting of
# the statements below cannot be confirmed from the text alone. `sl` and
# `WIP` are not defined inside this snippet.
for ID in np.unique(df_fin['ID']):
# Rows of df_fin whose 'ID' equals the current ID.
ID_subset = df_fin.loc[df_fin['ID'] == ID]
for i in ID_subset.index:
# Only rows with a positive 'date_diff' push values onto the deque.
if ID_subset['date_diff'][i] > 0:
# One push per unit of 'date_diff'.
for p in range(0,ID_subset['date_diff'][i]):
# At step WIP the 'return_bin' value stored under index label i-1 is
# pushed; every other step pushes 0.
if p == WIP:
sl.appendleft(ID_subset.return_bin[i-1])
else:
sl.appendleft(0)
# Snapshot the deque as a plain list and store it in the row's cell.
lissa = list(sl)
ID_subset.at[i,'list_stock'] = lissa
# `frames` is rebound to a one-element list here rather than appended to.
frames = [ID_subset] #this does not work
final_mod = pd.concat(frames) #this also does not work
THIS IS WORKING:
I also tried with groupby.apply. See the code below.
# Per-group worker used with groupby(...).apply: derives 'date_diff' in whole
# days from the 'dates' column and fills 'list_stock' from a bounded deque.
# Modifies and returns the frame `x` it receives.
# NOTE(review): the indentation of this listing was lost, so the nesting of
# the loop body cannot be confirmed from the text alone. `DOS` and `WIP` are
# not defined inside this snippet.
def create_stocklist(x):
# Difference between each row's 'dates' value and the previous row's.
x['date_diff'] = x['dates'] - x['dates'].shift()
# The first row has no predecessor; its missing difference becomes 0.
x['date_diff'] = x['date_diff'].fillna(0)
# Express the difference as an integer number of days.
x['date_diff'] = (x['date_diff'] / np.timedelta64(1, 'D')).astype(int)
# Cast to object dtype - presumably so that a cell can hold a list; confirm.
x['list_stock'] = x['list_stock'].astype(object)
x['stock_new'] = x['stock_new'].astype(object)
# var_stock is assigned here but not read again in this function.
var_stock = DOS*[0]
# Deque holding at most DOS entries, seeded with a single 0.
sl = deque([0],maxlen=DOS)
for i in x.index:
# Only rows with a positive 'date_diff' push values onto the deque.
if x['date_diff'][i] > 0:
for p in range(0,x['date_diff'][i]):
# At step WIP the 'return_bin' value stored under index label i-1 is
# pushed; every other step pushes 0.
if p == WIP:
sl.appendleft(x.return_bin[i-1])
else:
sl.appendleft(0)
# Snapshot the deque as a plain list and store it in the row's cell.
lissa = list(sl)
x.at[i,'list_stock'] = lissa
return x
df_fin.groupby(by=['ID']).apply(create_stocklist)
An approach could be:
# Iterating a groupby yields (key, sub-frame) pairs, in that order. Grouping
# by the plain column name (not a one-element list) keeps _id a scalar.
for _id, g in df_fin.groupby('ID'):
    ...  # do stuff with g
g is a dataframe containing all rows such that df_fin['ID'] == _id

Iterating through a list of Pandas DF's to then iterate through each DF's row

This may be a slightly insane question...
I've got a single Pandas DF of articles which I have then split into multiple DF's so each DF only contains the articles from a particular year. I have then put these variables into a list called box_of_years.
# Index the articles by date and put them in chronological order.
indexed_df = article_db.set_index('date')
indexed_df = indexed_df.sort_index()

# One frame per calendar year, 2004 through 2016 inclusive, each cut from
# 1 January to 31 December of that year.
box_of_years = [
    indexed_df.truncate(before='{}-01-01'.format(year),
                        after='{}-12-31'.format(year))
    for year in range(2004, 2017)
]

# Keep the individual per-year names available as well.
(year_2004, year_2005, year_2006, year_2007,
 year_2008, year_2009, year_2010, year_2011,
 year_2012, year_2013, year_2014, year_2015,
 year_2016) = box_of_years
I've written various functions to tokenize, clean up and convert the tokens into a FreqDist object and wrapped those up into a single function called year_prep(). This works fine when I do
year_2006 = year_prep(year_2006)
...but is there a way I can iterate across every year variable, apply the function and have it transform the same variable, short of just repeating the above for every year?
I know repeating myself would be the simplest way, but not necessarily the cleanest. I may perhaps have this backwards and do the slicing later on but at that point I feel like the layers of lists will be out of hand as I'm going from a list of years to a list of years, containing a list of articles, containing a list of every word in the article.
I think you can use groupby by year with custom function:
import pandas as pd
# Sample data: 30 timestamps spaced 50 days apart, starting 2004-02-24,
# next to a running counter column 'a'.
start = pd.to_datetime('2004-02-24')
rng = pd.date_range(start=start, periods=30, freq='50D')
df = pd.DataFrame({'Date': rng, 'a': range(30)})


def f(x):
    """Print the group and return column ``a`` plus the month number of ``Date``."""
    print (x)
    # Placeholder output; in real use this would be: return year_prep(x)
    month_of_row = x['Date'].dt.month
    return x['a'] + month_of_row


# Group the rows by the calendar year of 'Date' and apply f to each group.
print (df.groupby(by=df['Date'].dt.year).apply(f))

Iteratively add columns of various length to DataFrame

I have few categorical columns (description) in my DataFrame df_churn which i'd like to convert to numerical values. And of course I'd like to create a lookup table because i will need to convert them back eventually.
The problem is that every column has a different number of categories, so appending to df_categories is not easy and I can't think of any simple way to do so.
Here is what I have so far. It stops after first column, because of the different length.
cat_clmn = ['CLI_REGION','CLI_PROVINCE','CLI_ORIGIN','cli_origin2','cli_origin3', 'ONE_PRD_TYPE_1']
df_categories = pd.DataFrame()


def categorizer(_clmn):
    """Encode the given categorical columns of ``df_churn`` as integers.

    For every column name in ``_clmn`` a ``<name>_CAT`` column with the
    integer codes is added to ``df_churn``, and the module-level lookup
    table ``df_categories`` is rebuilt with a ``<name>`` (code) and a
    ``<name>_key`` (original value) column per input column.

    The columns have different numbers of categories, so the lookup columns
    are built separately and joined side by side; shorter ones are padded
    with NaN, which turns their integer codes into floats.
    """
    global df_categories
    lookups = []
    for clmn in _clmn:
        dict_cat = {key: value for value, key in enumerate(df_churn[clmn].unique())}
        lookups.append(pd.DataFrame({
            clmn: list(dict_cat.values()),
            clmn + '_key': list(dict_cat.keys()),
        }))
        df_churn[clmn + '_CAT'] = df_churn[clmn].map(dict_cat)
    if lookups:
        # Assigning columns of unequal length to one frame fails; concat
        # along axis=1 aligns on the row index and pads instead.
        df_categories = pd.concat(lookups, axis=1)


categorizer(cat_clmn)
There is a temporary solution, but I am sure it can be done in a better way.
cat_clmn = ['CLI_REGION','CLI_PROVINCE','CLI_ORIGIN','cli_origin2','cli_origin3', 'ONE_PRD_TYPE_1']

# One separate, initially empty lookup frame per categorical column.
(df_CLI_REGION, df_CLI_PROVINCE, df_CLI_ORIGIN,
 df_cli_origin2, df_cli_origin3, df_ONE_PRD_TYPE_1) = (pd.DataFrame() for _ in cat_clmn)
df_lst = [df_CLI_REGION, df_CLI_PROVINCE, df_CLI_ORIGIN,
          df_cli_origin2, df_cli_origin3, df_ONE_PRD_TYPE_1]


def categorizer(_clmn):
    """Encode each column of ``cat_clmn`` and fill its own lookup frame.

    Pairs the module-level ``cat_clmn`` names with the frames in ``df_lst``;
    each frame receives the integer codes and the original values, and
    ``df_churn`` gains a ``<name>_CAT`` column with the codes.
    """
    for column_name, lookup_df in zip(cat_clmn, df_lst):
        codes = {}
        for code, label in enumerate(df_churn[column_name].unique()):
            codes[label] = code
        lookup_df[column_name] = codes.values()
        lookup_df[column_name + '_key'] = codes.keys()
        df_churn[column_name + '_CAT'] = df_churn[column_name].map(codes)


categorizer(cat_clmn)

pandas: setting last N rows of multi-index to Nan for speeding up groupby with shift

I am trying to speed up my groupby.apply + shift and
thanks to this previous question and answer: How to speed up Pandas multilevel dataframe shift by group? I can prove that it does indeed speed things up when you have many groups.
From that question I now have the following code to set the first entry in each multi-index to Nan. And now I can do my shift globally rather than per group.
df.iloc[df.groupby(level=0).size().cumsum()[:-1]] = np.nan
but I want to look forward, not backwards, and need to do calculations across N rows. So I am trying to use some similar code to set the last N entries to NaN, but obviously I am missing some important indexing knowledge as I just can't figure it out.
I figure I want to convert this so that every entry is a range rather than a single integer. How would I do that?
# the start of each group, ignoring the first entry
df.groupby(level=0).size().cumsum()[1:]
Test setup (for backwards shift) if you want to try it:
# Test setup: `groups` categories, each holding `length` consecutive daily
# rows with random 'colA' / 'colB' values.
length = 5
groups = 3
rng1 = pd.date_range('1/1/1990', periods=length, freq='D')
frames = []
for x in range(0, groups):
    tmpdf = pd.DataFrame({'date':rng1,'category':int(10000000*abs(np.random.randn())),'colA':np.random.randn(length),'colB':np.random.randn(length)})
    frames.append(tmpdf)
df = pd.concat(frames)
df.sort_values(['category', 'date'], inplace=True)
df.set_index(['category', 'date'], inplace=True, drop=True)

# Shift once over the whole frame instead of once per group ...
df['tmpShift'] = df['colB'].shift(1)
# ... then blank the first row of every group after the first, because that
# row received its value from the previous group. The cumulative group sizes
# (without the last one) are exactly those row positions. Only 'tmpShift' is
# blanked so that 'colA' and 'colB' keep their data.
first_rows = df.groupby(level=0).size().cumsum().iloc[:-1].values
df.iloc[first_rows, df.columns.get_loc('tmpShift')] = np.nan

# Yay this is so much faster.
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop('tmpShift', axis=1, inplace=True)
Thanks!
I ended up doing it using a groupby apply as follows (and coded to work forwards or backwards):
def replace_tail(grp, col, N, value):
    """Overwrite the first or last ``|N|`` rows of one column with ``value``.

    Parameters
    ----------
    grp : pandas.DataFrame
        The group to process; it is not modified.
    col : label
        Column whose cells are overwritten.
    N : int
        ``N > 0`` overwrites the first ``N`` rows, ``N < 0`` the last
        ``|N|`` rows, and ``N == 0`` overwrites nothing.
    value : scalar
        Replacement value.

    Returns
    -------
    pandas.DataFrame
        A copy of ``grp`` with the requested cells replaced.
    """
    out = grp.copy()
    if N == 0:
        # A slice of [0:] would cover every row, so treat 0 as "nothing".
        return out
    rows = slice(None, N) if N > 0 else slice(N, None)
    # A single positional assignment; chained grp[col][...] = value may
    # write to a temporary and leave the frame unchanged.
    out.iloc[rows, out.columns.get_loc(col)] = value
    return out
df = df.groupby(level=0).apply(replace_tail,'tmpShift',2,np.nan)
So the final code is:
def replace_tail(grp, col, N, value):
    """Overwrite the first or last ``|N|`` rows of one column with ``value``.

    ``N > 0`` overwrites the first ``N`` rows, ``N < 0`` the last ``|N|``
    rows, and ``N == 0`` overwrites nothing. Returns a modified copy of
    ``grp``; the input frame is left untouched.
    """
    out = grp.copy()
    if N == 0:
        # A slice of [0:] would cover every row, so treat 0 as "nothing".
        return out
    rows = slice(None, N) if N > 0 else slice(N, None)
    # A single positional assignment; chained grp[col][...] = value may
    # write to a temporary and leave the frame unchanged.
    out.iloc[rows, out.columns.get_loc(col)] = value
    return out


# Test setup: `groups` categories, each holding `length` consecutive daily
# rows with random 'colA' / 'colB' values.
length = 5
groups = 3
rng1 = pd.date_range('1/1/1990', periods=length, freq='D')
frames = []
for x in range(0, groups):
    tmpdf = pd.DataFrame({'date':rng1,'category':int(10000000*abs(np.random.randn())),'colA':np.random.randn(length),'colB':np.random.randn(length)})
    frames.append(tmpdf)
df = pd.concat(frames)
df.sort_values(['category', 'date'], inplace=True)
df.set_index(['category', 'date'], inplace=True, drop=True)

# Negative shift looks forward; positive looks backward.
shiftBy=-1
# Shift once over the whole frame, then blank the |shiftBy| rows per group
# whose value was pulled in from a neighbouring group.
df['tmpShift'] = df['colB'].shift(shiftBy)
# group_keys=False keeps the original (category, date) index instead of
# prepending the group key as an extra level.
df = df.groupby(level=0, group_keys=False).apply(replace_tail,'tmpShift',shiftBy,np.nan)

# Yay this is so much faster.
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop('tmpShift', axis=1, inplace=True)

Categories

Resources