offsetting row values using pandas.dataframe.shift() - python

I have a data frame similar to the one below:
import pandas as pd

absences_df = pd.DataFrame({
    'PersonNumber': ['1234', '1234', '1234', '5678', '5678', '5678', '997', '998', '998'],
    'Start': ['2022-03-07', '2022-03-08', '2022-03-09', '2022-03-09', '2022-03-10', '2022-03-11', '2022-03-07', '2022-03-07', '2022-03-08'],
    'End': ['2022-03-07', '2022-03-08', '2022-03-09', '2022-03-09', '2022-03-10', '2022-03-11', '2022-03-07', '2022-03-07', '2022-03-08'],
    'hours': ['1', '1', '1', '1', '2', '2', '3.5', '1', '2']
})
I have another dataframe like the one below:
input_df = pd.DataFrame({
    'PersonNumber': ['1234', '5678', '997', '998'],
    'W03':  ['1.0', '11.0', '1.0', '22.0'],
    'W3_5': ['2.0', '12.0', '2.0', '23.0'],
    'W04':  ['3.0', '13.0', '3.0', '24.0'],
    'W4_5': ['4.0', '14.0', '4.0', '25.0'],
    'W05':  ['5.0', '15.0', '5.0', '26.0'],
    'W5_5': ['0.0', '16.0', '6.0', '27.0'],
    'W06':  ['0.0', '17.0', '7.0', '28.0'],
    'W6_5': ['6.0', '18.0', '8.0', '29.0'],
    'W07':  ['7.0', '19.0', '9.0', '0.0'],
    'W7_5': ['8.0', '0.0', '10.0', '0.0'],
    'W08':  ['9.0', '0.0', '11.0', '31.0'],
    'W8_5': ['10.0', '0.0', '12.0', '32.0'],
    'W09':  ['11.0', '22.0', '13.0', '34.0'],
})
I want to offset the row values in my second data frame (input_df) based on the value in the "hours" column of my first data frame (absences_df). After offsetting, the last value should be repeated for the remaining columns.
I want an output similar to the one below:
output_df = pd.DataFrame({
    'PersonNumber': ['1234', '5678', '997', '998'],
    'W03':  ['0.0', '0.0', '7.0', '27.0'],
    'W3_5': ['0.0', '0.0', '8.0', '28.0'],
    'W04':  ['6.0', '0.0', '9.0', '29.0'],
    'W4_5': ['7.0', '22.0', '10.0', '0.0'],
    'W05':  ['8.0', '22.0', '11.0', '0.0'],
    'W5_5': ['9.0', '22.0', '12.0', '31.0'],
    'W06':  ['10.0', '22.0', '13.0', '32.0'],
    'W6_5': ['11.0', '22.0', '13.0', '34.0'],
    'W07':  ['11.0', '22.0', '13.0', '34.0'],
    'W7_5': ['11.0', '22.0', '13.0', '34.0'],
    'W08':  ['11.0', '22.0', '13.0', '34.0'],
    'W8_5': ['11.0', '22.0', '13.0', '34.0'],
    'W09':  ['11.0', '22.0', '13.0', '34.0']
})
Simply put (see the sketch after this list):
1) Employee 1234 is absent for 3 days and the sum of his daily hours is 3 (1+1+1). So 3 (total hours) + 2 (common for everyone) = 5, and the offset starts from W5_5.
2) Employee 5678 is absent for 3 days and the sum of his daily hours is 5 (1+2+2). So 5 + 2 = 7, and the offset starts from W7_5.
3) Employee 997 is absent for 1 day and his hours are 3.5. So 3.5 + 2 = 5.5, and the offset starts from W06.
4) Employee 998 is absent for 2 days and the sum of his daily hours is 3 (1+2). So 3 + 2 = 5, and the offset starts from W5_5.
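To illustrate the rule, the offset is just the per-person sum of the hours column plus 2 (a minimal sketch using the sample absences_df above):
offsets = absences_df['hours'].astype(float).groupby(absences_df['PersonNumber']).sum() + 2
print(offsets)
# PersonNumber
# 1234    5.0
# 5678    7.0
# 997     5.5
# 998     5.0
# Name: hours, dtype: float64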
I have tried using shift() and a few other ways, but nothing worked for me.
Posting what I have tried here:
A = absences_df['PersonNumber'].value_counts()
dfNew_employee = []
dfNew_repeat_time = []
dfNew_Individual_hrs = []
df_new_average_hours = []
dfNew_total_hrs = []
for i in A.index:
    individual_employee = absences_df.loc[(absences_df['PersonNumber'] == i)]
    hr_per_day = individual_employee['Duration'].iloc[0]
    dfNew_employee.append(i)
    dfNew_repeat_time.append(A[i])
    dfNew_Individual_hrs.append(hr_per_day)
    dfNew_total_hrs.append(str(sum(individual_employee['Duration']) + 2))
    df_new_average_hours.append(str((int(hr_per_day) * int(A[i])) + 2))
    print('employee id:', i, '; Repeated:', A[i], '; Hours=', hr_per_day, '; Total hours=', sum(individual_employee['Duration']) + 2)

main_cnt = 0
b = weekly_penality_df.copy()
df_final = pd.DataFrame(columns=b.columns)
for k in dfNew_employee:
    i = dfNew_total_hrs[main_cnt]
    i = int(float(i) * 2) - 5
    # if main_cnt > 0:
    #     b = a3.copy()
    print(i)
    a = b[b['PersonNumber'] == str(k)]
    if a.shape[0] == 0:
        print(main_cnt)
        continue
    a_ref_index = a.index.values.astype(int)[0]
    # a_ref_index
    a1 = b[["PersonNumber"]].copy()
    a2 = b.copy()
    a2.drop(['PersonNumber'], axis=1, inplace=True)
    a21 = a2.iloc[[a_ref_index], :].copy()
    a21.dropna(axis=1, inplace=True)
    a21_last_value = a21[a21.columns[-1]]
    a2.iloc[[a_ref_index], :] = a2.iloc[[a_ref_index], :].shift(i * -1, axis=1, fill_value=float(a21_last_value))
    a3 = pd.concat([a1, a2], axis=1)
    temp = a3[a3['PersonNumber'] == str(k)]
    # df_final = df_final.append(temp, ignore_index=True)
    b.loc[temp.index, :] = temp[:]
    a3 = a3.reset_index(drop=True)
    main_cnt = main_cnt + 1
Please help me with an easier/simpler solution.
Thanks in advance.

This is the function to get the offset (the target column's index) from absences_df:
def get_offset_amount(person_number):
    # Calculate the sum of all the absence hours for a particular person
    offset = absences_df[absences_df['PersonNumber'] == person_number]['hours'].astype(float).sum()
    # If the sum is zero, there is no change in the output dataframe
    if offset == 0:
        return 0
    # Adding 2 as per your requirement
    offset += 2
    # Creating the column name
    if offset.is_integer():
        column_name = 'W{offset}_5'.format(offset=int(offset))
    else:
        column_name = 'W0{offset}'.format(offset=int(offset + 1))
    # Fetching the column number using the column name just created
    return input_df.columns.tolist().index(column_name)
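For example, with the sample data above the function should return these column positions (a quick sanity check; the indexes refer to input_df.columns, where 'PersonNumber' is position 0):
print(get_offset_amount('1234'))  # 6  -> 'W5_5'
print(get_offset_amount('5678'))  # 10 -> 'W7_5'
print(get_offset_amount('997'))   # 7  -> 'W06'
print(get_offset_amount('998'))   # 6  -> 'W5_5'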
Iterating over the input DF and building the output list, using the same shift function from your attempt:
output_lst = []
for person_number in input_df['PersonNumber']:
    shift_amount = get_offset_amount(person_number)
    last_value = input_df[input_df['PersonNumber'] == person_number].iloc[0, -1]
    lst = input_df[input_df['PersonNumber'] == person_number] \
        .shift(periods=shift_amount * -1, axis=1, fill_value=last_value) \
        .iloc[0, :].tolist()[:-1]
    new_lst = [person_number, *lst]
    output_lst.append(new_lst)
output_df = pd.DataFrame(output_lst)
output_df.columns = input_df.columns
output_df:
PersonNumber W03 W3_5 W04 W4_5 W05 W5_5 W06 W6_5 W07 W7_5 \
0 1234 0.0 0.0 6.0 7.0 8.0 9.0 10.0 11.0 11.0 11.0
1 5678 0.0 0.0 0.0 22.0 22.0 22.0 22.0 22.0 22.0 22.0
2 997 7.0 8.0 9.0 10.0 11.0 12.0 13.0 13.0 13.0 13.0
3 998 27.0 28.0 29.0 0.0 0.0 31.0 32.0 34.0 34.0 34.0
W08 W8_5 W09
0 11.0 11.0 11.0
1 22.0 22.0 22.0
2 13.0 13.0 13.0
3 34.0 34.0 34.0
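If you prefer to avoid the explicit loop, the same idea can be expressed with apply (a minimal sketch reusing the get_offset_amount helper above; shift_row is a hypothetical name, and this is only checked against the sample data):
def shift_row(row):
    n = get_offset_amount(row['PersonNumber'])
    w = row.drop('PersonNumber')
    if n == 0:                       # no absences -> leave the row unchanged
        return w
    shifted = w.shift(-(n - 1))      # n counts 'PersonNumber', so the W-only offset is n - 1
    return shifted.fillna(w.iloc[-1])

output_df = pd.concat([input_df[['PersonNumber']],
                       input_df.apply(shift_row, axis=1)], axis=1)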

Related

How to count values over different thresholds per column in Pandas dataframe groupby?

My goal is, for a dataset similar to the example below, to group by [s_num, ip, f_num, direction], then filter the score columns using separate thresholds and count how many values are above each threshold.
id s_num ip f_num direction algo_1_x algo_2_x algo_1_score algo_2_score
0 0.0 0.0 0.0 0.0 X -4.63 -4.45 0.624356 0.664009
15 19.0 0.0 2.0 0.0 X -5.44 -5.02 0.411217 0.515843
16 20.0 0.0 2.0 0.0 X -12.36 -5.09 0.397237 0.541112
20 24.0 0.0 2.0 1.0 X -4.94 -5.15 0.401744 0.526032
21 25.0 0.0 2.0 1.0 X -4.78 -4.98 0.386410 0.564934
22 26.0 0.0 2.0 1.0 X -4.89 -5.03 0.394326 0.513896
24 28.0 0.0 2.0 2.0 X -4.78 -5.00 0.420078 0.521993
25 29.0 0.0 2.0 2.0 X -4.91 -5.14 0.407355 0.485878
26 30.0 0.0 2.0 2.0 X 11.83 -4.97 0.392242 0.659122
27 31.0 0.0 2.0 2.0 X -4.73 -5.07 0.377011 0.524774
​
The result should look something like this: each entry in an algo_i column is the number of values in the group larger than the corresponding threshold.
So far I tried grouping first and applying a custom aggregation, like so:
def count_success(x, thresh):
    return ((x > thresh) * 1).sum()

thresholds = [0.1, 0.2]
df.groupby(attr_cols).agg({f'algo_{i+1}_score': count_success(thresh) for i, thresh in enumerate(thresholds)})
but this results in an error:
count_success() missing 1 required positional argument: 'thresh'
So, how can I pass another argument to a function using .agg()? Or is there an easier way to do it with some pandas function?
Named aggregation does not allow extra parameters to be passed to your function. You can use numpy broadcasting:
attr_cols = ["s_num", "ip", "f_num", "direction"]
score_cols = df.columns[df.columns.str.match(r"algo_\d+_score")]

# Convert everything to numpy to prepare for broadcasting
score = df[score_cols].to_numpy()
threshold = np.array([0.1, 0.5])

# Raise `threshold` up 2 dimensions so that every value in `score` is
# broadcast against every value in `threshold`
mask = score > threshold[:, None, None]

# Assemble the result
row_index = pd.MultiIndex.from_frame(df[attr_cols])
col_index = pd.MultiIndex.from_product([threshold, score_cols], names=["threshold", "algo"])
result = (
    pd.DataFrame(np.hstack(mask), index=row_index, columns=col_index)
    .groupby(attr_cols)
    .sum()
)
Result:
threshold 0.1 0.5
algo algo_1_score algo_2_score algo_1_score algo_2_score
s_num ip f_num direction
0.0 0.0 0.0 X 1 1 1 1
2.0 0.0 X 2 2 0 2
1.0 X 3 3 0 3
2.0 X 4 4 0 3
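As an aside, the per-column .agg approach from the question can also be made to work by binding each threshold into its function, e.g. with functools.partial (a small sketch assuming the count_success, thresholds and attr_cols defined above); it just does not give the tidy two-level column layout shown here:
import functools

# One aggregation function per score column, each with its own threshold baked in
agg_map = {f'algo_{i+1}_score': functools.partial(count_success, thresh=thresh)
           for i, thresh in enumerate(thresholds)}
counts = df.groupby(attr_cols).agg(agg_map)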

How to fill empty data with zeros?

After going through some previous answers, I found that I could use this code to fill missing values of df1[0], which range from 340 to 515:
with open('contactasortedtest.dat', 'r') as f:
    text = [line.split() for line in f]

def replace_missing(df1, Ids):
    missing = np.setdiff1d(Ids, df1[1])
    print(missing)
    if len(missing) > 0:
        missing_df = pd.DataFrame(data=np.zeros((len(missing), 4)))
        missing_df[1] = missing
        missing_df[2].replace(0, df1[2].iloc[1], inplace=True)
        df1 = pd.concat([df1, missing_df])
    return df1

Ids = np.arange(340.0, 515.0)
final_df = df1.groupby(df1[2], as_index=True).apply(replace_missing, Ids).reset_index(drop=True)
final_df
final_df
Through troubleshooting I found that missing = np.setdiff1d(Ids, df1[1]) does not work as expected; rather, it returns the whole array. I found many answers on this, but I couldn't work it out. Any help would be appreciated.
Sample data I used:
12 340.0 1.0 0.0
2 491.0 1.0 35.8
13 492.0 1.0 81.4
4 493.0 1.0 0.0
7 495.0 1.0 0.2
0 496.0 1.0 90.3
11 509.0 1.0 2.3
6 513.0 1.0 4.3
8 515.0 1.0 0.1
Thank you !
You can use df['x'].fillna(0) to fill the NaN values in a column with zeros.
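A minimal illustration of that call (the column name 'x' is just a placeholder):
import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1.0, np.nan, 3.0]})
df['x'] = df['x'].fillna(0)   # NaN -> 0
print(df)
#      x
# 0  1.0
# 1  0.0
# 2  3.0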

How to handle Cells containing only NaN values in pandas?

I am setting up a stock price prediction data set, and I am applying the following code for the Ichimoku Cloud indicator:
from datetime import timedelta
high_9 = df['High'].rolling(window= 9).max()
low_9 = df['Low'].rolling(window= 9).min()
df['tenkan_sen'] = (high_9 + low_9) /2
high_26 = df['High'].rolling(window= 26).max()
low_26 = df['Low'].rolling(window= 26).min()
df['kijun_sen'] = (high_26 + low_26) /2
# this is to extend the 'df' in future for 26 days
# the 'df' here is numerical indexed df
# the problem is here
last_index = df.iloc[-1:].index[0]
last_date = df['Date'].iloc[-1].date()
for i in range(26):
    df.loc[last_index + 1 + i, 'Date'] = last_date + timedelta(days=i)
df['senkou_span_a'] = ((df['tenkan_sen'] + df['kijun_sen']) / 2).shift(26)
high_52 = df['High'].rolling(window= 52).max()
low_52 = df['Low'].rolling(window= 52).min()
df['senkou_span_b'] = ((high_52 + low_52) /2).shift(26)
# most charting softwares dont plot this line
df['chikou_span'] = df['Close'].shift(-26)
The above code works, but the problem is that while extending the next 26 time steps (rows) for the 'senkou_span_a' and 'senkou_span_b' columns, it turns the other columns' values in those rows to NaN.
So I need help to keep the 'senkou_span_a' and 'senkou_span_b' predicted rows in my data set without turning the other columns' values to NaN.
The current output is:
Date Open High Low Close Senoku span a Senoku span b
2019-03-16 50 51 52 53 56.0 55.82
2019-03-17 NaN NaN NaN NaN 55.0 56.42
2019-03-18 NaN NaN NaN NaN 54.0 57.72
2019-03-19 NaN NaN NaN NaN 53.0 58.12
2019-03-20 NaN NaN NaN NaN 52.0 59.52
The expected output is:
Date Open High Low Close Senoku span a Senoku span b
2019-03-16 50 51 52 53 56.0 55.82
2019-03-17 55.0 56.42
2019-03-18 54.0 57.72
2019-03-19 53.0 58.12
2019-03-20 52.0 59.52
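(Note: the blank cells in the expected output are just the NaNs rendered as empty strings; a purely cosmetic sketch of that, assuming the column names shown above, would be:)
price_cols = ['Open', 'High', 'Low', 'Close']
df[price_cols] = df[price_cols].astype(object).fillna('')   # display blanks instead of NaN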

difference between two rows pandas

I have a dataframe as:
id|amount|date
20|-7|2017:12:25
20|-170|2017:12:26
20|7|2017:12:27
I want to subtract each row from another for the 'amount' column; the output should be like:
id|amount|date|amount_diff
20|-7|2017:12:25|0
20|-170|2017:12:26|-177
20|7|2017:12:27|-163
I used the code:
df.sort_values(by='date',inplace=True)
df['amount_diff'] = df['invoice_amount'].diff()
and obtained the output as:
id|amount|date|amount_diff
20|-7|2017:12:25|163
20|-170|2017:12:26|-218
20|48|2017:12:27|0
IIUC you need:
df.sort_values(by='date',inplace=True)
df['amount_diff'] = df['amount'].add(df['amount'].shift()).fillna(0)
print (df)
id amount date amount_diff
0 20 -7 2017:12:25 0.0
1 20 -170 2017:12:26 -177.0
2 20 7 2017:12:27 -163.0
Because if you want to subtract, your solution should work:
df.sort_values(by='date',inplace=True)
df['amount_diff1'] = df['amount'].sub(df['amount'].shift()).fillna(0)
df['amount_diff2'] = df['amount'].diff().fillna(0)
print (df)
id amount date amount_diff1 amount_diff2
0 20 -7 2017:12:25 0.0 0.0
1 20 -170 2017:12:26 -163.0 -163.0
2 20 7 2017:12:27 177.0 177.0

Tiling in groupby on dataframe

I have a data frame that contains returns, size and sedols for a couple of dates.
My goal is to identify the top and bottom values for a certain condition per date, i.e., I want the largest-size entries (top decile) and the smallest-size entries (bottom decile) for each date, and to flag them in a new column with 'xx' and 'yy'.
I am confused about how to apply the tiling while grouping, as well as how to create the new column. Here is what I already have:
import pandas as pd
import numpy as np
import datetime as dt
from random import choice
from string import ascii_uppercase

def create_dummy_data(start_date, days, entries_pday):
    date_sequence_lst = [dt.datetime.strptime(start_date, '%Y-%m-%d') +
                         dt.timedelta(days=x) for x in range(0, days)]
    date_sequence_lst = date_sequence_lst * entries_pday
    returns_lst = [round(np.random.uniform(low=-0.10, high=0.20), 2) for _ in range(entries_pday * days)]
    size_lst = [round(np.random.uniform(low=10.00, high=10000.00), 0) for _ in range(entries_pday * days)]
    rdm_sedol_lst = [(''.join(choice(ascii_uppercase) for i in range(7))) for x in range(entries_pday)]
    rdm_sedol_lst = rdm_sedol_lst * days
    dates_returns_df = pd.DataFrame({'Date': date_sequence_lst, 'Sedols': rdm_sedol_lst, 'Returns': returns_lst, 'Size': size_lst})
    dates_returns_df = dates_returns_df.sort_values('Date', ascending=True)
    dates_returns_df = dates_returns_df.reset_index(drop=True)
    return dates_returns_df

def order_df_by(df_in, column_name):
    df_out = df_in.sort_values(['Date', column_name], ascending=[True, False])
    return df_out

def get_ntile(df_in, ntile):
    df_in['Tiled'] = df_in.groupby(['Date'])['Size'].transform(lambda x: pd.qcut(x, ntile))
    return df_in

if __name__ == "__main__":
    # create dummy returns
    data_df = create_dummy_data('2001-01-01', 31, 10)
    # sort by attribute
    data_sorted_df = order_df_by(data_df, 'Size')
    # ntile data per date
    data_ntiled = get_ntile(data_sorted_df, 10)
    for key, item in data_ntiled:
        print(data_ntiled.get_group(key))
So far I would expect deciled results based on 'Size' for each date; the next step would be to filter only for decile 1 and decile 10 and flag the entries 'xx' and 'yy' respectively.
Thanks
Consider using transform on the pandas.qcut method with labels 1 through ntile for a decile column, then conditionally set the flag with np.where using the decile values:
...
def get_ntile(df_in, ntile):
    df_in['Tiled'] = df_in.groupby(['Date'])['Size'].transform(lambda x: pd.qcut(x, ntile, labels=list(range(1, ntile + 1))))
    return df_in

if __name__ == "__main__":
    # create dummy returns
    data_df = create_dummy_data('2001-01-01', 31, 10)
    # sort by attribute
    data_sorted_df = order_df_by(data_df, 'Size')
    # ntile data per date
    data_ntiled = get_ntile(data_sorted_df, 10)

    data_ntiled['flag'] = np.where(data_ntiled['Tiled'] == 1.0, 'YY',
                                   np.where(data_ntiled['Tiled'] == 10.0, 'XX', np.nan))

    print(data_ntiled.reset_index(drop=True).head(15))
# Date Returns Sedols Size Tiled flag
# 0 2001-01-01 -0.03 TEEADVJ 8942.0 10.0 XX
# 1 2001-01-01 -0.03 PDBWGBJ 7142.0 9.0 nan
# 2 2001-01-01 0.03 QNVVPIC 6995.0 8.0 nan
# 3 2001-01-01 0.04 NTKEAKB 6871.0 7.0 nan
# 4 2001-01-01 0.20 ZVVCLSJ 6541.0 6.0 nan
# 5 2001-01-01 0.12 IJKXLIF 5131.0 5.0 nan
# 6 2001-01-01 0.14 HVPDRIU 4490.0 4.0 nan
# 7 2001-01-01 -0.08 XNOGFET 3397.0 3.0 nan
# 8 2001-01-01 -0.06 JOARYWC 2582.0 2.0 nan
# 9 2001-01-01 0.12 FVKBQGU 723.0 1.0 YY
# 10 2001-01-02 0.03 ZVVCLSJ 9291.0 10.0 XX
# 11 2001-01-02 0.14 HVPDRIU 8875.0 9.0 nan
# 12 2001-01-02 0.08 PDBWGBJ 7496.0 8.0 nan
# 13 2001-01-02 0.02 FVKBQGU 7307.0 7.0 nan
# 14 2001-01-02 -0.01 QNVVPIC 7159.0 6.0 nan
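To then keep only the flagged rows (the "next step" mentioned in the question), a simple filter on the new column should do it (a small sketch on top of the data_ntiled frame built above):
# Keep only the top- and bottom-decile rows flagged 'XX' / 'YY'
extremes = data_ntiled[data_ntiled['flag'].isin(['XX', 'YY'])]
print(extremes.head())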
