Apply the same function to a subset of rows in pandas dataframe - python

I am using the following code to filter the dataframe by the type and value columns and then delete any entries with a tdiff > 0 and < 3.
import pandas as pd

d = {'Timestamp': ['2020-09-02 07:00:00', '2020-09-02 07:10:00', '2020-09-02 07:30:00',
                   '2020-09-02 08:00:00', '2020-09-02 10:00:00', '2020-09-02 11:10:00',
                   '2020-09-02 11:30:00'],
     'type': ['A', 'A', 'B', 'A', 'A', 'A', 'B'],
     'value': [1, 2, 3, 1, 1, 2, 3]}
df = pd.DataFrame(data=d)

df3 = pd.DataFrame()
unique_type = pd.unique(df['type']).astype(str)
for i in range(0, len(unique_type)):
    df1 = df[df.type == unique_type[i]]
    unique_val = pd.unique(df1['value']).astype(int)
    for j in range(0, len(unique_val)):
        df2 = df1[df1.value == unique_val[j]]
        trange = pd.to_datetime(df2.Timestamp)
        tdiff = (trange - min(trange)).dt.total_seconds() / 3600
        df2['tdiff'] = tdiff  # .round(1)
        df3 = df3.append(df2, ignore_index=True)

df4 = df3[~((df3.tdiff > 0) & (df3.tdiff < 3))]
print(df)
df4.sort_values(by=['Timestamp'])
While this works, I would like to move away from for loops and use more efficient code.

Try this simple code with a lambda function.
Data:
d = {'type': ['A','A','B','C'], 'col1': [1,2,3,9]}
df = pd.DataFrame(data=d)
df:
   type  col1
0     A     1
1     A     2
2     B     3
3     C     9
import numpy as np

f = pd.Series(df['type'].unique()).apply(lambda type_: (type_, df[df['type'] == type_].col1.sum())).to_list()
df['New-column'] = df.type.replace(dict(f))
df.loc[df['New-column'] >= 9, 'New-column'] = np.nan
df:
  type  col1  New-column
0    A     1         3.0
1    A     2         3.0
2    B     3         3.0
3    C     9         NaN
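The same New-column can also be built without the lambda by using groupby with transform - a minimal sketch, not part of the answer above:
import numpy as np

df['New-column'] = df.groupby('type')['col1'].transform('sum')
df.loc[df['New-column'] >= 9, 'New-column'] = np.nan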

You can use groupby and transform with min:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['tmin'] = df.groupby(['type', 'value'])['Timestamp'].transform('min')
df['tdiff'] = (df['Timestamp'] - df['tmin']).dt.total_seconds() / 3600
df[~((df.tdiff > 0) & (df.tdiff < 3))]
Output:
Timestamp type value tmin tdiff
-- ------------------- ------ ------- ------------------- -------
0 2020-09-02 07:00:00 A 1 2020-09-02 07:00:00 0
1 2020-09-02 07:10:00 A 2 2020-09-02 07:10:00 0
2 2020-09-02 07:30:00 B 3 2020-09-02 07:30:00 0
4 2020-09-02 10:00:00 A 1 2020-09-02 07:00:00 3
5 2020-09-02 11:10:00 A 2 2020-09-02 07:10:00 4
6 2020-09-02 11:30:00 B 3 2020-09-02 07:30:00 4
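If you prefer not to keep the helper tmin column, the same filter can be written directly from the transform - a small sketch that assumes Timestamp has already been converted to datetime:
tdiff = (df['Timestamp']
         - df.groupby(['type', 'value'])['Timestamp'].transform('min')).dt.total_seconds() / 3600
df4 = df[~((tdiff > 0) & (tdiff < 3))].sort_values('Timestamp')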

Related

pandas lagged value of time series (previous year) for cohort

For a dataframe of:
import pandas as pd

df = pd.DataFrame({
    'dt': ['2019-01-01', '2019-01-02', '2019-01-03', '2020-01-01', '2020-01-02', '2020-01-03',
           '2019-01-01', '2019-01-02', '2019-01-03', '2020-01-01', '2020-01-02', '2020-01-03'],
    'foo': [1, 2, 3, 4, 5, 6, 1, 5, 3, 4, 10, 6],
    'category': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]
})
How can I find the lagged value from the previous year for each category?
df['dt'] = pd.to_datetime(df['dt'])
display(df)
Shifting the index only returns an empty result, and thus fails assignment.
df['last_year'] = df[df.dt == df.dt - pd.offsets.Day(365)]
Obviously, a join with the data from 2019 on the month and day would work, but that seems rather cumbersome. Is there a better way?
Edit: the desired result is:
dt foo category last_year
2020-01-01 4 1 1
2020-01-02 5 1 2
2020-01-03 6 1 3
2020-01-01 4 2 1
2020-01-02 10 2 5
2020-01-03 6 2 3
You can merge df with itself after shifting the dt column by the offset you want with pd.DateOffset:
print (df.merge(df.assign(dt=lambda x: x['dt']+pd.DateOffset(years=1)),
on=['dt', 'category'],
suffixes=('','_lastYear'),
how='left'))
dt foo category foo_lastYear
0 2019-01-01 1 1 NaN
1 2019-01-02 2 1 NaN
2 2019-01-03 3 1 NaN
3 2020-01-01 4 1 1.0
4 2020-01-02 5 1 2.0
5 2020-01-03 6 1 3.0
6 2019-01-01 1 2 NaN
7 2019-01-02 5 2 NaN
8 2019-01-03 3 2 NaN
9 2020-01-01 4 2 1.0
10 2020-01-02 10 2 5.0
11 2020-01-03 6 2 3.0
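An alternative to the self-merge, shown only as a sketch (it is not the answer above), is to look the previous year up through a (category, dt) index:
s = df.set_index(['category', 'dt'])['foo']
key = pd.MultiIndex.from_arrays([df['category'], df['dt'] - pd.DateOffset(years=1)])
df['foo_lastYear'] = s.reindex(key).to_numpy()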

Transform the dataframe from long to wide using pandas - Single row output

I have a dataframe as shown below:
df = pd.DataFrame({
'subject_id':[1,1,1,1,2,2,2,2],
'date':['2173/04/11','2173/04/12','2173/04/11','2173/04/12','2173/05/14','2173/05/15','2173/05/14','2173/05/15'],
'time_1':['2173/04/11 12:35:00','2173/04/12 12:50:00','2173/04/11 12:59:00','2173/04/12 13:14:00','2173/05/14 13:37:00','2173/05/15 13:39:00','2173/05/14 18:37:00','2173/05/15 19:39:00'],
'val' :[5,5,40,40,7,7,38,38],
'iid' :[12,12,12,12,21,21,21,21]
})
df['time_1'] = pd.to_datetime(df['time_1'])
df['day'] = df['time_1'].dt.day
I tried the stack, unstack, pivot and melt approaches, but none of them seem to help:
pd.melt(df, id_vars =['subject_id','val'], value_vars =['date','val']) #1
df.unstack().reset_index() #2
df.pivot(index='subject_id', columns='time_1', values='val') #3
I expect my output dataframe to look like the wide, single-row-per-subject layout shown below.
The idea is to create a helper Series with GroupBy.cumcount keyed by the same column(s) used for the new index - here subject_id - then build a MultiIndex, reshape with DataFrame.unstack and finally flatten the MultiIndex in the columns:
cols = ['time_1','val']
df = df.set_index(['subject_id', df.groupby('subject_id').cumcount().add(1)])[cols].unstack()
df.columns = [f'{a}{b}' for a, b in df.columns]
df = df.reset_index()
print (df)
subject_id time_11 time_12 time_13 \
0 1 2173-04-11 12:35:00 2173-04-12 12:50:00 2173-04-11 12:59:00
1 2 2173-05-14 13:37:00 2173-05-15 13:39:00 2173-05-14 18:37:00
time_14 val1 val2 val3 val4
0 2173-04-12 13:14:00 5 5 40 40
1 2173-05-15 19:39:00 7 7 38 38
Missing values are expected if the groups do not all have the same number of rows - unstack uses the maximum count and fills the missing positions:
df = pd.DataFrame({
'subject_id':[1,1,1,2,2,3],
'date':['2173/04/11','2173/04/12','2173/04/11','2173/04/12','2173/05/14','2173/05/15'],
'time_1':['2173/04/11 12:35:00','2173/04/12 12:50:00','2173/04/11 12:59:00',
'2173/04/12 13:14:00','2173/05/14 13:37:00','2173/05/15 13:39:00'],
'val' :[5,5,40,40,7,7],
'iid' :[12,12,12,12,21,21]
})
df['time_1'] = pd.to_datetime(df['time_1'])
df['day'] = df['time_1'].dt.day
print (df)
subject_id date time_1 val iid day
0 1 2173/04/11 2173-04-11 12:35:00 5 12 11
1 1 2173/04/12 2173-04-12 12:50:00 5 12 12
2 1 2173/04/11 2173-04-11 12:59:00 40 12 11
3 2 2173/04/12 2173-04-12 13:14:00 40 12 12
4 2 2173/05/14 2173-05-14 13:37:00 7 21 14
5 3 2173/05/15 2173-05-15 13:39:00 7 21 15
cols = ['time_1','val']
df = df.set_index(['subject_id', df.groupby('subject_id').cumcount().add(1)])[cols].unstack()
df.columns = [f'{a}{b}' for a, b in df.columns]
df = df.reset_index()
print (df)
subject_id time_11 time_12 time_13 \
0 1 2173-04-11 12:35:00 2173-04-12 12:50:00 2173-04-11 12:59:00
1 2 2173-04-12 13:14:00 2173-05-14 13:37:00 NaT
2 3 2173-05-15 13:39:00 NaT NaT
val1 val2 val3
0 5.0 5.0 40.0
1 40.0 7.0 NaN
2 7.0 NaN NaN
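The same wide layout can also be produced with pivot instead of unstack - a sketch that assumes you start again from the long-format df and use a recent pandas version (which accepts a list for values):
tmp = df.assign(num=df.groupby('subject_id').cumcount().add(1))
out = tmp.pivot(index='subject_id', columns='num', values=['time_1', 'val'])
out.columns = [f'{a}{b}' for a, b in out.columns]
out = out.reset_index()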

Conditional Running Count in Pandas for All Previous Rows Only

Suppose I have the following DataFrame:
df = pd.DataFrame({'Event': ['A', 'B', 'A', 'A', 'B', 'C', 'B', 'B', 'A', 'C'],
'Date': ['2019-01-01', '2019-02-01', '2019-03-01', '2019-03-01', '2019-02-15',
'2019-03-15', '2019-04-05', '2019-04-05', '2019-04-15', '2019-06-10'],
'Sale':[100,200,150,200,150,100,300,250,500,400]})
df['Date'] = pd.to_datetime(df['Date'])
df
Event Date
A 2019-01-01
B 2019-02-01
A 2019-03-01
A 2019-03-01
B 2019-02-15
C 2019-03-15
B 2019-04-05
B 2019-04-05
A 2019-04-15
C 2019-06-10
I would like to obtain the following result:
Event Date Previous_Event_Count
A 2019-01-01 0
B 2019-02-01 0
A 2019-03-01 1
A 2019-03-01 1
B 2019-02-15 1
C 2019-03-15 0
B 2019-04-05 2
B 2019-04-05 2
A 2019-04-15 3
C 2019-06-10 1
where df['Previous_Event_Count'] is the number of rows with the same event (df['Event']) that take place before that row's date (df['Date']). For instance:
the number of times event A takes place before 2019-01-01 is 0,
the number of times event A takes place before 2019-03-01 is 1, and
the number of times event A takes place before 2019-04-15 is 3.
I am able to obtain the desired result using this line:
df['Previous_Event_Count'] = [df.loc[(df.loc[i, 'Event'] == df['Event']) & (df.loc[i, 'Date'] > df['Date']),
'Date'].count() for i in range(len(df))]
It works, but it is slow. I believe there is a better way to do it. I have tried this line:
df['Previous_Event_Count'] = df.query('Date < Date').groupby(['Event', 'Date']).cumcount()
but it produces NaNs.
groupby + rank
Dates can be treated as numeric. Use method='min' so that tied dates share the lowest rank; subtracting 1 then gives the count of strictly earlier dates, which is exactly your counting logic:
df['PEC'] = (df.groupby('Event').Date.rank(method='min') - 1).astype(int)
Event Date PEC
0 A 2019-01-01 0
1 B 2019-02-01 0
2 A 2019-03-01 1
3 A 2019-03-01 1
4 B 2019-02-15 1
5 C 2019-03-15 0
6 B 2019-04-05 2
7 B 2019-04-05 2
8 A 2019-04-15 3
9 C 2019-06-10 1
First get the counts with GroupBy.size over both columns, then within each first-level group shift and take the cumulative sum, and finally join back to the original:
s = (df.groupby(['Event', 'Date'])
.size()
.groupby(level=0)
.apply(lambda x: x.shift(1).cumsum())
.fillna(0)
.astype(int))
df = df.join(s.rename('Previous_Event_Count'), on=['Event','Date'])
print (df)
Event Date Previous_Event_Count
0 A 2019-01-01 0
1 B 2019-02-01 0
2 A 2019-03-01 1
3 A 2019-03-01 1
4 B 2019-02-15 1
5 C 2019-03-15 0
6 B 2019-04-05 2
7 B 2019-04-05 2
8 A 2019-04-15 3
9 C 2019-06-10 1
Finally, I found a better and faster way to get the desired result. It turns out to be very easy: cumcount over Event counts all prior rows of the same event, and cumcount over (Event, Date) removes those sharing the same date, so the difference is the number of prior rows with a strictly earlier date (assuming each event's rows appear in chronological order).
df['Total_Previous_Sale'] = df.groupby('Event').cumcount() \
                            - df.groupby(['Event', 'Date']).cumcount()

How to complement missing dates after groupby for each group in pandas?

My goal is to complement the missing date entries per project_id with 0 in the data column.
For example
df = pd.DataFrame({
'project_id': ['A', 'A', 'A', 'B', 'B'],
'timestamp': ['2018-01-01', '2018-03-01', '2018-04-01', '2018-03-01', '2018-06-01'],
'data': [100, 28, 45, 64, 55]})
which is
project_id timestamp data
0 A 2018-01-01 100
1 A 2018-03-01 28
2 A 2018-04-01 45
3 B 2018-03-01 64
4 B 2018-06-01 55
shall become
project_id timestamp data
0 A 2018-01-01 100
1 A 2018-02-01 0
2 A 2018-03-01 28
3 A 2018-04-01 45
4 B 2018-03-01 64
5 B 2018-04-01 0
6 B 2018-05-01 0
7 B 2018-06-01 55
where indices 1, 5, and 6 are added.
My current approach:
df.groupby('project_id').apply(lambda x: x[['timestamp', 'data']].set_index('timestamp').asfreq('M', how='start', fill_value=0))
is obviously wrong, because it sets everything to 0 and resamples to the last date of each month rather than the first, although I thought that should be handled by how.
How do I expand/complement missing datetime entries after groupby to get a continuous time series for each group?
You are close:
df.timestamp = pd.to_datetime(df.timestamp)
# notice 'MS'
new_df = df.groupby('project_id').apply(lambda x: x[['timestamp', 'data']]
.set_index('timestamp').asfreq('MS'))
new_df.data = df.set_index(['project_id', 'timestamp']).data
df = new_df.fillna(0).reset_index()
You can use groupby in combination with pandas.Grouper, binning each project to month-start frequency so that the empty months show up and can be filled with 0:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df_new = pd.concat([
    d.groupby(pd.Grouper(freq='MS'))['data'].sum(min_count=1).fillna(0)  # empty months -> NaN -> 0
     .reset_index().assign(project_id=n)
    for n, d in df.set_index('timestamp').groupby('project_id')
])
df_new = df_new.sort_values('project_id').reset_index(drop=True)[['timestamp', 'project_id', 'data']]
Output
print(df_new)
timestamp project_id data
0 2018-01-01 A 100.0
1 2018-02-01 A 0.0
2 2018-03-01 A 28.0
3 2018-04-01 A 45.0
4 2018-03-01 B 64.0
5 2018-04-01 B 0.0
6 2018-05-01 B 0.0
7 2018-06-01 B 55.0

pandas match date in one df with timeframe in another, then groupby-sum

I have two dataframes, test1 and test2. For each ID value in test2, I want to check the date in test2 and compare it to the date ranges for that same ID value in test1. If any of the dates in test2 fall within a date range in test1, sum the amount column and assign that sum as an additional column in test1.
Desired output:
The new test1 df will have a column amount_sum which, for each ID, is the sum of all amounts in test2 whose date falls within that row's date range in test1.
import random
import string
import numpy as np
import pandas as pd

test1 = pd.DataFrame({
    'ID': [''.join(random.choice(string.ascii_letters[0:4]) for _ in range(3)) for n in range(100)],
    'date1': [pd.to_datetime(random.choice(['01-01-2018', '05-01-2018', '06-01-2018', '08-01-2018', '09-01-2018']))
              + pd.DateOffset(int(np.random.randint(0, 100, 1))) for n in range(100)],
    'date2': [pd.to_datetime(random.choice(['01-01-2018', '05-01-2018', '06-01-2018', '08-01-2018', '09-01-2018']))
              + pd.DateOffset(int(np.random.randint(101, 200, 1))) for n in range(100)]
})
test2 = pd.DataFrame({
    'ID': [''.join(random.choice(string.ascii_letters[0:4]) for _ in range(3)) for n in range(100)],
    'amount': [random.choice([1, 2, 3, 5, 10]) for n in range(100)],
    'date': [pd.to_datetime(random.choice(['01-01-2018', '05-01-2018', '06-01-2018', '08-01-2018', '09-01-2018']))
             + pd.DateOffset(int(np.random.randint(0, 100, 1))) for n in range(100)]
})
Use:
#outer join both df by ID columns
df = test1.merge(test2, on='ID', how='outer')
#filter by range
df = df[(df.date > df.date1) & (df.date < df.date2)]
#thank you #Abhi for alternative
#df = df[df.date.between(df.date1, df.date2, inclusive=False)]
#aggregate sum
s = df.groupby(['ID','date1','date2'])['amount'].sum()
#add new column to test1
test = test1.join(s, on=['ID','date1','date2'])
Sample:
#https://stackoverflow.com/q/21494489
np.random.seed(123)
#https://stackoverflow.com/a/50559321/2901002
def gen(start, end, n):
start_u = start.value//10**9
end_u = end.value//10**9
return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')
n = 10
test1 = pd.DataFrame({
'ID':np.random.choice(list('abc'), n),
'date1': gen(pd.to_datetime('2010-01-01'),pd.to_datetime('2010-03-01'), n).floor('d'),
'date2':gen(pd.to_datetime('2010-03-01'),pd.to_datetime('2010-06-01'), n).floor('d')
})
m = 5
test2 = pd.DataFrame({
'ID': np.random.choice(list('abc'), m),
'amount':np.random.randint(10, size=m),
'date':gen(pd.to_datetime('2010-01-01'), pd.to_datetime('2010-06-01'), m).floor('d')
})
print (test1)
ID date1 date2
0 c 2010-01-15 2010-05-22
1 b 2010-02-08 2010-04-16
2 c 2010-01-24 2010-04-12
3 c 2010-02-01 2010-04-09
4 a 2010-01-19 2010-05-20
5 c 2010-01-27 2010-05-24
6 c 2010-02-23 2010-03-15
7 b 2010-01-31 2010-05-09
8 c 2010-02-23 2010-03-29
9 b 2010-01-08 2010-03-07
print (test2)
ID amount date
0 a 4 2010-05-15
1 b 6 2010-03-26
2 a 1 2010-01-07
3 b 5 2010-02-07
4 a 6 2010-04-13
#outer join both df by ID columns
df = test1.merge(test2, on='ID', how='outer')
#filter by range
df = df[(df.date > df.date1) & (df.date < df.date2)]
print (df)
ID date1 date2 amount date
6 b 2010-02-08 2010-04-16 6.0 2010-03-26
8 b 2010-01-31 2010-05-09 6.0 2010-03-26
9 b 2010-01-31 2010-05-09 5.0 2010-02-07
11 b 2010-01-08 2010-03-07 5.0 2010-02-07
12 a 2010-01-19 2010-05-20 4.0 2010-05-15
14 a 2010-01-19 2010-05-20 6.0 2010-04-13
#thank you #Abhi for alternative
#df = df[df.date.between(df.date1, df.date2, inclusive=False)]
#aggregate sum
s = df.groupby(['ID','date1','date2'])['amount'].sum()
#add new column to test1
test = test1.join(s, on=['ID','date1','date2'])
print (test)
ID date1 date2 amount
0 c 2010-01-15 2010-05-22 NaN
1 b 2010-02-08 2010-04-16 6.0
2 c 2010-01-24 2010-04-12 NaN
3 c 2010-02-01 2010-04-09 NaN
4 a 2010-01-19 2010-05-20 10.0
5 c 2010-01-27 2010-05-24 NaN
6 c 2010-02-23 2010-03-15 NaN
7 b 2010-01-31 2010-05-09 11.0
8 c 2010-02-23 2010-03-29 NaN
9 b 2010-01-08 2010-03-07 5.0
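If the outer merge becomes too large on real data, the same per-row sum can be computed from a per-ID lookup instead - a rough sketch, not taken from the answer above, assuming the test1/test2 frames from the sample (amount_sum is the question's column name):
import numpy as np

def sum_in_range(row, lookup):
    # rows of test2 for this ID, or None if the ID never appears in test2
    grp = lookup.get(row['ID'])
    if grp is None:
        return np.nan
    mask = (grp['date'] > row['date1']) & (grp['date'] < row['date2'])
    return grp.loc[mask, 'amount'].sum() if mask.any() else np.nan

lookup = {k: g for k, g in test2.groupby('ID')}
test1['amount_sum'] = test1.apply(sum_in_range, axis=1, lookup=lookup)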
