I have code that calculates the date difference excluding weekends using np.busday_count, but I need the result in hours, which I am not able to get.
import numpy as np
import pandas as pd

df = pd.DataFrame()
df['Inflow_date_time'] = [pd.Timestamp('2019-07-22 21:11:26')]
df['End_date_time'] = [pd.Timestamp('2019-08-02 11:44:47')]
df['Day'] = [np.busday_count(b, a) for a, b in zip(df['End_date_time'].values.astype('datetime64[D]'),
                                                   df['Inflow_date_time'].values.astype('datetime64[D]'))]
Day
0 9
I need the output in hours, excluding weekends, like:
Hours
0 254
Problems
Inflow_date_time=2019-08-01 23:22:46
End_date_time = 2019-08-05 17:43:51
Hours expected 42 hours
(1+24+17)
Inflow_date_time=2019-08-03 23:22:46
End_date_time = 2019-08-05 17:43:51
Hours expected 17 hours
(0+0+17)
Inflow_date_time=2019-08-01 23:22:46
End_date_time = 2019-08-05 17:43:51
Hours expected 17 hours
(0+0+17)
Inflow_date_time=2019-07-26 23:22:46
End_date_time = 2019-08-05 17:43:51
Hours expected 138 hours
(1+120+17)
Inflow_date_time=2019-08-05 11:22:46
End_date_time = 2019-08-05 17:43:51
Hours expected 6 hours
(0+0+6)
Please suggest.
The idea is to floor the datetimes to remove the time component, then get the number of full business days between the start date rounded up to the next midnight and the end date rounded down to midnight into an hours3 column with numpy.busday_count. Then create hours1 and hours2 columns for the partial start and end days (floored to hours), but only if they do not fall on weekends. Finally, sum all the hour columns together:
df = pd.DataFrame(columns=['Inflow_date_time','End_date_time', 'need'])
df.Inflow_date_time= [pd.Timestamp('2019-08-01 23:22:46'),
pd.Timestamp('2019-08-03 23:22:46'),
pd.Timestamp('2019-08-01 23:22:46'),
pd.Timestamp('2019-07-26 23:22:46'),
pd.Timestamp('2019-08-05 11:22:46')]
df.End_date_time= [pd.Timestamp('2019-08-05 17:43:51')] * 5
df.need = [42,17,41,138,6]
#print (df)
df["hours1"] = df["Inflow_date_time"].dt.ceil('d')
df["hours2"] = df["End_date_time"].dt.floor('d')
one_day_mask = df["Inflow_date_time"].dt.floor('d') == df["hours2"]
df['hours3'] = [np.busday_count(b,a)*24 for a, b in zip(df['hours2'].dt.strftime('%Y-%m-%d'),
df['hours1'].dt.strftime('%Y-%m-%d'))]
mask1 = df['hours1'].dt.dayofweek < 5
hours1 = df['hours1'] - df['Inflow_date_time'].dt.floor('H')
df['hours1'] = np.where(mask1, hours1, np.nan) / np.timedelta64(1 ,'h')
mask2 = df['hours2'].dt.dayofweek < 5
df['hours2'] = (np.where(mask2, df['End_date_time'].dt.floor('H')-df['hours2'], np.nan) /
np.timedelta64(1 ,'h'))
df['date_diff'] = df['hours1'].fillna(0) + df['hours2'].fillna(0) + df['hours3']
one_day = ((df['End_date_time'].dt.floor('H') - df['Inflow_date_time'].dt.floor('H')) /
           np.timedelta64(1, 'h'))
df["date_diff"] = df["date_diff"].mask(one_day_mask, one_day)
print (df)
Inflow_date_time End_date_time need hours1 hours2 hours3 \
0 2019-08-01 23:22:46 2019-08-05 17:43:51 42 1.0 17.0 24
1 2019-08-03 23:22:46 2019-08-05 17:43:51 17 NaN 17.0 0
2 2019-08-01 23:22:46 2019-08-05 17:43:51 41 1.0 17.0 24
3 2019-07-26 23:22:46 2019-08-05 17:43:51 138 NaN 17.0 120
4 2019-08-05 11:22:46 2019-08-05 17:43:51 6 13.0 17.0 -24
date_diff
0 42.0
1 17.0
2 42.0
3 137.0
4 6.0
If I am not completely mistaken, you can also use a shorter workaround:
First save your day difference in an array:
res = np.busday_count(df['Inflow_date_time'].values.astype('datetime64[D]'), df['End_date_time'].values.astype('datetime64[D]'))
Then we need an extra hour column for every row:
df['starth'] = df['Inflow_date_time'].dt.hour
df['endh'] = df['End_date_time'].dt.hour
Then we add the day difference to your dataframe:
my_list = res.tolist()
dfhelp = pd.DataFrame(my_list, columns=['col1'])
df2 = pd.concat((df, dfhelp), axis=1)
Then we need a helper column, as the hour of End_date_time can be before the hour of Inflow_date_time:
df2['h'] = df2['endh']-df2['starth']
And then we can calculate the hour difference (one day has 24 hours; the branch depends on whether the end hour is before the start hour or not):
df2['differenceh'] = np.where(df2['h'] >= 0, df2['col1']*24+df2['h'], df2['col1']*24-24+(24+df2['h']))
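For illustration, here is a minimal end-to-end sketch of this workaround on two of the question's sample rows (a hypothetical frame built from the question's data; minutes are ignored, as in the formula above):

import numpy as np
import pandas as pd

# hypothetical frame built from two of the question's sample rows
df = pd.DataFrame({
    'Inflow_date_time': [pd.Timestamp('2019-08-01 23:22:46'),
                         pd.Timestamp('2019-07-26 23:22:46')],
    'End_date_time': [pd.Timestamp('2019-08-05 17:43:51')] * 2,
})

# business-day difference as an array
res = np.busday_count(df['Inflow_date_time'].values.astype('datetime64[D]'),
                      df['End_date_time'].values.astype('datetime64[D]'))

df2 = df.copy()
df2['starth'] = df2['Inflow_date_time'].dt.hour
df2['endh'] = df2['End_date_time'].dt.hour
df2['col1'] = res
df2['h'] = df2['endh'] - df2['starth']
df2['differenceh'] = np.where(df2['h'] >= 0,
                              df2['col1'] * 24 + df2['h'],
                              df2['col1'] * 24 - 24 + (24 + df2['h']))
print(df2[['col1', 'h', 'differenceh']])  # differenceh: 42 and 138, matching the expected values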
I updated jezrael's answer to work with version 1.x.x of pandas. I edited the code and the logic a bit to calculate the difference in hours and minutes.
Function
import numpy as np
import pandas as pd


def datetimes_hours_difference(df_end: pd.Series, df_start: pd.Series) -> pd.Series:
    """
    Calculate the total hours difference between two Pandas Series
    containing datetime values (df_end - df_start)

    Args:
        df_end (pd.Series): Contains datetime values
        df_start (pd.Series): Contains datetime values

    Returns:
        df_date_diff (pd.Series): Difference between df_end and df_start
    """
    df_start_hours = df_start.dt.ceil('d')
    df_end_hours = df_end.dt.floor('d')
    one_day_mask = df_start.dt.floor('d') == df_end_hours
    df_days_hours = [np.busday_count(b, a, weekmask='1111011') * 24
                     for a, b in zip(df_end_hours.dt.strftime('%Y-%m-%d'),
                                     df_start_hours.dt.strftime('%Y-%m-%d'))]
    mask1 = df_start.dt.dayofweek != 4
    hours1 = df_start_hours - df_start.dt.floor('min')
    hours1.loc[~mask1] = pd.NaT
    df_start_hours = hours1 / pd.to_timedelta(1, unit='H')
    df_start_hours = df_start_hours.fillna(0)
    mask2 = df_end.dt.dayofweek != 4
    hours2 = df_end.dt.floor('min') - df_end_hours
    hours2.loc[~mask2] = pd.NaT
    df_end_hours = hours2 / pd.to_timedelta(1, unit='H')
    df_end_hours = df_end_hours.fillna(0)
    df_date_diff = df_start_hours + df_end_hours + df_days_hours
    one_day = (df_end.dt.floor('min') - df_start.dt.floor('min'))
    one_day = one_day / pd.to_timedelta(1, unit='H')
    df_date_diff = df_date_diff.mask(one_day_mask, one_day)
    return df_date_diff
Example
df = pd.DataFrame({
'datetime1': ["2022-06-15 16:06:00", "2022-06-15 03:45:00", "2022-06-10 12:13:00", "2022-06-11 12:13:00", "2022-06-10 12:13:00", "2022-05-31 17:20:00"],
'datetime2': ["2022-06-22 22:36:00", "2022-06-15 22:36:00", "2022-06-22 10:10:00", "2022-06-22 10:10:00", "2022-06-24 10:10:00", "2022-06-02 05:29:00"],
'hours_diff': [150.5, 18.9, 250.9, 237.9, 288.0, 36.2]
})
df['datetime1'] = pd.to_datetime(df['datetime1'])
df['datetime2'] = pd.to_datetime(df['datetime2'])
df['hours_diff_fun'] = datetimes_hours_difference(df['datetime2'], df['datetime1'])
print(df)
datetime1 datetime2 hours_diff hours_diff_fun
0 2022-06-15 16:06:00 2022-06-22 22:36:00 150.5 150.500000
1 2022-06-15 03:45:00 2022-06-15 22:36:00 18.9 18.850000
2 2022-06-10 12:13:00 2022-06-22 10:10:00 250.9 250.166667
3 2022-06-11 12:13:00 2022-06-22 10:10:00 237.9 237.950000
4 2022-06-10 12:13:00 2022-06-24 10:10:00 288.0 288.000000
5 2022-05-31 17:20:00 2022-06-02 05:29:00 36.2 36.150000
Related
I am trying to derive a mean value for the average duration spent in a specific status by ID.
For this I first sort my data frame by ID and date, and with the apply and shift functions I try to compute the date of row[i+1] minus the date of row[i], provided row[i+1] and row[i] belong to the same ID.
I get the following exception: AttributeError: 'int' object has no attribute 'shift'
Below a code for simulation:
from datetime import datetime

import pandas as pd
today = datetime.today().strftime('%Y-%m-%d')
frame = pd.DataFrame({'id': [1245, 4556, 2345, 4556, 1248],'status': [1,2,4,5,6], 'date': ['2022-07-01', '2022-03-12', '2022-04-20', '2022-02-02', '2022-01-03']})
frame_ordered = frame.sort_values(['id','date'], ascending=True)
frame_ordered['duration'] = frame_ordered.apply(lambda x: x['date'].shift(-1) - x['date'] if x['id'] == x['id'].shift(-1) else today - x['date'], axis=1)
Can anyone please advise how to solve the last line with the lambda function?
I was not able to get it done with lambda. You can try it like this:
import datetime

import numpy as np
import pandas as pd

today = datetime.datetime.today()  # you want it as a real date, not a string
frame = pd.DataFrame({'id': [1245, 4556, 2345, 4556, 1248],'status': [1,2,4,5,6], 'date': ['2022-07-01', '2022-03-12', '2022-04-20', '2022-02-02', '2022-01-03']})
frame['date'] = pd.to_datetime(frame['date']) #convert date column to datetime
frame_ordered = frame.sort_values(['id','date'], ascending=True)
#add column with shifted date values
frame_ordered['shifted'] = frame_ordered['date'].shift(-1)
# mask where the next row has same id as current one
mask = frame_ordered['id'] == frame_ordered['id'].shift(-1)
print(mask)
# subtract date and shifted date if mask is true, otherwise subtract date from today. ".dt.days" only displays the days, not necessary
frame_ordered['duration'] = np.where(mask, (frame_ordered['shifted']-frame_ordered['date']).dt.days, (today-frame_ordered['date']).dt.days)
#delete shifted date column if you want
frame_ordered = frame_ordered.drop('shifted', axis=1)
print(frame_ordered)
Output:
#mask
0 False
4 False
2 False
3 True
1 False
Name: id, dtype: bool
#frame_ordered
id status date duration
0 1245 1 2022-07-01 25.0
4 1248 6 2022-01-03 204.0
2 2345 4 2022-04-20 97.0
3 4556 5 2022-02-02 38.0
1 4556 2 2022-03-12 136.0
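A more compact variant of the same per-ID logic (a sketch, not the original answer's code) shifts within each id group via groupby, so rows from different ids never get mixed:

import datetime

import pandas as pd

today = datetime.datetime.today()
frame = pd.DataFrame({'id': [1245, 4556, 2345, 4556, 1248],
                      'status': [1, 2, 4, 5, 6],
                      'date': ['2022-07-01', '2022-03-12', '2022-04-20', '2022-02-02', '2022-01-03']})
frame['date'] = pd.to_datetime(frame['date'])
frame_ordered = frame.sort_values(['id', 'date'])

# next date within the same id; NaT for the last row of each id
next_date = frame_ordered.groupby('id')['date'].shift(-1)
frame_ordered['duration'] = (next_date.fillna(today) - frame_ordered['date']).dt.days
print(frame_ordered)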
I think that the values were not interpreted as pandas Timestamps. With the right conversion it should be easy though:
from datetime import datetime

import pandas as pd
today = datetime.today().strftime('%Y-%m-%d')
frame = pd.DataFrame({'id': [1245, 4556, 2345, 4556, 1248],'status': [1,2,4,5,6], 'date': ['2022-07-01', '2022-03-12', '2022-04-20', '2022-02-02', '2022-01-03']})
frame['date'] = pd.to_datetime(frame['date'])
frame_ordered = frame.sort_values(['id','date'], ascending=True)
frame_ordered['shifted'] = frame_ordered['date'].shift(1)
frame_ordered['Difference'] = frame_ordered['date']-frame_ordered['date'].shift(1)
print(frame_ordered)
which prints out
id status date shifted Difference
0 1245 1 2022-07-01 NaT NaT
4 1248 6 2022-01-03 2022-07-01 -179 days
2 2345 4 2022-04-20 2022-01-03 107 days
3 4556 5 2022-02-02 2022-04-20 -77 days
1 4556 2 2022-03-12 2022-02-02 38 days
I have this pandas dataframe with times:
id time
1 4/01/2019 08:00:00
2 4/02/2019 12:00:00
3 4/03/2019 18:00:00
And I want the fraction of the day, fraction of the week and fraction of the month. For example, for the first row, 08:00:00 is one third of a day, so the first column should be 0.333. And it was a Monday, so it should be 0.047 (a complete day is 1/7 = 0.143 of a week, but since it's a third, then 0.143 * 0.333 = 0.047). And it was the start of the month, so it should be 0.011 (a complete day is 1/30 = 0.033 of a month, but it is only 8:00 am, so it is 0.033 * 0.333 = 0.011).
Please note that the values are for complete days, for example for 4/02/2019 12:00:00, only 1 day and a half is counted.
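A quick sanity check of that arithmetic for the first row (plain Python, assuming a 30-day month as in the example):

# 2019-04-01 08:00:00 -> Monday (weekday 0), first day of a 30-day month
frac_day = 8 / 24                      # 0.3333...
frac_week = (0 + frac_day) / 7         # 0.0476...
frac_month = (1 - 1 + frac_day) / 30   # 0.0111...
print(round(frac_day, 4), round(frac_week, 4), round(frac_month, 4))  # 0.3333 0.0476 0.0111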
The expected result should be:
id time frac_day frac_week frac_month
1 4/01/2019 08:00:00 0.333 0.047 0.011
2 4/02/2019 12:00:00 0.5 0.214 0.050
3 4/03/2019 18:00:00 0.75 0.393 0.092
Please, could you help me with this question in python? Any help will be greatly appreciated.
Try:
import pandas as pd
from pandas.tseries.offsets import MonthEnd, Day
df = pd.DataFrame({
'id': [1, 2, 3],
'time': ['4/01/2019 08:00:00', '4/02/2019 12:00:00',
'4/03/2019 18:00:00']
})
df['time'] = pd.to_datetime(df['time'])
# Percentage of Day by dividing current hour by number of hours in day
df['frac_day'] = df['time'].dt.hour / 24
# Get midnight the beginning of the week for each row
beginning_of_each_week = (
df['time'] - pd.to_timedelta(df['time'].dt.dayofweek, unit='D')
).dt.normalize()
seconds_in_week = 24 * 7 * 60 * 60
# % of week so far. by dividing total seconds by total seconds in week
df['frac_week'] = (
df['time'] - beginning_of_each_week
).dt.total_seconds() / seconds_in_week
# Get Timedelta based on midnight of the first day of the current month
time_so_far = df['time'] - (df['time'] - MonthEnd(1) + Day(1)).dt.normalize()
# Get Total time for the given month
time_in_month = (df['time'] + MonthEnd(1)) - (df['time'] - MonthEnd(1))
# % of month so far by dividing values
df['frac_month'] = time_so_far / time_in_month
df:
id time frac_day frac_week frac_month
0 1 2019-04-01 08:00:00 0.333333 0.047619 0.011111
1 2 2019-04-02 12:00:00 0.500000 0.214286 0.050000
2 3 2019-04-03 18:00:00 0.750000 0.392857 0.091667
Another solution:
df["frac_day"] = (
df["time"].dt.hour * 60 * 60
+ df["time"].dt.minute * 60
+ df["time"].dt.second
) / (24 * 60 * 60)
df["frac_week"] = (df["time"].dt.dayofweek + df["frac_day"]) / 7
df["frac_month"] = df["time"].apply(lambda x: pd.Period(str(x)).days_in_month)
df["frac_month"] = ((df["time"].dt.day - 1) / df["frac_month"]) + df[
"frac_day"
] / df["frac_month"]
print(df)
Prints:
id time frac_day frac_week frac_month
0 1 2019-04-01 08:00:00 0.333333 0.047619 0.011111
1 2 2019-04-02 12:00:00 0.500000 0.214286 0.050000
2 3 2019-04-03 18:00:00 0.750000 0.392857 0.091667
You'll find a lot of built-in support for time and datetime functions in pandas.
First, make sure your df['time'] column is correctly stored as a datetime; then the following should do the trick:
# get number of seconds elapsed in the day - total seconds in a day
# note here we create a timedelta
# hack: use df['time'].dt.date to set time to 00:00:00
df['frac_day'] = (df['time'] - pd.to_datetime(df['time'].dt.date)).dt.total_seconds() / (24 * 3600)
df['frac_week'] = (df['time'].dt.dayofweek + df['frac_day']) / 7
df['frac_month'] = (df['time'].dt.day + df['frac_day'] - 1)/ df['time'].dt.days_in_month
df
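If the time column still holds strings, a minimal conversion sketch using the question's sample values (the format string is an assumption inferred from those samples):

import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3],
                   'time': ['4/01/2019 08:00:00', '4/02/2019 12:00:00', '4/03/2019 18:00:00']})
# assumed month/day/year format, based on the sample values
df['time'] = pd.to_datetime(df['time'], format='%m/%d/%Y %H:%M:%S')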
Let's say I have a dataframe like this
df1:
datetime1 datetime2
0 2021-05-09 19:52:14 2021-05-09 20:52:14
1 2021-05-09 19:52:14 2021-05-09 21:52:14
2 NaN NaN
3 2021-05-09 16:30:14 NaN
4 NaN NaN
5 2021-05-09 12:30:14 2021-05-09 14:30:14
I want to compare the timestamps in datetime1 and datetime2 and create a new column with the difference between them.
In some scenarios there are no values in datetime1 and datetime2, or there are values in datetime1 but not in datetime2. So is there a way to get NaN in the "difference" column when there is no timestamp in datetime1 and datetime2, and, when there is a timestamp only in datetime1, to get the difference compared to datetime.now() and put that in another column?
Desirable df output:
datetime1 datetime2 Difference in H:m:s Compared with datetime.now()
0 2021-05-09 19:52:14 2021-05-09 20:52:14 01:00:00 NaN
1 2021-05-09 19:52:14 2021-05-09 21:52:14 02:00:00 NaN
2 NaN NaN NaN NaN
3 2021-05-09 16:30:14 NaN NaN e.g(04:00:00)
4 NaN NaN NaN NaN
5 2021-05-09 12:30:14 2021-05-09 14:30:14 02:00:00 NaN
I tried a solution from @AndrejKesely, but it fails if there is no timestamp in datetime1 and datetime2:
def strfdelta(tdelta, fmt):
    d = {"days": tdelta.days}
    d["hours"], rem = divmod(tdelta.seconds, 3600)
    d["minutes"], d["seconds"] = divmod(rem, 60)
    return fmt.format(**d)
# if datetime1/datetime2 aren't already datetime, apply `.to_datetime()`:
df["datetime1"] = pd.to_datetime(df["datetime1"])
df["datetime2"] = pd.to_datetime(df["datetime2"])
df["Difference in H:m:s"] = df.apply(
lambda x: strfdelta(
x["datetime2"] - x["datetime1"],
"{hours:02d}:{minutes:02d}:{seconds:02d}",
),
axis=1,
)
print(df)
Select only the rows that match the conditions by using boolean indexing (masks) to do what you need, and let Pandas fill the missing values with NaN:
def strfdelta(td: pd.Timedelta):
    seconds = td.total_seconds()
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = int(seconds % 60)
    return f"{hours:02}:{minutes:02}:{seconds:02}"
bm1 = df["datetime1"].notna() & df["datetime2"].notna()
bm2 = df["datetime1"].notna() & df["datetime2"].isna()
df["Difference in H:m:s"] = (df.loc[bm1, "datetime2"] - df.loc[bm1, "datetime1"]).apply(strfdelta)
df["Compared with datetime.now()"] = (datetime.now() - df.loc[bm2, "datetime1"]).apply(strfdelta)
>>> df
datetime1 datetime2 Diff... Comp...
0 2021-05-09 19:52:14 2021-05-09 20:52:14 01:00:00 NaN
1 2021-05-09 19:52:14 2021-05-09 21:52:14 02:00:00 NaN
2 NaT NaT NaN NaN
3 2021-05-09 16:30:14 NaT NaN 103:09:19
4 NaT NaT NaN NaN
5 2021-05-09 12:30:14 2021-05-09 14:30:14 02:00:00 NaN
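For reference, a minimal setup sketch with the imports and the frame the snippet above assumes (built from the question's df1 data, here named df to match the answer):

from datetime import datetime

import pandas as pd

df = pd.DataFrame({
    'datetime1': ['2021-05-09 19:52:14', '2021-05-09 19:52:14', None,
                  '2021-05-09 16:30:14', None, '2021-05-09 12:30:14'],
    'datetime2': ['2021-05-09 20:52:14', '2021-05-09 21:52:14', None,
                  None, None, '2021-05-09 14:30:14'],
})
df['datetime1'] = pd.to_datetime(df['datetime1'])
df['datetime2'] = pd.to_datetime(df['datetime2'])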
You could start by replacing all NaN values in the datetime2 column with the current date. That makes it easier to compare datetime1 to now when datetime2 is NaN.
You can do it with :
df["datetime2"] = df["datetime2"].fillna(value=pandas.to_datetime('today').normalize(),axis=1)
Then you have only 2 conditions remaining:
If datetime1 column is empty, the result is NaN.
Otherwise, the result is the difference between datetime1 and datetime2 column (as there is no NaN remaining in datetime2 column).
You can perform this with:
import numpy as np
df["Difference in H:m:s"] = np.where(
df["datetime1"].isnull(),
pd.NA,
df["datetime2"] - df["datetime1"]
)
Finally, you can format your Difference in H:m:s column in the required format with the function you provided:
def strfdelta(tdelta, fmt):
    d = {"days": tdelta.days}
    d["hours"], rem = divmod(tdelta.seconds, 3600)
    d["minutes"], d["seconds"] = divmod(rem, 60)
    return fmt.format(**d)
df["Difference in H:m:s"] = df.apply(
lambda x: strfdelta(
x["Difference in H:m:s"],
"{hours:02d}:{minutes:02d}:{seconds:02d}",
),
axis=1,
)
The complete code is:
import numpy as np
import pandas as pd

# if datetime1/datetime2 aren't already datetime, apply `.to_datetime()`:
df["datetime1"] = pd.to_datetime(df["datetime1"])
df["datetime2"] = pd.to_datetime(df["datetime2"])
df["datetime2"] = df["datetime2"].fillna(value=pd.to_datetime('today').normalize())
df["Difference in H:m:s"] = np.where(
df["datetime1"].isnull(),
pd.NA,
df["datetime2"] - df["datetime1"]
)
def strfdelta(tdelta, fmt):
    d = {"days": tdelta.days}
    d["hours"], rem = divmod(tdelta.seconds, 3600)
    d["minutes"], d["seconds"] = divmod(rem, 60)
    return fmt.format(**d)
df["Difference in H:m:s"] = df.apply(
lambda x: strfdelta(
x["Difference in H:m:s"],
"{hours:02d}:{minutes:02d}:{seconds:02d}",
),
axis=1,
)
I have a dataframe like as shown below
df = pd.DataFrame({'person_id': [11,11,11,21,21],
'offset' :['-131 days','29 days','142 days','20 days','-200 days'],
'date_1': ['05/29/2017', '01/21/1997', '7/27/1989','01/01/2013','12/31/2016'],
'dis_date': ['05/29/2017', '01/24/1999', '7/22/1999','01/01/2015','12/31/1991'],
'vis_date':['05/29/2018', '01/27/1994', '7/29/2011','01/01/2018','12/31/2014']})
df['date_1'] = pd.to_datetime(df['date_1'])
df['dis_date'] = pd.to_datetime(df['dis_date'])
df['vis_date'] = pd.to_datetime(df['vis_date'])
I would like to shift all the dates of each subject based on their offset.
Though my code works (credit: SO), I am looking for a more elegant approach. You can see I am kind of repeating almost the same line three times.
df['offset_to_shift'] = pd.to_timedelta(df['offset'],unit='d')
#am trying to make the below lines elegant/efficient
df['shifted_date_1'] = df['date_1'] + df['offset_to_shift']
df['shifted_dis_date'] = df['dis_date'] + df['offset_to_shift']
df['shifted_vis_date'] = df['vis_date'] + df['offset_to_shift']
I expect my output to be as shown below.
Use DataFrame.add (passing axis=0 so the offset Series is aligned row-wise) along with DataFrame.add_prefix and DataFrame.join:
cols = ['date_1', 'dis_date', 'vis_date']
df = df.join(df[cols].add(df['offset_to_shift'], 0).add_prefix('shifted_'))
OR, it is also possible to use pd.concat:
df = pd.concat([df, df[cols].add(df['offset_to_shift'], 0).add_prefix('shifted_')], axis=1)
OR, we can also directly assign the new shifted columns to the dataframe:
df[['shifted_' + col for col in cols]] = df[cols].add(df['offset_to_shift'], 0)
Result:
# print(df)
person_id offset date_1 dis_date vis_date offset_to_shift shifted_date_1 shifted_dis_date shifted_vis_date
0 11 -131 days 2017-05-29 2017-05-29 2018-05-29 -131 days 2017-01-18 2017-01-18 2018-01-18
1 11 29 days 1997-01-21 1999-01-24 1994-01-27 29 days 1997-02-19 1999-02-22 1994-02-25
2 11 142 days 1989-07-27 1999-07-22 2011-07-29 142 days 1989-12-16 1999-12-11 2011-12-18
3 21 20 days 2013-01-01 2015-01-01 2018-01-01 20 days 2013-01-21 2015-01-21 2018-01-21
4 21 -200 days 2016-12-31 1991-12-31 2014-12-31 -200 days 2016-06-14 1991-06-14 2014-06-14
I want to insert a new column called total in final_df, which is a cumulative sum of value in df if it occurs between the times in final_df. It sums the values if they occur between the start and end in final_df. So, for example, during the time range 01:30 to 02:00 in final_df, both index 0 and 1 in df occur between this time range, so the total is 15 (10+5).
I have two pandas dataframes:
df
import pandas as pd
d = {'start_time': ['01:00','00:00','00:30','02:00'],
'end_time': ['02:00','03:00','01:30','02:30'],
'value': ['10','5','20','5']}
df = pd.DataFrame(data=d)
final_df
final_d = {'start_time': ['00:00', '00:30', '01:00', '01:30', '02:00', '02:30'],
           'end_time': ['00:30', '01:00', '01:30', '02:00', '02:30', '03:00']}
final_df = pd.DataFrame(data=final_d)
The output I want for final_df:
start_time end_time total
00:00 00:30 5
00:30 01:00 25
01:00 01:30 35
01:30 02:00 15
02:30 03:00 10
My try
final_df['total'] = final_df.apply(lambda x: df.loc[(df['start_time'] >= x.start_time) &
(df['end_time'] <= x.end_time), 'value'].sum(), axis=1)
Problem 1
I get the error: TypeError: ("'>=' not supported between instances of 'str' and 'datetime.time'", 'occurred at index 0')
I converted the relevant columns to datetime as follows:
df[['start_time','end_time']] = df[['start_time','end_time']].apply(pd.to_datetime, format='%H:%M')
final_df[['start_time','end_time']] = final_df[['start_time','end_time']].apply(pd.to_datetime, format='%H:%M:%S')
But I don't want to convert to datetime. Is there a way around this?
Problem 2
The sum is not working properly. It only looks for an exact match of the time range. So the output is:
start_time end_time total
00:00 00:30 0
00:30 01:00 0
01:00 01:30 0
01:30 02:00 0
02:30 03:00 5
One way to avoid apply could be like this.
df_ = (df.rename(columns={'start_time':1, 'end_time':-1}) #to use in the calculation later
.rename_axis(columns='mult') # mostly for esthetic
.set_index('value').stack() #reshape the data
.reset_index(name='time') # put the index back to columns
)
df_ = (df_.set_index(pd.to_datetime(df_['time'], format='%H:%M')) #to use resampling technic
.assign(total=lambda x: x['value'].astype(float)*x['mult']) #get plus or minus the value depending start/end
.resample('30T')[['total']].sum() # get the sum at the 30min bounds
.cumsum() #cumulative sum from the beginning
)
# create the column for merge with final resul
df_['start_time'] = df_.index.strftime('%H:%M')
# merge
final_df = final_df.merge(df_)
and you get
print (final_df)
start_time end_time total
0 00:00 00:30 5.0
1 00:30 01:00 25.0
2 01:00 01:30 35.0
3 01:30 02:00 15.0
4 02:00 02:30 10.0
5 02:30 03:00 5.0
But if you want to use apply, first you need to ensure that the columns have the right dtype, and then you need the inequalities in the reverse order (each df interval should contain the final_df interval, not the other way around), like:
df['start_time'] = pd.to_datetime(df['start_time'], format='%H:%M')
df['end_time'] = pd.to_datetime(df['end_time'], format='%H:%M')
df['value'] = df['value'].astype(float)
final_df['start_time'] = pd.to_datetime(final_df['start_time'], format='%H:%M')
final_df['end_time'] = pd.to_datetime(final_df['end_time'], format='%H:%M')
final_df.apply(
lambda x: df.loc[(df['start_time'] <= x.start_time) & #see other inequality
(df['end_time'] >= x.end_time), 'value'].sum(), axis=1)
0 5.0
1 25.0
2 35.0
3 15.0
4 10.0
5 5.0
dtype: float64