Subtract business days (with holidays) in pandas - python

Suppose I have a dataframe:
Case_no  Event_ID  Date        Type        Done_by
1        1         2023-01-01  Email Sent  Customer
1        2         2023-01-02  Response    Agent
1        3         2023-01-03  Email Sent  Customer
1        4         2023-01-10  Response    Agent
2        5         2023-01-02  Email Sent  Customer
2        6         2023-01-04  Email Sent  Customer
I have found a function to apply that flags whether an email has had a response and whether that response came within 5 days, as follows (assuming the above is df3):
def f(x):
    m = x['Done_by'].eq('Customer')
    for k, v in x.loc[m, 'Date'].items():
        s = x[~m].loc[k:, 'Date'].sub(v).dt.days
        x.loc[k, 'Response?'] = not s.empty
        x.loc[k, 'Response_Within_Five_Days'] = s.lt(5).any()
        #NEED TO ADD WORKING DAY AND BANK HOLIDAY EXCEPTIONS AS
    return x

df4 = df3.reset_index(drop=True)
df4 = df4.groupby('Reference #').apply(f)
df4
This will return
Case_no  Event_ID  Date        Type        Done_by   Response?  Response_Within_Five_Days
1        1         2023-01-01  Email Sent  Customer  TRUE       TRUE
1        2         2023-01-02  Response    Agent     NaN        NaN
1        3         2023-01-03  Email Sent  Customer  TRUE       FALSE
1        4         2023-01-10  Response    Agent     NaN        NaN
2        5         2023-01-02  Email Sent  Customer  FALSE      FALSE
2        6         2023-01-04  Email Sent  Customer  FALSE      FALSE
I also know that I can add and subtract business days using the following code:
import pandas as pd
from pandas.tseries.offsets import CustomBusinessDay
import holidays

# England bank holidays as 'YYYY-MM-DD' strings
uk_holidays = holidays.GB(subdiv='England', years=[2022, 2023, 2024, 2025])
myholidays = [holiday.strftime('%Y-%m-%d') for holiday in uk_holidays]

bday_uk = CustomBusinessDay(holidays=myholidays)
ts = pd.Timestamp(2022, 12, 23)
ts + (2 * bday_uk)
result : Timestamp('2022-12-29 00:00:00')
How would I go about swapping the
s = x[~m].loc[k:, 'Date'].sub(v).dt.days
for something that would subtract business days instead? I have read about creating a custom accessor but I am not going to lie when I say I was not able to wrap my head around it enough to do what I want. Would I have to create a custom accessor or is there an easier way to subtract business days in my function?

I think you need to use the pandas.tseries.offsets.CustomBusinessDay class to subtract business days in your function.
import pandas as pd
import numpy as np
from pandas.tseries.offsets import CustomBusinessDay
import holidays

uk_holidays = holidays.GB(subdiv='England', years=[2022, 2023, 2024, 2025])
myholidays = [holiday.strftime('%Y-%m-%d') for holiday in uk_holidays]
bday_uk = CustomBusinessDay(holidays=myholidays)

df = pd.DataFrame({
    'Case_no': [1, 1, 1, 1, 2, 2],
    'Event_ID': [1, 2, 3, 4, 5, 6],
    'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-10', '2023-01-02', '2023-01-04'],
    'Type': ['Email Sent', 'Response', 'Email Sent', 'Response', 'Email Sent', 'Email Sent'],
    'Done_by': ['Customer', 'Agent', 'Customer', 'Agent', 'Customer', 'Customer']
})
df['Date'] = pd.to_datetime(df['Date'])

def f(x):
    m = x['Done_by'].eq('Customer')
    for k, v in x.loc[m, 'Date'].items():
        # roll both dates back to the previous business day before differencing
        s = x[~m].loc[k:, 'Date'].apply(lambda d: bday_uk.rollback(d)).sub(bday_uk.rollback(v)).dt.days
        x.loc[k, 'Response?'] = not s.empty
        x.loc[k, 'Response_Within_Five_Days'] = (s >= -5).any()
    return x

df = df.groupby('Case_no').apply(f).reset_index(drop=True)
print(df)
which will return
Case_no Event_ID Date Type Done_by Response? Response_Within_Five_Days
0 1 1 2023-01-01 Email Sent Customer True True
1 1 2 2023-01-02 Response Agent False False
2 1 3 2023-01-03 Email Sent Customer True False
3 1 4 2023-01-10 Response Agent False False
4 2 5 2023-01-02 Email Sent Customer False False
5 2 6 2023-01-04 Email Sent Customer False False
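One thing to note: even after rollback, the subtraction above still yields calendar-day differences. If the goal is specifically to count business days between the two dates, an alternative worth sketching is numpy.busday_count, which accepts the same holiday list. A minimal sketch, reusing myholidays and df from above and keeping the grouping logic unchanged (a hedged alternative, not the answer's method):
import numpy as np

def f(x):
    m = x['Done_by'].eq('Customer')
    for k, v in x.loc[m, 'Date'].items():
        later = x[~m].loc[k:, 'Date']
        # np.busday_count counts business days between dates, skipping weekends
        # and any dates passed via holidays=; inputs must be datetime64[D]
        s = np.busday_count(np.datetime64(v.date()),
                            later.values.astype('datetime64[D]'),
                            holidays=myholidays)
        x.loc[k, 'Response?'] = len(s) > 0
        x.loc[k, 'Response_Within_Five_Days'] = bool((s < 5).any())
    return x

df = df.groupby('Case_no').apply(f).reset_index(drop=True)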

Related

Calculate the average amount of items posted per day/month/year using python

I want to calculate the average amount of social media posts posted per day/month/year
given data with this structure:
[{'created_utc': 1669804989}, {'created_utc': 1669804782}, {'created_utc': 1669804772}] # etc...
(It's thousands of entries long.)
I came up with the following code:
import datetime
def unix_to_utc(unix_time: int):
utc = datetime.datetime.utcfromtimestamp(unix_time)
return datetime.date(
utc.year, utc.month, utc.day
)
def avarege_number(numbers: list) -> float:
return sum(numbers) / len(numbers)
def grop_by_day(posts: list) -> list:
items = []
for post in posts:
items.append(
unix_to_utc(post['created_utc'])
)
grouped = []
for date in items:
appended = False
for group in grouped:
if group[0] != date:
continue
group.append(date)
appended = True
if not appended:
grouped.append(
[date]
)
return grouped
def post_per(posts: list):
grouped = grop_by_day(post)
var_name = []
for i in grouped:
var_name.append(
len(i)
)
return avarege_number(
var_name
)
calling the code:
data = [{'created_utc': 1669804989}, {'created_utc': 1669804782}, {'created_utc': 1669804772}]
result = post_per(data)
There are a lot of problems with the code, but I can't think of a better solution.
The main one is that it does not account for days with no posts.
And I have no idea how to calculate per month/year.
You can use pandas.Grouper to group the dates by Day/Month/Year:
import pandas as pd
data = [{'created_utc': 1669804989}, {'created_utc': 1669804782}, {'created_utc': 1669894772},
{'created_utc': 1669904989}, {'created_utc': 1669904782}, {'created_utc': 1669904772},
{'created_utc': 1679804989}, {'created_utc': 1679804782}, {'created_utc': 1679804772}]
df = pd.DataFrame(data)
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
df['count'] = 0
print(df.groupby(pd.Grouper(key='created_utc', freq='D')).count())
print(df.groupby(pd.Grouper(key='created_utc', freq='MS')).count())
print(df.groupby(pd.Grouper(key='created_utc', freq='YS')).count())
Output:
count # Day
created_utc
2022-11-30 2
2022-12-01 4
2022-12-02 0
2022-12-03 0
2022-12-04 0
... ...
2023-03-22 0
2023-03-23 0
2023-03-24 0
2023-03-25 0
2023-03-26 3
[117 rows x 1 columns]
count # Month
created_utc
2022-11-01 2
2022-12-01 4
2023-01-01 0
2023-02-01 0
2023-03-01 3
count # Year
created_utc
2022-01-01 6
2023-01-01 3
You can then compute the average of these dataframes with mean().
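For example, a short sketch of taking the averages from the grouped counts (same df as above; the variable names are just for illustration):
daily_counts = df.groupby(pd.Grouper(key='created_utc', freq='D')).count()['count']
print(daily_counts.mean())    # average posts per day; days with no posts count as 0
monthly_counts = df.groupby(pd.Grouper(key='created_utc', freq='MS')).count()['count']
print(monthly_counts.mean())  # average posts per month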

[Python], How to calculate 'service response time' per conversation id

I would like to calculate the average 'service response time' per conversation id as a variable in a dataframe (in minutes).
The 'service response time' is calculated by the difference between the 'created_at' variable from Y and X in minutes:
X = the first row where owner_type == "User" and is_interaction == 1.
Y = the first row after X where owner_type == "Agent" and owner_id != 1.
Update:
id      owner_type  owner_id  conversation_id  message  created_at        is_interaction
260943  Agent       1         26276            a        01/03/2022 15:00
265544  Agent       1         26276            b        05/03/2022 12:01
266749  User        153263    26276            c        05/03/2022 15:49  1
266750  User        153263    26276            d        05/03/2022 15:49  1
266753  Agent       14        26276            e        05/03/2022 15:51
267003  Agent       1         26276            f        06/03/2022 12:01
268900  User        153263    26276            g        06/03/2022 17:01  1
268904  Agent       1         26276            h        07/03/2022 12:00
271141  Agent       1         26276            i        09/03/2022 12:00
271725  User        153263    26276            j        09/03/2022 13:01  1
271728  User        153263    26276            k        09/03/2022 13:01  1
271727  Agent       10        26277            l        09/03/2022 13:01
272085  Agent       1         26276            m        10/03/2022 12:01
Any ideas on how to calculate this?
Update:
The resulting output should look like this: the same dataframe with an additional column holding the response time in minutes. (In the original screenshot that column is labelled "Average Response Time (in minutes)"; it should be named "srt" instead, it is not actually an average, and the "Date" column is not needed.)
Best regards,
Milan Passchier
Update 04.11.2022
If each event had a unique ID this would be easier. Also note: 'X = the first row where owner_type == "User" and is_interaction == 1' is not really the first such row, but the last one before 'Y = the first row after X where owner_type == "Agent" and owner_id != 1'.
I offer two options. In both cases the created_at column is converted to datetime with pd.to_datetime, an 'srt' column is created with empty values, and explicit loc indexing is used.
In the first option the main logic lives in list comprehensions (they are many times faster than a plain loop).
In more detail:
First, a list bbb is built; at each iteration the condition
if df.loc[i, 'owner_type'] == 'User' and df.loc[i, 'is_interaction'] == 1
is checked. If it is met, the row index is recorded and the my_func function is called with that index and the row's 'conversation_id'. The function slices the dataframe from i to the end and looks for an 'Agent' row whose owner_id is not 1 and whose 'conversation_id' matches. The first such row is taken:
m = aaa.index[0]
If there is no such row, the function returns -1.
This gives the list bbb, with the User indices on the left and the matching Agent indices on the right.
The fff list then keeps only the rows where the Agent index stops repeating, i.e. the last User row for each Agent response, and drops unmatched pairs.
Finally, a loop uses these index pairs to fill in the required 'srt' values with loc.
Code (list comprehensions):
import numpy as np
import pandas as pd

df['created_at'] = pd.to_datetime(df['created_at'], errors='raise')
df['srt'] = np.nan

def my_func(i, id):
    m = -1
    aaa = df[i:]
    aaa = aaa[(df.loc[i:, 'conversation_id'] == id) & (df.loc[i:, 'owner_type'] == 'Agent')
              & (df.loc[i:, 'owner_id'] != 1)]
    if len(aaa) > 0:
        m = aaa.index[0]
    return m

bbb = np.array([[i, my_func(i, df.loc[i, 'conversation_id'])]
                for i in range(len(df)) if df.loc[i, 'owner_type'] == 'User' and df.loc[i, 'is_interaction'] == 1])

fff = [bbb[i] for i in range(len(bbb) - 1) if (bbb[i, 1] != bbb[i + 1:, 1]).all() == True and bbb[i, 1] != -1]
if len(bbb) > 1 and bbb[-1, 1] != -1:
    fff.append(bbb[-1])
fff = np.array(fff)

for i in fff:
    df.loc[i[0], 'srt'] = (df.loc[i[1], 'created_at'] - df.loc[i[0], 'created_at']) / np.timedelta64(1, 'm')

print(df)
The second option puts all the logic in a loop.
import numpy as np
import pandas as pd

df['created_at'] = pd.to_datetime(df['created_at'], errors='raise')
df['srt'] = np.nan

ferst_time_user = 0
cid = 0
ind = 0
for i in range(len(df)):
    if df.loc[i, 'owner_type'] == 'User' and df.loc[i, 'is_interaction'] == 1:
        ferst_time_user = df.loc[i, 'created_at']
        cid = df.loc[i, 'conversation_id']
        ind = i
    if ferst_time_user != 0 and df.loc[i, 'conversation_id'] == cid and df.loc[i, 'owner_type'] == 'Agent' and df.loc[i, 'owner_id'] != 1:
        df.loc[ind, 'srt'] = (df.loc[i, 'created_at'] - ferst_time_user) / np.timedelta64(1, 'm')
        ferst_time_user = 0
        ind = 0
        cid = 0

print(df)
Output
id owner_type owner_id conversation_id message created_at \
0 260943 Agent 1 26276 a 2022-01-03 15:00:00
1 265544 Agent 1 26276 b 2022-05-03 12:01:00
2 266749 User 153263 26276 c 2022-05-03 15:49:00
3 266750 User 153263 26276 d 2022-05-03 15:49:00
4 266753 Agent 14 26276 e 2022-05-03 15:51:00
5 267003 Agent 1 26276 f 2022-06-03 12:01:00
6 268900 User 153263 26276 g 2022-06-03 17:01:00
7 268904 Agent 1 26276 h 2022-07-03 12:00:00
8 271141 Agent 1 26276 i 2022-09-03 12:00:00
9 271725 User 153263 26276 j 2022-09-03 13:01:00
10 271728 User 153263 26276 k 2022-09-03 13:01:00
11 271727 Agent 10 26277 l 2022-09-03 13:01:00
12 272085 Agent 1 26276 m 2022-10-03 12:01:00
is_interaction srt
0 NaN NaN
1 NaN NaN
2 1.0 NaN
3 1.0 2.0
4 NaN NaN
5 NaN NaN
6 1.0 NaN
7 NaN NaN
8 NaN NaN
9 1.0 NaN
10 1.0 NaN
11 NaN NaN
12 NaN NaN
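If the dataframe is large, a more vectorized sketch is possible with pandas.merge_asof, matching each qualifying Agent reply to the latest preceding qualifying User message in the same conversation. This is only an assumption-laden alternative (it attaches srt to the Agent rows and matches every qualifying Agent reply, so it is not a drop-in replacement for the loop-based options above):
import pandas as pd

# assumes df has the columns shown above and created_at is already datetime
users = df[(df['owner_type'] == 'User') & (df['is_interaction'] == 1)]
agents = df[(df['owner_type'] == 'Agent') & (df['owner_id'] != 1)]

# merge_asof with direction='backward' finds, for each Agent reply,
# the most recent User message in the same conversation
matched = pd.merge_asof(
    agents.sort_values('created_at'),
    users.sort_values('created_at')[['conversation_id', 'created_at']]
         .rename(columns={'created_at': 'user_at'}),
    left_on='created_at', right_on='user_at',
    by='conversation_id', direction='backward')
matched['srt'] = (matched['created_at'] - matched['user_at']).dt.total_seconds() / 60
print(matched[['conversation_id', 'created_at', 'user_at', 'srt']])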

Dataframe column locating min and max value depending on an ID

I'm wondering how to optimize a part of code to remove a loop which takes forever since I have around 350 000 IDs.
Here is the current code, which is not optimal and takes quite a while.
I'm trying to get it working better and if possible removing a loop.
The dataset is made of 4 columns: ID, start_date, end_date and value. There can be multiple rows with the same ID but different values. The catch is that in some rows the dates are not saved in the dataset. In that case we have to find the earliest start_date and the latest end_date for that ID and fill them into the row where they are missing.
ID   start_date  end_date    value
ABC  12/10/2010  12/12/2020  8
ABC  01/01/2020  01/04/2021  9
ABC                          43
BCD  14/02/2020  14/03/2020  8
So the third row should get start_date 12/10/2010 and end_date 01/04/2021. You can't see it in this sample, but keep in mind that BCD's start_date could be earlier than ABC's; you would still use 12/10/2010 because the fill is tied to the ID.
for x in df['ID'].unique():
    tmp = df.loc[df['ID'] == x].reset_index()
    df.loc[(df['ID'] == x) & (df['start_date'].isna()), 'start_date'] = tmp['start_date'].min()
    df.loc[(df['ID'] == x) & (df['end_date'].isna()), 'end_date'] = tmp['end_date'].max()
I suppose the code is quite clear about what I am trying to do.
But if you have any questions, don't hesitate to post them; I'll do my best to answer.
set up the job
import pandas as pd
data = { 'ID': ['ABC','ABC','ABC','BCD'], 'start_date' : ['12/10/2010', '01/01/2020',None ,'14/02/2020'], 'end_date': ['12/12/2020', '01/01/2021',None ,'14/03/2020'], 'value': [8,9,43,8]}
df = pd.DataFrame(data)
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
we get this result
ID start_date end_date value
0 ABC 2010-12-10 2020-12-12 8
1 ABC 2020-01-01 2021-01-01 9
2 ABC NaT NaT 43
3 BCD 2020-02-14 2020-03-14 8
do the work
df.start_date = df.groupby('ID')['start_date'].apply(lambda x: x.fillna(x.min()))
df.end_date = df.groupby('ID')['end_date'].apply(lambda x: x.fillna(x.max()))
we get this result
ID start_date end_date value
0 ABC 2010-12-10 2020-12-12 8
1 ABC 2020-01-01 2021-01-01 9
2 ABC 2010-12-10 2021-01-01 43
3 BCD 2020-02-14 2020-03-14 8
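As a side note, a sketch of the same fill using transform, which always returns a result aligned to the original index (on newer pandas versions groupby().apply() on a Series can attach the group keys to the index, which breaks the assignment above):
# equivalent fill with transform; assumes the same df as in this answer
df['start_date'] = df['start_date'].fillna(df.groupby('ID')['start_date'].transform('min'))
df['end_date'] = df['end_date'].fillna(df.groupby('ID')['end_date'].transform('max'))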

Pandas dataframe conditional cumulative sum based on date range

I have a pandas dataframe:
Date Party Status
-------------------------------------------
0 01-01-2018 John Sent
1 13-01-2018 Lisa Received
2 15-01-2018 Will Received
3 19-01-2018 Mark Sent
4 02-02-2018 Will Sent
5 28-02-2018 John Received
I would like to add new columns that perform a .cumsum(), but it is conditional on the dates. It would look like this:
Num of Sent Num of Received
Date Party Status in Past 30 Days in Past 30 Days
-----------------------------------------------------------------------------------
0 01-01-2018 John Sent 1 0
1 13-01-2018 Lisa Received 1 1
2 15-01-2018 Will Received 1 2
3 19-01-2018 Mark Sent 2 2
4 02-02-2018 Will Sent 2 2
5 28-02-2018 John Received 1 1
I managed to implement what I need by writing the following code:
def inner_func(date_var, status_var, date_array, status_array):
    sent_increment = 0
    received_increment = 0
    for k in range(0, len(date_array)):
        if((date_var - date_array[k]).days <= 30):
            if(status_array[k] == "Sent"):
                sent_increment += 1
            elif(status_array[k] == "Received"):
                received_increment += 1
    return sent_increment, received_increment

import pandas as pd
import time

df = pd.DataFrame({"Date": pd.to_datetime(["01-01-2018", "13-01-2018", "15-01-2018", "19-01-2018", "02-02-2018", "28-02-2018"]),
                   "Party": ["John", "Lisa", "Will", "Mark", "Will", "John"],
                   "Status": ["Sent", "Received", "Received", "Sent", "Sent", "Received"]})
df = df.sort_values("Date")

date_array = []
status_array = []
for i in range(0, len(df)):
    date_var = df.loc[i, "Date"]
    date_array.append(date_var)
    status_var = df.loc[i, "Status"]
    status_array.append(status_var)
    sent_count, received_count = inner_func(date_var, status_var, date_array, status_array)
    df.loc[i, "Num of Sent in Past 30 days"] = sent_count
    df.loc[i, "Num of Received in Past 30 days"] = received_count
However, the process is computationally expensive and painfully slow when df is large, since the nested loops go through the dataframe twice. Is there a more pythonic way to implement what I am trying to achieve without iterating through the dataframe in the way I am doing?
Update 2
Michael has provided the solution to what I am looking for in the answer below. Let's assume that I want to apply the solution to groupby objects, for example using the rolling solution to compute the cumulative sums for each party:
Sent past 30 Received past 30
Date Party Status days by party days by party
-----------------------------------------------------------------------------------
0 01-01-2018 John Sent 1 0
1 13-01-2018 Lisa Received 0 1
2 15-01-2018 Will Received 0 1
3 19-01-2018 Mark Sent 1 0
4 02-02-2018 Will Sent 1 1
5 28-02-2018 John Received 0 1
I have attempted to regenerate the solution for the using the groupby method below:
l = []
grp_obj = df.groupby("Party")
grp_obj.rolling('30D', min_periods=1)["dummy"].apply(lambda x: l.append(x.value_counts()) or 0)
df.reset_index(inplace=True)
But I ended up with incorrect values. I know that it is happening because the concat method is combining the dataframes without considering their indices, since groupby orders the data differently. Is there a way I can modify the list appending to include the original index, such that I can merge/join the value_counts dataframe to the original one?
If you set Date as the index and temporarily convert Status to a categorical, you can use pd.rolling with a little trick
df = df.set_index('Date')
df['dummy'] = df['Status'].astype('category',copy=False).cat.codes
l = []
df.rolling('30D', min_periods=1)['dummy'].apply(lambda x: l.append(x.value_counts()) or 0)
df.reset_index(inplace=True)
pd.concat(
    [df,
     (pd.DataFrame(l)
      .rename(columns={1.0: "Sent past 30 Days", 0.0: "Received past 30 Days"})
      .fillna(0)
      .astype('int'))
    ], axis=1).drop('dummy', 1)
Out:
Date Party Status Received past 30 Days Sent past 30 Days
0 2018-01-01 John Sent 0 1
1 2018-01-13 Lisa Received 1 1
2 2018-01-15 Will Received 2 1
3 2018-01-19 Mark Sent 2 2
4 2018-02-02 Will Sent 2 2
5 2018-02-28 John Received 1 1
Maintaining an original index to allow subsequent merging
Slightly adjust the data to have different sequences in Date and index
df = pd.DataFrame({"Date": pd.to_datetime(["01-01-2018", "13-01-2018", "03-01-2018", "19-01-2018", "08-02-2018", "22-02-2018"]),
                   "Party": ["John", "Lisa", "Will", "Mark", "Will", "John"],
                   "Status": ["Sent", "Received", "Received", "Sent", "Sent", "Received"]})
df
Out:
Date Party Status
0 2018-01-01 John Sent
1 2018-01-13 Lisa Received
2 2018-03-01 Will Received
3 2018-01-19 Mark Sent
4 2018-08-02 Will Sent
5 2018-02-22 John Received
Store the original index after sorting by Date, and restore it after operating on the Date-sorted dataframe
df = df.sort_values('Date')
df = df.reset_index()
df = df.set_index('Date')
df['dummy'] = df['Status'].astype('category',copy=False).cat.codes
l = []
df.rolling('30D', min_periods=1)['dummy'].apply(lambda x: l.append(x.value_counts()) or 0)
df.reset_index(inplace=True)
df = pd.concat(
    [df,
     (pd.DataFrame(l)
      .rename(columns={1.0: "Sent past 30 Days", 0.0: "Received past 30 Days"})
      .fillna(0)
      .astype('int'))
    ], axis=1).drop('dummy', 1)
df.set_index('index')
Out:
Date Party Status Received past 30 Days Sent past 30 Days
index
0 2018-01-01 John Sent 0 1
1 2018-01-13 Lisa Received 1 1
3 2018-01-19 Mark Sent 1 2
5 2018-02-22 John Received 1 0
2 2018-03-01 Will Received 2 0
4 2018-08-02 Will Sent 0 1
Counting values in groups
Sort by Party and Date first to get the right order to append the grouped counts
df = pd.DataFrame({"Date": pd.to_datetime(["01-01-2018", "13-01-2018", "15-01-2018", "19-01-2018", "02-02-2018", "28-02-2018"]),
                   "Party": ["John", "Lisa", "Will", "Mark", "Will", "John"],
                   "Status": ["Sent", "Received", "Received", "Sent", "Sent", "Received"]})
df = df.sort_values(['Party','Date'])
After that, reset the index before the concat so the counts are appended to the right rows
df = df.set_index('Date')
df['dummy'] = df['Status'].astype('category',copy=False).cat.codes
l = []
df.groupby('Party').rolling('30D', min_periods=1)['dummy'].apply(lambda x: l.append(x.value_counts()) or 0)
df.reset_index(inplace=True)
pd.concat(
    [df,
     (pd.DataFrame(l)
      .rename(columns={1.0: "Sent past 30 Days", 0.0: "Received past 30 Days"})
      .fillna(0)
      .astype('int'))
    ], axis=1).drop('dummy', 1).sort_values('Date')
Out:
Date Party Status Received past 30 Days Sent past 30 Days
0 2018-01-01 John Sent 0 1
2 2018-01-13 Lisa Received 1 0
4 2018-01-15 Will Received 1 0
3 2018-01-19 Mark Sent 0 1
5 2018-02-02 Will Sent 1 1
1 2018-02-28 John Received 1 0
Micro-Benchmark
As this solution also iterates over the dataset, I compared the running times of both approaches. Only very small datasets were used because the original solution's runtime increases quickly.
Results
Code to reproduce the benchmark
import pandas as pd
import perfplot

def makedata(n=1):
    df = pd.DataFrame({"Date": pd.to_datetime(["01-01-2018", "13-01-2018", "15-01-2018", "19-01-2018", "02-02-2018", "28-02-2018"]*n),
                       "Party": ["John", "Lisa", "Will", "Mark", "Will", "John"]*n,
                       "Status": ["Sent", "Received", "Received", "Sent", "Sent", "Received"]*n})
    return df.sort_values("Date")

def rolling(df):
    df = df.set_index('Date')
    df['dummy'] = df['Status'].astype('category', copy=False).cat.codes
    l = []
    df.rolling('30D', min_periods=1)['dummy'].apply(lambda x: l.append(x.value_counts()) or 0)
    df.reset_index(inplace=True)
    return pd.concat(
        [df,
         (pd.DataFrame(l)
          .rename(columns={1.0: "Sent past 30 Days", 0.0: "Received past 30 Days"})
          .fillna(0)
          .astype('int'))
        ], axis=1).drop('dummy', 1)

def forloop(df):
    date_array = []
    status_array = []
    def inner_func(date_var, status_var, date_array, status_array):
        sent_increment = 0
        received_increment = 0
        for k in range(0, len(date_array)):
            if((date_var - date_array[k]).days <= 30):
                if(status_array[k] == "Sent"):
                    sent_increment += 1
                elif(status_array[k] == "Received"):
                    received_increment += 1
        return sent_increment, received_increment
    for i in range(0, len(df)):
        date_var = df.loc[i, "Date"]
        date_array.append(date_var)
        status_var = df.loc[i, "Status"]
        status_array.append(status_var)
        sent_count, received_count = inner_func(date_var, status_var, date_array, status_array)
        df.loc[i, "Num of Sent in Past 30 days"] = sent_count
        df.loc[i, "Num of Received in Past 30 days"] = received_count
    return df

perfplot.show(
    setup=makedata,
    kernels=[forloop, rolling],
    n_range=[x for x in range(5, 105, 5)],
    equality_check=None,
    xlabel='len(df)'
)

Pandas - Calculate the number of customers between startdate and enddate

I would like to calculate how many customers there were in each month of the past years. My dataframe contains a customer ID, a start date (when the customer became a customer) and an end date (when they stopped being a customer):
Customer_ID StartDate EndDate
1 01/01/2019 NAT
2 25/10/2017 01/06/2020
2 13/06/2012 15/07/2015
2 20/12/2015 03/01/2016
2 25/03/2016 14/06/2017
3 05/06/2018 05/06/2019
3 12/12/2019 NAT
The result I would like is a count of the number of customers that were "active" per month-year combination:
MONTH YEAR NUMB_CUSTOMERS
01 2013 1
02 2013 1
03 2013 1
04 2013 1
...
01 2019 2
...
09 2020 2
I would like to avoid for-loops as that takes too long (I have a table of over 100 000 rows).
Has anyone an idea to do this neat and quickly?
Thanks!
First, read the data and make it digestible for the program
import pandas as pd
import datetime
df = pd.read_csv("table.csv")
func = lambda x: x.split('/', maxsplit=1)[1]
df["StartDate"] = df["StartDate"].apply(func)
mask = df["EndDate"] != "NAT"
df.loc[mask, "EndDate"] = df.loc[mask, "EndDate"].apply(func)
Then, count the changes in the number of clients (you basically get a derivative of your data)
customers_gained = df[["Customer_ID", "StartDate"]].groupby("StartDate").agg("count")
customers_lost = df[["Customer_ID", "EndDate"]].groupby("EndDate").agg("count")
customers_lost.drop("NAT",inplace=True)
Make a monthly time series covering the whole period to hold all changes in the number of clients
def make_time_table(start, end):
    start_date = datetime.datetime.strptime(start, "%d/%m/%Y")
    end_date = datetime.datetime.strptime(end, "%d/%m/%Y")
    data_range = pd.date_range(start_date, end_date, freq="M")
    string_range = [el.strftime("%m/%Y") for el in data_range]
    ser = pd.Series([0]*data_range.size, index=string_range)
    return ser
Next, introduce the changes into time_table and "integrate" by accumulation
time_table = make_time_table("01/01/2012", "01/12/2020")
time_table[customers_gained.index] = customers_gained["Customer_ID"]
time_table[customers_lost.index] -= customers_lost["Customer_ID"]
result = time_table.cumsum()
print(result)
Outputs:
01/2012 0
02/2012 0
03/2012 0
04/2012 0
05/2012 0
06/2012 1
07/2012 1
...
10/2019 2
11/2019 2
12/2019 3
01/2020 3
02/2020 3
03/2020 3
04/2020 3
05/2020 3
06/2020 2
07/2020 2
08/2020 2
09/2020 2
10/2020 2
11/2020 2
dtype: int64
table.csv
Customer_ID,StartDate,EndDate
1,01/01/2019,NAT
2,25/10/2017,01/06/2020
2,13/06/2012,15/07/2015
2,20/12/2015,03/01/2016
2,25/03/2016,14/06/2017
3,25/03/2016,05/06/2019
3,12/12/2019,NAT
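For comparison, a rough vectorized sketch of the same count using period comparisons; the cutoff date for still-active customers and the overall month range are assumptions, not taken from the question:
import pandas as pd

df = pd.read_csv('table.csv', na_values=['NAT'])
df['StartDate'] = pd.to_datetime(df['StartDate'], dayfirst=True)
df['EndDate'] = pd.to_datetime(df['EndDate'], dayfirst=True)
# treat still-active customers as active up to an assumed cutoff
df['EndDate'] = df['EndDate'].fillna(pd.Timestamp('2020-12-01'))

months = pd.period_range('2012-01', '2020-12', freq='M')
start = df['StartDate'].dt.to_period('M')
end = df['EndDate'].dt.to_period('M')

# a row counts as active in month m if start <= m <= end;
# this loops over ~100 months rather than over all rows
counts = pd.Series([((start <= m) & (end >= m)).sum() for m in months],
                   index=months, name='NUMB_CUSTOMERS')
print(counts)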
