Group-wise expansion of rows based on start and end date - python

I have the following pandas dataframe:
import numpy as np
import pandas as pd
dfw = pd.DataFrame({"id": ["A", "B"],
                    "start_date": pd.to_datetime(["2012-01-01", "2013-02-13"], format="%Y-%m-%d"),
                    "end_date": pd.to_datetime(["2012-04-17", "2014-11-18"], format="%Y-%m-%d")})
Result:
end_date id start_date
2012-04-17 A 2012-01-01
2014-11-18 B 2013-02-13
I am looking for the most efficient way to transform this dataframe to the following dataframe:
dates = np.empty(0, dtype="datetime64[M]")
dates = np.append(dates, pd.date_range(start="2012-01-01", end="2012-06-01", freq="MS").astype("object"))
dates = np.append(dates, pd.date_range(start="2013-02-01", end="2014-12-01", freq="MS").astype("object"))
dfl = pd.DataFrame({"id": np.repeat(["A", "B"], [6, 23]),
                    "counter": np.concatenate((np.arange(0, 6, dtype="float"), np.arange(0, 23, dtype="float"))),
                    "date": pd.to_datetime(dates, format="%Y-%m-%d")})
Result:
counter date id
0.0 2012-01-01 A
1.0 2012-02-01 A
2.0 2012-03-01 A
3.0 2012-04-01 A
4.0 2012-05-01 A
5.0 2012-06-01 A
0.0 2013-02-01 B
1.0 2013-03-01 B
2.0 2013-04-01 B
3.0 2013-05-01 B
4.0 2013-06-01 B
5.0 2013-07-01 B
6.0 2013-08-01 B
7.0 2013-09-01 B
8.0 2013-10-01 B
9.0 2013-11-01 B
10.0 2013-12-01 B
11.0 2014-01-01 B
12.0 2014-02-01 B
13.0 2014-03-01 B
14.0 2014-04-01 B
15.0 2014-05-01 B
16.0 2014-06-01 B
17.0 2014-07-01 B
18.0 2014-08-01 B
19.0 2014-09-01 B
20.0 2014-10-01 B
21.0 2014-11-01 B
22.0 2014-12-01 B
A naive solution I came up with so far is the following function:
def expand(df):
    dates = np.empty(0, dtype="datetime64[ns]")
    ids = np.empty(0, dtype="object")
    counter = np.empty(0, dtype="float")
    for name, group in df.groupby("id"):
        start_date = group["start_date"].min()
        start_date = pd.to_datetime(np.array(start_date, dtype="datetime64[M]").tolist())
        end_date = group["end_date"].min()
        end_date = end_date + pd.Timedelta(1, unit="M")
        end_date = pd.to_datetime(np.array(end_date, dtype="datetime64[M]").tolist())
        tmp = pd.date_range(start=start_date, end=end_date, freq="MS", closed=None).values
        dates = np.append(dates, tmp)
        ids = np.append(ids, np.repeat(group.id.values[0], len(tmp)))
        counter = np.append(counter, np.arange(0, len(tmp)))
    dfl = pd.DataFrame({"id": ids, "counter": counter, "date": dates})
    return dfl
But it is not very fast:
%timeit expand(dfw)
100 loops, best of 3: 4.84 ms per loop

Normally I advise avoiding itertuples, but in some situations it can be more intuitive. You can get fine-grained control of the endpoints via kwargs to pd.date_range if desired (e.g. to include an endpoint or not).
In [27]: result = pd.concat([pd.Series(r.id,pd.date_range(r.start_date, r.end_date)) for r in dfw.itertuples()]).reset_index()
In [28]: result.columns = ['counter', 'date']
In [29]: result
Out[29]:
counter date
0 2012-01-01 A
1 2012-01-02 A
2 2012-01-03 A
3 2012-01-04 A
4 2012-01-05 A
5 2012-01-06 A
.. ... ...
746 2014-11-13 B
747 2014-11-14 B
748 2014-11-15 B
749 2014-11-16 B
750 2014-11-17 B
751 2014-11-18 B
[752 rows x 2 columns]
In [26]: %timeit pd.concat([pd.Series(r.id,pd.date_range(r.start_date, r.end_date)) for r in dfw.itertuples()]).reset_index()
100 loops, best of 3: 2.15 ms per loop
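The endpoint control mentioned above is the closed keyword of pd.date_range (in newer pandas the keyword is inclusive). A small illustration with made-up dates:
pd.date_range("2012-01-01", "2012-04-01", freq="MS", closed=None)    # Jan, Feb, Mar, Apr (both endpoints, default)
pd.date_range("2012-01-01", "2012-04-01", freq="MS", closed="left")  # Jan, Feb, Mar (drop the end)
pd.date_range("2012-01-01", "2012-04-01", freq="MS", closed="right") # Feb, Mar, Apr (drop the start)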
Not really sure of the purpose of making this super fast. You would generally do this kind of expansion a single time.
You wanted month starts, so here is that.
In [23]: result = pd.concat([pd.Series(r.id,pd.date_range(r.start_date, r.end_date+pd.offsets.MonthBegin(1), freq='MS', closed=None)) for r in dfw.itertuples()]).reset_index()
In [24]: result.columns=['counter', 'date']
In [25]: result
Out[25]:
counter date
0 2012-01-01 A
1 2012-02-01 A
2 2012-03-01 A
3 2012-04-01 A
4 2012-05-01 A
5 2013-03-01 B
6 2013-04-01 B
7 2013-05-01 B
8 2013-06-01 B
9 2013-07-01 B
10 2013-08-01 B
11 2013-09-01 B
12 2013-10-01 B
13 2013-11-01 B
14 2013-12-01 B
15 2014-01-01 B
16 2014-02-01 B
17 2014-03-01 B
18 2014-04-01 B
19 2014-05-01 B
20 2014-06-01 B
21 2014-07-01 B
22 2014-08-01 B
23 2014-09-01 B
24 2014-10-01 B
25 2014-11-01 B
26 2014-12-01 B
You can adjust dates like this
In [17]: pd.Timestamp('2014-01-17')-pd.offsets.MonthBegin(1)
Out[17]: Timestamp('2014-01-01 00:00:00')
In [18]: pd.Timestamp('2014-01-31')-pd.offsets.MonthBegin(1)
Out[18]: Timestamp('2014-01-01 00:00:00')
In [19]: pd.Timestamp('2014-02-01')-pd.offsets.MonthBegin(1)
Out[19]: Timestamp('2014-01-01 00:00:00')
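For completeness, a fully vectorized sketch of the month-start expansion (an illustration, not part of the answer above; it expands each id from the month of start_date through the month after end_date, roughly what the expand() function does):
# Work in month units, then repeat ids and add month offsets.
starts = dfw["start_date"].values.astype("datetime64[M]")
ends = dfw["end_date"].values.astype("datetime64[M]") + 1            # one month past the end month
n = (ends - starts).astype(int) + 1                                  # number of month starts per row
counter = np.concatenate([np.arange(k) for k in n])                  # per-row month offsets
months = np.repeat(starts, n) + counter.astype("timedelta64[M]")
dfl = pd.DataFrame({"id": np.repeat(dfw["id"].values, n),
                    "counter": counter.astype(float),
                    "date": pd.to_datetime(months)})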

Related

Filter dataframe by multiple date ranges

Given a dataframe with observations, how can rows be returned which are within ±X days of a given list of dates?
I came up with the following function, but is there a simpler, more efficient way of achieving the task?
import numpy as np
import pandas as pd
from numpy.random import RandomState

def filterDfByDates(df, dates_of_observations, date_range):
    """
    Extract all rows in the dataframe which fall within any date in
    dates_of_observations +- date_range
    """
    # Build mask
    mask = np.full(df.shape[0], False)
    for query_date in dates_of_observations:
        min_day = query_date - date_range
        max_day = query_date + date_range
        mask |= ((df.index >= min_day) & (df.index <= max_day))
    return df[mask]

rand = RandomState(17)
dates: np.ndarray = rand.choice(a=np.arange(np.datetime64('2021-01-01'),
                                            np.datetime64('2021-01-15'),
                                            np.timedelta64(1, 'h')),
                                size=30, replace=True)
dates.sort()
randData = rand.choice([True, False], len(dates), p=[0.1, 0.9])
df = pd.DataFrame({"event": randData}, index=dates)
dates_of_obs = df.query("event").index
filterDfByDates(df, dates_of_obs, np.timedelta64(1, 'D'))
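For what it's worth, the same masking idea can be vectorized by broadcasting all observation dates against the index at once (a sketch of one possible alternative, not taken from the answer below; memory grows with rows × observation dates):
obs = dates_of_obs.values[:, None]    # shape (n_obs, 1)
idx = df.index.values[None, :]        # shape (1, n_rows)
mask = (np.abs(idx - obs) <= np.timedelta64(1, 'D')).any(axis=0)
filtered = df[mask]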
From your DataFrame:
>>> import pandas as pd
>>> from io import StringIO
>>> df = pd.read_csv(StringIO("""
date,event
2012-01-01 12:30:00,event1
2012-01-01 12:30:12,event2
2012-01-01 12:30:12,event3
2012-01-02 12:28:29,event4
2012-02-01 12:30:29,event4
2012-02-01 12:30:38,event5
2012-03-01 12:31:05,event6
2012-03-01 12:31:38,event7
2012-06-01 12:31:44,event8
2012-07-01 10:31:48,event9
2012-07-01 11:32:23,event10"""))
>>> df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d %H:%M:%S.%f")
>>> df
date event
0 2012-01-01 12:30:00 event1
1 2012-01-01 12:30:12 event2
2 2012-01-01 12:30:12 event3
3 2012-01-02 12:28:29 event4
4 2012-02-01 12:30:29 event4
5 2012-02-01 12:30:38 event5
6 2012-03-01 12:31:05 event6
7 2012-03-01 12:31:38 event7
8 2012-06-01 12:31:44 event8
9 2012-07-01 10:31:48 event9
10 2012-07-01 11:32:23 event10
First, we shift the date column and subtract the shifted values from the original column to get the gap in days between consecutive rows:
>>> g = df['date'].sub(df['date'].shift(1)).dt.days
>>> g
0 NaN
1 0.0
2 0.0
3 0.0
4 30.0
5 0.0
6 29.0
7 0.0
8 92.0
9 29.0
10 0.0
Name: date, dtype: float64
Then we flag the gaps greater than X (here 1 day), apply a cumsum to build group labels, and group by them to get the expected result:
>>> X = 1
>>> df.groupby(g.gt(X).cumsum()).apply(print)
date event
0 2012-01-01 12:30:00 event1
1 2012-01-01 12:30:12 event2
2 2012-01-01 12:30:12 event3
3 2012-01-02 12:28:29 event4
date event
4 2012-02-01 12:30:29 event4
5 2012-02-01 12:30:38 event5
date event
6 2012-03-01 12:31:05 event6
7 2012-03-01 12:31:38 event7
date event
8 2012-06-01 12:31:44 event8
date event
9 2012-07-01 10:31:48 event9
10 2012-07-01 11:32:23 event10
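Instead of printing, you could of course collect each block into its own DataFrame, e.g.:
>>> blocks = [grp for _, grp in df.groupby(g.gt(X).cumsum())]
>>> len(blocks)
5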

How to join a table with each group of a dataframe in pandas

I have a dataframe like below. Each date is Monday of each week.
df = pd.DataFrame({'date': ['2020-04-20', '2020-05-11', '2020-05-18',
                            '2020-04-20', '2020-04-27', '2020-05-04', '2020-05-18'],
                   'name': ['A', 'A', 'A', 'B', 'B', 'B', 'B'],
                   'count': [23, 44, 125, 6, 9, 10, 122]})
date name count
0 2020-04-20 A 23
1 2020-05-11 A 44
2 2020-05-18 A 125
3 2020-04-20 B 6
4 2020-04-27 B 9
5 2020-05-04 B 10
6 2020-05-18 B 122
Neither 'A' nor 'B' covers the whole date range. Both of them have some missing dates, which means the count for that week is 0. Below are all the dates:
df_dates = pd.DataFrame({ 'date':['2020-04-20', '2020-04-27','2020-05-04','2020-05-11','2020-05-18'] })
So what I need is like the dataframe below:
date name count
0 2020-04-20 A 23
1 2020-04-27 A 0
2 2020-05-04 A 0
3 2020-05-11 A 44
4 2020-05-18 A 125
5 2020-04-20 B 6
6 2020-04-27 B 9
7 2020-05-04 B 10
8 2020-05-11 B 0
9 2020-05-18 B 122
It seems like I need to join (merge) df_dates with df for each name group (A and B) and then fill the missing name and count values with 0's. Does anyone know how to achieve that? How can I join another table with a grouped table?
I tried and no luck...
pd.merge(df_dates, df.groupby('name'), how='left', on='date')
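For reference, one literal way to spell out that idea is a cross join of all dates with all names followed by a left merge (a sketch, not taken from the answers below; how='cross' needs pandas 1.2+, and count comes back as float after fillna):
full = df_dates.merge(df[['name']].drop_duplicates(), how='cross')   # every date x every name
out = (full.merge(df, on=['date', 'name'], how='left')               # attach observed counts
           .fillna({'count': 0})
           .sort_values(['name', 'date'], ignore_index=True))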
We can do reindex after building a MultiIndex from all date/name combinations:
idx=pd.MultiIndex.from_product([df_dates.date,df.name.unique()],names=['date','name'])
s=df.set_index(['date','name']).reindex(idx,fill_value=0).reset_index().sort_values('name')
Out[136]:
date name count
0 2020-04-20 A 23
2 2020-04-27 A 0
4 2020-05-04 A 0
6 2020-05-11 A 44
8 2020-05-18 A 125
1 2020-04-20 B 6
3 2020-04-27 B 9
5 2020-05-04 B 10
7 2020-05-11 B 0
9 2020-05-18 B 122
Or
s=df.pivot(*df.columns).reindex(df_dates.date).fillna(0).reset_index().melt('date')
Out[145]:
date name value
0 2020-04-20 A 23.0
1 2020-04-27 A 0.0
2 2020-05-04 A 0.0
3 2020-05-11 A 44.0
4 2020-05-18 A 125.0
5 2020-04-20 B 6.0
6 2020-04-27 B 9.0
7 2020-05-04 B 10.0
8 2020-05-11 B 0.0
9 2020-05-18 B 122.0
If you are just looking to fill in the union of dates already present in df, you can do:
(df.set_index(['date', 'name'])
   .unstack('date', fill_value=0)
   .stack()
   .reset_index()
)
Output:
name date count
0 A 2020-04-20 23
1 A 2020-04-27 0
2 A 2020-05-04 0
3 A 2020-05-11 44
4 A 2020-05-18 125
5 B 2020-04-20 6
6 B 2020-04-27 9
7 B 2020-05-04 10
8 B 2020-05-11 0
9 B 2020-05-18 122

Combine Dataframes with Minute and Date Indexes

Say I have the following dataframes: one with data at 1-minute intervals, and another with daily intervals.
>>> df_1 = pd.DataFrame({'A':[1,2,3],
'B':[4,5,6],
'C':[7,8,9]},
index=pd.to_datetime(['2017-01-01 00:01:00', '2017-01-01 00:02:00', '2017-01-02 00:01:00']))
>>> df_1
A B C
2017-01-01 00:01:00 1 4 7
2017-01-01 00:02:00 2 5 8
2017-01-02 00:01:00 3 6 9
>>> df_2 = pd.DataFrame({'D':['ON','OFF']},
index=pd.to_datetime(['2017-01-01', '2017-01-02']))
>>> df_2
D
2017-01-01 ON
2017-01-02 OFF
And I want to merge them so that the data from df_2 gets pulled into df_1 by the date only, as below.
>>> df_merge = pd.merge(df_1, df_2)
>>> df_merge
A B C D
2017-01-01 00:01:00 1 4 7 ON
2017-01-01 00:02:00 2 5 8 ON
2017-01-02 00:01:00 3 6 9 OFF
Is there any way to merge based on date only that will pull the daily data into the minutely data?
With reindex
df_1['D']=df_2.reindex(df_1.index,method='nearest')
df_1
Out[184]:
A B C D
2017-01-01 00:01:00 1 4 7 ON
2017-01-01 00:02:00 2 5 8 ON
2017-01-02 00:01:00 3 6 9 OFF
Option 1
If your dataframes are sorted by index, then pd.merge_asof should also work:
pd.merge_asof(df_1, df_2, left_index=True, right_index=True)
A B C D
2017-01-01 00:01:00 1 4 7 ON
2017-01-01 00:02:00 2 5 8 ON
2017-01-02 00:01:00 3 6 9 OFF
Option 2
If not, then you can construct a general solution using merge on a temporary column constructed from the floored date of df_1's index:
df_1.assign(temp=df_1.index.floor('D')).merge(
df_2, left_on='temp', right_index=True
).drop('temp', 1)
A B C D
2017-01-01 00:01:00 1 4 7 ON
2017-01-01 00:02:00 2 5 8 ON
2017-01-02 00:01:00 3 6 9 OFF
Option 3
Using concat:
idx = df_1.index
pd.concat([df_1.set_index(df_1.index.floor('D')), df_2], 1).set_index(idx)
A B C D
2017-01-01 00:01:00 1 4 7 ON
2017-01-01 00:02:00 2 5 8 ON
2017-01-02 00:01:00 3 6 9 OFF
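A further variant (a sketch, not one of the options above): DataFrame.join accepts an array as the join key, so you can join on the midnight-normalized timestamps while keeping df_1's minute-level index:
# Left-join df_2 using the date part of df_1's index as the key.
df_1.join(df_2, on=df_1.index.normalize())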

Pandas Reindexing MultiIndex Relative to Arbitrary Level

I'm trying to reindex a dataframe relative to the second level of an index. I have a dataframe where the first level of the index is user id and the second level is date. For example:
pd.DataFrame({
'id': 3*['A'] + 5*['B'] + 4*['C'],
'date': ['01-01-2010', '02-01-2010', '12-01-2010',
'04-01-2015', '05-01-2015', '03-01-2016', '04-01-2016', '05-01-2016',
'01-01-2015', '02-01-2015', '03-01-2015', '04-01-2015'],
'value': np.random.randint(10,100, 12)})\
.set_index(['id', 'date'])
I want to reindex the dates to fill in the missing dates, but only for the dates between the max and min dates for each "id" group.
For example user "A" should have continuous monthly data from January to December 2010 and user "B" should have continuous dates between April 2015 through May 2016. For simplicity let's assume I want to fill the NaNs with zeros.
Other questions similar to this assume that I want to use the same date_range for all users, which doesn't work in this use case. Any ideas?
I think you need reset_index + groupby + resample + asfreq + fillna:
np.random.seed(123)
df = pd.DataFrame({
'id': 3*['A'] + 5*['B'] + 4*['C'],
'date': ['01-01-2010', '02-01-2010', '12-01-2010',
'04-01-2015', '05-01-2015', '03-01-2016', '04-01-2016', '05-01-2016',
'01-01-2015', '02-01-2015', '03-01-2015', '04-01-2015'],
'value': np.random.randint(10,100, 12)})
df['date'] = pd.to_datetime(df['date'])
df = df.set_index(['id', 'date'])
print (df)
value
id date
A 2010-01-01 76
2010-02-01 27
2010-12-01 93
B 2015-04-01 67
2015-05-01 96
2016-03-01 57
2016-04-01 83
2016-05-01 42
C 2015-01-01 56
2015-02-01 35
2015-03-01 93
2015-04-01 88
df1 = df.reset_index(level='id').groupby('id')['value'].resample('D').asfreq().fillna(0)
print (df1.head(10))
value
id date
A 2010-01-01 76.0
2010-01-02 0.0
2010-01-03 0.0
2010-01-04 0.0
2010-01-05 0.0
2010-01-06 0.0
2010-01-07 0.0
2010-01-08 0.0
2010-01-09 0.0
2010-01-10 0.0
But if you need to process only the span between each group's min and max dates, first select those rows using agg with idxmin/idxmax and loc:
df = df.reset_index()
df1 = df.loc[df.groupby('id')['date'].agg(['idxmin', 'idxmax']).stack()]
print (df1)
id date value
0 A 2010-01-01 76
2 A 2010-12-01 93
3 B 2015-04-01 67
7 B 2016-05-01 42
8 C 2015-01-01 56
11 C 2015-04-01 88
df1 = df1.set_index('date').groupby('id')['value'].resample('MS').asfreq().fillna(0)
print (df1.head(10))
Is that what you want?
In [52]: (df.reset_index().groupby('id')
...: .apply(lambda x: x.set_index('date').resample('D').mean().fillna(0))
...: )
Out[52]:
value
id date
A 2010-01-01 91.0
2010-01-02 0.0
2010-01-03 0.0
2010-01-04 0.0
2010-01-05 0.0
2010-01-06 0.0
2010-01-07 0.0
2010-01-08 0.0
2010-01-09 0.0
2010-01-10 0.0
... ...
C 2015-03-23 0.0
2015-03-24 0.0
2015-03-25 0.0
2015-03-26 0.0
2015-03-27 0.0
2015-03-28 0.0
2015-03-29 0.0
2015-03-30 0.0
2015-03-31 0.0
2015-04-01 11.0
[823 rows x 1 columns]
PS: I converted date to datetime dtype first...
Use groupby and agg to get the 'start' and 'end' dates per id, then build up tuples to reindex with.
m = dict(min='start', max='end')
d2 = df.reset_index().groupby('id').date.agg(['min', 'max']).rename(columns=m)
idx = [(i, d) for i, row in d2.iterrows() for d in pd.date_range(freq='MS', **row)]
df.reindex(idx, fill_value=0)
value
id date
A 2010-01-01 27
2010-02-01 15
2010-03-01 0
2010-04-01 0
2010-05-01 0
2010-06-01 0
2010-07-01 0
2010-08-01 0
2010-09-01 0
2010-10-01 0
2010-11-01 0
2010-12-01 11
B 2015-04-01 10
2015-05-01 94
2015-06-01 0
2015-07-01 0
2015-08-01 0
2015-09-01 0
2015-10-01 0
2015-11-01 0
2015-12-01 0
2016-01-01 0
2016-02-01 0
2016-03-01 42
2016-04-01 15
2016-05-01 71
C 2015-01-01 17
2015-02-01 51
2015-03-01 99
2015-04-01 58
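A compact per-group alternative (a sketch, assuming the datetime-indexed df built above with month-start dates): reindex each id's block onto its own month-start range and fill the gaps with 0.
out = (df.reset_index('id')
         .groupby('id')['value']
         .apply(lambda s: s.reindex(pd.date_range(s.index.min(), s.index.max(), freq='MS'),
                                    fill_value=0)))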

Pandas: How to resample dataframe such that each combination is present?

Assume we have the following data frame:
# data
t = pd.to_datetime(pd.Series(['2015-01-01', '2015-02-01', '2015-03-01', '2015-04-01', '2015-01-01', '2015-02-01']))
g = pd.Series(['A', 'A', 'A', 'A', 'B', 'B'])
v = pd.Series([12.1, 14.2, 15.3, 16.2, 12.2, 13.7])
df = pd.DataFrame({'time': t, 'group': g, 'value': v})
# show data
>>> df
time group value
0 2015-01-01 A 12.1
1 2015-02-01 A 14.2
2 2015-03-01 A 15.3
3 2015-04-01 A 16.2
4 2015-01-01 B 12.2
5 2015-02-01 B 13.7
What I would like to have in the end is the following data frame:
>>> df
time group value
0 2015-01-01 A 12.1
1 2015-02-01 A 14.2
2 2015-03-01 A 15.3
3 2015-04-01 A 16.2
4 2015-01-01 B 12.2
5 2015-02-01 B 13.7
6 2015-03-01 B 13.7
7 2015-04-01 B 13.7
The missing observations in group B should be added and the missing values should default to the last observed value.
How can I achieve this? Thanks in advance!
You can use pivot for reshaping, ffill the NaN values (fillna with method ffill), and reshape back to the original form with unstack and reset_index:
print(df.pivot(index='time', columns='group', values='value')
        .ffill()
        .unstack()
        .reset_index(name='value'))
group time value
0 A 2015-01-01 12.1
1 A 2015-02-01 14.2
2 A 2015-03-01 15.3
3 A 2015-04-01 16.2
4 B 2015-01-01 12.2
5 B 2015-02-01 13.7
6 B 2015-03-01 13.7
7 B 2015-04-01 13.7
Another solution: first find the date_range from the min and max values of time, then groupby and reindex onto that range with ffill.
Notice:
I think you forgot the parameter format='%Y-%d-%m' in to_datetime, if the last number is the month:
t = pd.to_datetime(pd.Series(['2015-01-01', '2015-02-01', '2015-03-01',
                              '2015-04-01', '2015-01-01', '2015-02-01']),
                   format='%Y-%d-%m')
idx = pd.date_range(df.time.min(), df.time.max())
print (idx)
DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04'],
dtype='datetime64[ns]', freq='D')
df1 = (df.groupby('group')
         .apply(lambda x: x.set_index('time').reindex(idx))
         .ffill()
         .reset_index(level=0, drop=True)
         .reset_index()
         .rename(columns={'index': 'time'}))
print (df1)
time group value
0 2015-01-01 A 12.1
1 2015-01-02 A 14.2
2 2015-01-03 A 15.3
3 2015-01-04 A 16.2
4 2015-01-01 B 12.2
5 2015-01-02 B 13.7
6 2015-01-03 B 13.7
7 2015-01-04 B 13.7
