Filter dataframe by multiple date ranges - python

Given a dataframe with observations, how can rows be returned that fall within ±X days of any date in a given list of dates?
I came up with the following function, but is there a simpler, more efficient way of achieving this?
import numpy as np
import pandas as pd
from numpy.random import RandomState

def filterDfByDates(df, dates_of_observations, date_range):
    """
    Extract all rows in the dataframe which fall within any date in
    dates_of_observations +- date_range.
    """
    # Build mask
    mask = np.full(df.shape[0], False)
    for query_date in dates_of_observations:
        min_day = query_date - date_range
        max_day = query_date + date_range
        mask |= (df.index >= min_day) & (df.index <= max_day)
    return df[mask]

rand = RandomState(17)
dates: np.ndarray = rand.choice(a=np.arange(np.datetime64('2021-01-01'),
                                            np.datetime64('2021-01-15'),
                                            np.timedelta64(1, 'h')),
                                size=30, replace=True)
dates.sort()
randData = rand.choice([True, False], len(dates), p=[0.1, 0.9])
df = pd.DataFrame({"event": randData}, index=dates)
dates_of_obs = df.query("event").index
filterDfByDates(df, dates_of_obs, np.timedelta64(1, 'D'))
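For comparison, here is a hedged sketch of the same ±date_range filter done with a single broadcasted comparison instead of a Python loop; filter_by_dates_broadcast is a made-up name, and the approach assumes the list of observation dates is small enough that the intermediate (rows × observations) array fits in memory:
import numpy as np
import pandas as pd

def filter_by_dates_broadcast(df, dates_of_observations, date_range):
    # Broadcast the index against every observation date at once:
    # shape (n_rows, 1) minus shape (n_obs,) -> (n_rows, n_obs) timedeltas.
    deltas = df.index.values[:, None] - pd.DatetimeIndex(dates_of_observations).values
    # Keep a row if it lies within +-date_range of ANY observation date.
    mask = (np.abs(deltas) <= date_range).any(axis=1)
    return df[mask]
Called as filter_by_dates_broadcast(df, dates_of_obs, np.timedelta64(1, 'D')), it should return the same rows as filterDfByDates above.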

From your DataFrame:
>>> import pandas as pd
>>> from io import StringIO
>>> df = pd.read_csv(StringIO("""
date,event
2012-01-01 12:30:00,event1
2012-01-01 12:30:12,event2
2012-01-01 12:30:12,event3
2012-01-02 12:28:29,event4
2012-02-01 12:30:29,event4
2012-02-01 12:30:38,event5
2012-03-01 12:31:05,event6
2012-03-01 12:31:38,event7
2012-06-01 12:31:44,event8
2012-07-01 10:31:48,event9
2012-07-01 11:32:23,event10"""))
>>> df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d %H:%M:%S")
>>> df
date event
0 2012-01-01 12:30:00 event1
1 2012-01-01 12:30:12 event2
2 2012-01-01 12:30:12 event3
3 2012-01-02 12:28:29 event4
4 2012-02-01 12:30:29 event4
5 2012-02-01 12:30:38 event5
6 2012-03-01 12:31:05 event6
7 2012-03-01 12:31:38 event7
8 2012-06-01 12:31:44 event8
9 2012-07-01 10:31:48 event9
10 2012-07-01 11:32:23 event10
First, we shift the date column and subtract it from the original date column:
>>> g = df['date'].sub(df['date'].shift(1)).dt.days
>>> g
0 NaN
1 0.0
2 0.0
3 0.0
4 30.0
5 0.0
6 29.0
7 0.0
8 92.0
9 29.0
10 0.0
Name: date, dtype: float64
Then, we take the cumulative sum of the "greater than X" flags (here X is 1 day) and group on it to get the expected result:
>>> X = 1
>>> df.groupby(g.gt(X).cumsum()).apply(print)
date event
0 2012-01-01 12:30:00 event1
1 2012-01-01 12:30:12 event2
2 2012-01-01 12:30:12 event3
3 2012-01-02 12:28:29 event4
date event
4 2012-02-01 12:30:29 event4
5 2012-02-01 12:30:38 event5
date event
6 2012-03-01 12:31:05 event6
7 2012-03-01 12:31:38 event7
date event
8 2012-06-01 12:31:44 event8
date event
9 2012-07-01 10:31:48 event9
10 2012-07-01 11:32:23 event10
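If the goal is to keep these bursts of rows rather than print them, a small hedged follow-up (reusing g and X from above) can collect each group into a dict keyed by its group number:
# Collect each burst of consecutive nearby rows as its own DataFrame.
groups = {key: sub for key, sub in df.groupby(g.gt(X).cumsum())}
first_burst = groups[0]   # the 2012-01-01 .. 2012-01-02 rows in the example above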

Related

Calculate average temperature/humidity between 2 dates pandas data frames

I have the following data frames:
df3

  Harvest_date Starting_date
0   2022-10-06    2022-08-06
1   2022-02-22    2021-12-22

df (I have all temp and humid starting from 2021-01-01 till the present)

                  date  temp  humid
0  2022-10-06 00:30:00     2     30
1  2022-10-06 00:01:00     1     30
2  2022-10-06 00:01:30     0     30
3  2022-10-06 00:02:00     0     30
4  2022-10-06 00:02:30    -2     30
I would like to calculate the avg temperature and humidity between the starting_date and harvest_date. I tried this:
import pandas as pd
df = pd.read_csv (r'C:\climate.csv')
df3 = pd.read_csv (r'C:\Flower_weight_Seson.csv')
df['date'] = pd.to_datetime(df.date)
df3['Harvest_date'] = pd.to_datetime(df3.Harvest_date)
df3['Starting_date'] = pd.to_datetime(df3.Starting_date)
df.style.format({"date": lambda t: t.strftime("%Y-%m-%d")})
df3.style.format({"Harvest_date": lambda t: t.strftime("%Y-%m-%d")})
df3.style.format({"Starting_date": lambda t: t.strftime("%Y-%m-%d")})
for harvest_date, starting_date in zip(df3['Harvest_date'], df3['Starting_date']):
    df3["Season avg temp"] = df[df["date"].between(starting_date, harvest_date)]["temp"].mean()
    df3["Season avg humid"] = df[df["date"].between(starting_date, harvest_date)]["humid"].mean()
I get the same value for all dates. Can someone point out what I did wrong, please?
Use DataFrame.loc and write the means back to df3, matching rows by its index:
# data changed so it has matches for df3
print (df)
date temp humid
0 2022-10-06 00:30:00 2 30
1 2022-09-06 00:01:00 1 33
2 2022-09-06 00:01:30 0 23
3 2022-10-06 00:02:00 0 30
4 2022-01-06 00:02:30 -2 25
for i, harvest_date, starting_date in zip(df3.index, df3['Harvest_date'], df3['Starting_date']):
    mask = df["date"].between(starting_date, harvest_date)
    avg = df.loc[mask, ["temp", 'humid']].mean()
    df3.loc[i, ["Season avg temp", 'Season avg humid']] = avg.to_numpy()
print (df3)
Harvest_date Starting_date Season avg temp Season avg humid
0 2022-10-06 2022-08-06 0.5 28.0
1 2022-02-22 2021-12-22 -2.0 25.0
EDIT: To add a further condition matching on a Room column, use:
for i, harvest_date, starting_date, room in zip(df3.index,
                                                df3['Harvest_date'],
                                                df3['Starting_date'],
                                                df3['Room']):
    mask = df["date"].between(starting_date, harvest_date) & df['Room'].eq(room)
    avg = df.loc[mask, ["temp", 'humid']].mean()
    df3.loc[i, ["Season avg temp", 'Season avg humid']] = avg.to_numpy()
print (df3)
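For reference, a hedged, loop-free sketch of the same aggregation built on pd.IntervalIndex; the column names follow the question, and it assumes the (Starting_date, Harvest_date) intervals do not overlap, since get_indexer raises for overlapping intervals:
import pandas as pd

# Label every climate reading with the season it falls into, then aggregate once.
intervals = pd.IntervalIndex.from_arrays(df3["Starting_date"],
                                         df3["Harvest_date"], closed="both")
season_pos = intervals.get_indexer(df["date"])   # -1 = outside every season
in_season = season_pos >= 0
means = (df.loc[in_season, ["temp", "humid"]]
           .groupby(season_pos[in_season])
           .mean())
df3[["Season avg temp", "Season avg humid"]] = means.reindex(df3.index).to_numpy()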

Calculate delta between two columns and two following rows for different group

Are there any vectorized operations that would improve the runtime?
I found no way other than for loops.
Sample DataFrame:
import pandas as pd

df = pd.DataFrame({'ID': ['1', '1', '1', '2', '2', '2'],
                   'start_date': ['01-Jan', '05-Jan', '08-Jan', '05-Jan', '06-Jan', '10-Jan'],
                   'start_value': [12, 15, 1, 3, 2, 6],
                   'end_value': [20, 17, 6, 19, 13.5, 9]})
ID start_date start_value end_value
0 1 01-Jan 12 20.0
1 1 05-Jan 15 17.0
2 1 08-Jan 1 6.0
3 2 05-Jan 3 19.0
4 2 06-Jan 2 13.5
5 2 10-Jan 6 9.0
I've tried:
import pandas as pd

df_original  # contains the data
data_frame_diff = pd.DataFrame()
for ID in df_original['ID'].unique():
    tmp_frame = df_original.loc[df_original['ID'] == ID]
    tmp_start_value = 0
    for label, row in tmp_frame.iterrows():
        last_delta = tmp_start_value - row['start_value']
        tmp_start_value = row['end_value']
        row['last_delta'] = last_delta
        data_frame_diff = data_frame_diff.append(row, True)
Expected Result:
df = pd.DataFrame({'ID': ['1', '1','1','2','2','2'],
'start_date': ['01-Jan', '05-Jan', '08-Jan', '05-Jan', '06-Jan',
'10-Jan'],
'last_delta': [0, 5, 16, 0, 17, 7.5]})
ID start_date last_delta
0 1 01-Jan 0.0
1 1 05-Jan 5.0
2 1 08-Jan 16.0
3 2 05-Jan 0.0
4 2 06-Jan 17.0
5 2 10-Jan 7.5
For each ID, I want to calculate the delta between the previous row's end_value and the current row's start_value (with 0 for the first row of each ID).
Is there a way to improve the runtime of this code?
Use DataFrame.groupby
on ID and shift the end_value column, then use Series.sub to subtract start_value from it; fill the first row of each group with Series.fillna and attach the result as a new column using DataFrame.assign:
s = df.groupby('ID')['end_value'].shift().sub(df['start_value']).fillna(0)
df1 = df[['ID', 'start_date']].assign(last_delta=s)
Result:
print(df1)
ID start_date last_delta
0 1 01-Jan 0.0
1 1 05-Jan 5.0
2 1 08-Jan 16.0
3 2 05-Jan 0.0
4 2 06-Jan 17.0
5 2 10-Jan 7.5
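As a quick, hedged sanity check against the expected output from the question (dtypes aside), pd.testing can compare the two frames:
expected = pd.DataFrame({'ID': ['1', '1', '1', '2', '2', '2'],
                         'start_date': ['01-Jan', '05-Jan', '08-Jan',
                                        '05-Jan', '06-Jan', '10-Jan'],
                         'last_delta': [0, 5, 16, 0, 17, 7.5]})
# Same IDs, dates and deltas; check_dtype=False ignores int-vs-float differences.
pd.testing.assert_frame_equal(df1, expected, check_dtype=False)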
It's a bit difficult to follow from your description what you need, but you might find this helpful:
import pandas as pd

df = (pd.DataFrame({'t1': pd.date_range(start="2020-01-01", end="2020-01-02", freq="H")})
        .reset_index()
        .rename(columns={'index': 'ID'}))
df['t2'] = df['t1'] + pd.Timedelta(value=10, unit="H")
df['delta_t1_t2'] = df['t2'] - df['t1']
df['delta_to_previous_t1'] = df['t1'] - df['t1'].shift()
print(df)
It results in
ID t1 t2 delta_t1_t2 delta_to_previous_t1
0 0 2020-01-01 00:00:00 2020-01-01 10:00:00 10:00:00 NaT
1 1 2020-01-01 01:00:00 2020-01-01 11:00:00 10:00:00 01:00:00
2 2 2020-01-01 02:00:00 2020-01-01 12:00:00 10:00:00 01:00:00
3 3 2020-01-01 03:00:00 2020-01-01 13:00:00 10:00:00 01:00:00

How to select an item by its ID and not by its index position [duplicate]

I have a pandas dataframe:
import pandas as pnd
d = pnd.Timestamp('2013-01-01 16:00')
dates = pnd.bdate_range(start=d, end = d+pnd.DateOffset(days=10), normalize = False)
df = pnd.DataFrame(index=dates, columns=['a'])
df['a'] = 6
print(df)
a
2013-01-01 16:00:00 6
2013-01-02 16:00:00 6
2013-01-03 16:00:00 6
2013-01-04 16:00:00 6
2013-01-07 16:00:00 6
2013-01-08 16:00:00 6
2013-01-09 16:00:00 6
2013-01-10 16:00:00 6
2013-01-11 16:00:00 6
I am interested in finding the integer location of one of the labels, say,
ds = pnd.Timestamp('2013-01-02 16:00')
Looking at the index values, I know that the integer location of this label is 1. How can I get pandas to tell me what the integer location of this label is?
You're looking for the index method get_loc:
In [11]: df.index.get_loc(ds)
Out[11]: 1
Get dataframe integer index given a date key:
>>> import pandas as pd
>>> df = pd.DataFrame(
...     index=pd.date_range("2008-01-01", "2008-01-05"),
...     columns=("foo", "bar"))
>>> df["foo"] = [10, 20, 40, 15, 10]
>>> df["bar"] = [100, 200, 40, -50, -38]
>>> df
foo bar
2008-01-01 10 100
2008-01-02 20 200
2008-01-03 40 40
2008-01-04 15 -50
2008-01-05 10 -38
>>> df.index.get_loc(df["bar"].argmax())
1
>>> df.index.get_loc(df["foo"].argmax())
2
In column bar, the index of the maximum value is 1
In column foo, the index of the maximum value is 2
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Index.get_loc.html
get_loc can be used for rows and columns according to:
import pandas as pnd
d = pnd.Timestamp('2013-01-01 16:00')
dates = pnd.bdate_range(start=d, end = d+pnd.DateOffset(days=10), normalize = False)
df = pnd.DataFrame(index=dates)
df['a'] = 5
df['b'] = 6
print(df.head())
a b
2013-01-01 16:00:00 5 6
2013-01-02 16:00:00 5 6
2013-01-03 16:00:00 5 6
2013-01-04 16:00:00 5 6
2013-01-07 16:00:00 5 6
#for rows
print(df.index.get_loc('2013-01-01 16:00:00'))
0
#for columns
print(df.columns.get_loc('b'))
1
Because get_loc returns a mask rather than a list of integer index locations when there are multiple instances of the key in the index, I was toying with an answer using reset_index():
# Add a duplicate!!!
dup = pd.Timestamp('2013-01-07 16:00')
df = pd.concat([df, pd.DataFrame([7], columns=['a'], index=[dup])])
df
a
2013-01-01 16:00:00 6
2013-01-02 16:00:00 6
2013-01-03 16:00:00 6
2013-01-04 16:00:00 6
2013-01-07 16:00:00 6
2013-01-08 16:00:00 6
2013-01-09 16:00:00 6
2013-01-10 16:00:00 6
2013-01-11 16:00:00 6
2013-01-07 16:00:00 7
2013-01-08 16:00:00 3
# Only use this method if the key has duplicates
if df.loc[dup].index.has_duplicates:
    df.reset_index().loc[df.index.get_loc(dup)].index.to_list()
[4, 9]
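As a hedged aside, a plain boolean comparison against the index gives the same positions without reset_index(), whether or not the label is duplicated:
import numpy as np

positions = np.flatnonzero(df.index == dup)
# [4, 9] for the frame above; a single-element array when the label is unique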

Group-wise expansion of rows based on start and end date

I have the following pandas dataframe:
import numpy as np
import pandas as pd
dfw = pd.DataFrame({"id": ["A", "B"],
"start_date": pd.to_datetime(["2012-01-01", "2013-02-13"], format="%Y-%m-%d"),
"end_date": pd.to_datetime(["2012-04-17", "2014-11-18"], format="%Y-%m-%d")})
Result:
end_date id start_date
2012-04-17 A 2012-01-01
2014-11-18 B 2013-02-13
I am looking for the most efficient way to transform this dataframe to the following dataframe:
dates = np.empty(0, dtype="datetime64[M]")
dates = np.append(dates, pd.date_range(start="2012-01-01", end="2012-06-01", freq="MS").astype("object"))
dates = np.append(dates, pd.date_range(start="2013-02-01", end="2014-12-01", freq="MS").astype("object"))
dfl = pd.DataFrame({"id": np.repeat(["A", "B"], [6, 23]),
"counter": np.concatenate((np.arange(0, 6, dtype="float"), np.arange(0, 23, dtype="float"))),
"date": pd.to_datetime(dates, format="%Y-%m-%d")})
Result:
counter date id
0.0 2012-01-01 A
1.0 2012-02-01 A
2.0 2012-03-01 A
3.0 2012-04-01 A
4.0 2012-05-01 A
0.0 2013-02-01 B
1.0 2013-03-01 B
2.0 2013-04-01 B
3.0 2013-05-01 B
4.0 2013-06-01 B
5.0 2013-07-01 B
6.0 2013-08-01 B
7.0 2013-09-01 B
8.0 2013-10-01 B
9.0 2013-11-01 B
10.0 2013-12-01 B
11.0 2014-01-01 B
12.0 2014-02-01 B
13.0 2014-03-01 B
14.0 2014-04-01 B
15.0 2014-05-01 B
16.0 2014-06-01 B
17.0 2014-07-01 B
18.0 2014-08-01 B
19.0 2014-09-01 B
20.0 2014-10-01 B
21.0 2014-11-01 B
22.0 2014-12-01 B
A naive solution I have come up with so far is the following function:
def expand(df):
    dates = np.empty(0, dtype="datetime64[ns]")
    ids = np.empty(0, dtype="object")
    counter = np.empty(0, dtype="float")
    for name, group in df.groupby("id"):
        start_date = group["start_date"].min()
        start_date = pd.to_datetime(np.array(start_date, dtype="datetime64[M]").tolist())
        end_date = group["end_date"].min()
        end_date = end_date + pd.Timedelta(1, unit="M")
        end_date = pd.to_datetime(np.array(end_date, dtype="datetime64[M]").tolist())
        tmp = pd.date_range(start=start_date, end=end_date, freq="MS", closed=None).values
        dates = np.append(dates, tmp)
        ids = np.append(ids, np.repeat(group.id.values[0], len(tmp)))
        counter = np.append(counter, np.arange(0, len(tmp)))
    dfl = pd.DataFrame({"id": ids, "counter": counter, "date": dates})
    return dfl
But it is not very fast:
%timeit expand(dfw)
100 loops, best of 3: 4.84 ms per loop
Normally I advise avoiding itertuples, but in some situations it can be more intuitive. You can get fine-grained control of the endpoints via kwargs to pd.date_range if desired (e.g. whether or not to include an endpoint).
In [27]: result = pd.concat([pd.Series(r.id,pd.date_range(r.start_date, r.end_date)) for r in dfw.itertuples()]).reset_index()
In [28]: result.columns = ['date', 'id']
In [29]: result
Out[29]:
date id
0 2012-01-01 A
1 2012-01-02 A
2 2012-01-03 A
3 2012-01-04 A
4 2012-01-05 A
5 2012-01-06 A
.. ... ...
746 2014-11-13 B
747 2014-11-14 B
748 2014-11-15 B
749 2014-11-16 B
750 2014-11-17 B
751 2014-11-18 B
[752 rows x 2 columns]
In [26]: %timeit pd.concat([pd.Series(r.id,pd.date_range(r.start_date, r.end_date)) for r in dfw.itertuples()]).reset_index()
100 loops, best of 3: 2.15 ms per loop
Not really sure of the purpose of making this super fast. You would generally do this kind of expansion a single time.
You wanted month starts, so here is that.
In [23]: result = pd.concat([pd.Series(r.id,pd.date_range(r.start_date, r.end_date+pd.offsets.MonthBegin(1), freq='MS', closed=None)) for r in dfw.itertuples()]).reset_index()
In [24]: result.columns = ['date', 'id']
In [25]: result
Out[25]:
date id
0 2012-01-01 A
1 2012-02-01 A
2 2012-03-01 A
3 2012-04-01 A
4 2012-05-01 A
5 2013-03-01 B
6 2013-04-01 B
7 2013-05-01 B
8 2013-06-01 B
9 2013-07-01 B
10 2013-08-01 B
11 2013-09-01 B
12 2013-10-01 B
13 2013-11-01 B
14 2013-12-01 B
15 2014-01-01 B
16 2014-02-01 B
17 2014-03-01 B
18 2014-04-01 B
19 2014-05-01 B
20 2014-06-01 B
21 2014-07-01 B
22 2014-08-01 B
23 2014-09-01 B
24 2014-10-01 B
25 2014-11-01 B
26 2014-12-01 B
You can adjust dates like this
In [17]: pd.Timestamp('2014-01-17')-pd.offsets.MonthBegin(1)
Out[17]: Timestamp('2014-01-01 00:00:00')
In [18]: pd.Timestamp('2014-01-31')-pd.offsets.MonthBegin(1)
Out[18]: Timestamp('2014-01-01 00:00:00')
In [19]: pd.Timestamp('2014-02-01')-pd.offsets.MonthBegin(1)
Out[19]: Timestamp('2014-01-01 00:00:00')
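On newer pandas (DataFrame.explode exists from 0.25 on), a hedged sketch of the same month-start expansion without itertuples; the counter column is rebuilt afterwards with groupby().cumcount():
# Build one list of month starts per row, explode it into rows, then count within each id.
out = (dfw.assign(date=[list(pd.date_range(s.replace(day=1),
                                           e + pd.offsets.MonthBegin(1),
                                           freq="MS"))
                        for s, e in zip(dfw["start_date"], dfw["end_date"])])
          [["id", "date"]]
          .explode("date")
          .reset_index(drop=True))
out["counter"] = out.groupby("id").cumcount().astype(float)
Column order aside, this should match the month-start result shown above.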

pandas - Resampling datetime index and extending to end of the month

I am trying to resample a datetime index into hourly data. I also want the resampling until the end of the month.
So given the following df:
import numpy as np
import pandas as pd

data = np.arange(6).reshape(3,2)
rng = ['Jan-2016', 'Feb-2016', 'Mar-2016']
df = pd.DataFrame(data, index=rng)
df.index = pd.to_datetime(df.index)
0 1
2016-01-01 0 1
2016-02-01 2 3
2016-03-01 4 5
I know I can resample this into an hourly index with df = df.resample('H').ffill(). However, the result gets cut off at 2016-03-01, while I essentially want the index to run from 2016-01-01 to 2016-03-31 with hourly granularity.
How can I extend this to the end of the month (2016-03-31), given that the last index entry is the beginning of that month?
UPDATE:
In [37]: (df.set_index(df.index[:-1].union([df.index[-1] + pd.offsets.MonthEnd(0)]))
....: .resample('H')
....: .ffill()
....: .head()
....: )
Out[37]:
0 1
2016-01-01 00:00:00 0 1
2016-01-01 01:00:00 0 1
2016-01-01 02:00:00 0 1
2016-01-01 03:00:00 0 1
2016-01-01 04:00:00 0 1
In [38]: (df.set_index(df.index[:-1].union([df.index[-1] + pd.offsets.MonthEnd(0)]))
....: .resample('H')
....: .ffill()
....: .tail()
....: )
Out[38]:
0 1
2016-03-30 20:00:00 2 3
2016-03-30 21:00:00 2 3
2016-03-30 22:00:00 2 3
2016-03-30 23:00:00 2 3
2016-03-31 00:00:00 4 5
Explanation:
In [40]: df.index[-1] + pd.offsets.MonthEnd(0)
Out[40]: Timestamp('2016-03-31 00:00:00')
In [41]: df.index[:-1].union([df.index[-1] + pd.offsets.MonthEnd(0)])
Out[41]: DatetimeIndex(['2016-01-01', '2016-02-01', '2016-03-31'], dtype='datetime64[ns]', freq=None)
Old incorrect answer:
In [77]: df.resample('M').ffill().resample('H').ffill().tail()
Out[77]:
0 1
2016-03-30 20:00:00 2 3
2016-03-30 21:00:00 2 3
2016-03-30 22:00:00 2 3
2016-03-30 23:00:00 2 3
2016-03-31 00:00:00 4 5
Maybe it's late for this, but I think this way is easier:
import pandas as pd
import numpy as np
data = np.arange(6).reshape(3,2)
rng = ['Jan-2016', 'Feb-2016', 'Mar-2016']
df = pd.DataFrame(data, index=rng)
df.index = pd.to_datetime(df.index)
# Create the desired time range
t_index = pd.DatetimeIndex(pd.date_range(start='2016-01-01', end='2016-12-31', freq='h'))
# Resample
df_rsmpld = df.reindex(t_index, method='ffill')
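A hedged variant of the same reindex idea that derives the end point from the frame itself instead of hard-coding a date:
# Extend the index to the last month's end, then forward-fill at hourly resolution.
end = df.index[-1] + pd.offsets.MonthEnd(0)        # 2016-03-31 for this frame
hourly = pd.date_range(start=df.index[0], end=end, freq="h")
df_hourly = df.reindex(hourly, method="ffill")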
