I have a million-row time-series dataframe, in which some of the values in the Date column have muddled day/month values.
How do I efficiently unmuddle them without also ruining those that are correct?
# this creates a dataframe with muddled dates
import pandas as pd
import numpy as np
from pandas import Timestamp
start = Timestamp(2013,1,1)
dates = pd.date_range(start, periods=942)[::-1]
muddler = {}
for d in dates:
if d.day < 13:
muddler[d] = Timestamp(d.year, d.day, d.month)
else:
muddler[d] = Timestamp(d.year, d.month, d.day)
df = pd.DataFrame()
df['Date'] = dates
df['Date'] = df['Date'].map(muddler)
# now what? (assuming I don't know how the dates are muddled)
An option might be to calculate a fit for the timestamps and modify those that deviate from the fit greater than a certain threshold. Example:
import pandas as pd
import numpy as np
start = pd.Timestamp(2013,1,1)
dates = pd.date_range(start, periods=942)[::-1]
muddler = {}
for d in dates:
if d.day < 13:
muddler[d] = pd.Timestamp(d.year, d.day, d.month)
else:
muddler[d] = pd.Timestamp(d.year, d.month, d.day)
df = pd.DataFrame()
df['Date'] = dates
df['Date'] = df['Date'].map(muddler)
# convert date col to posix timestamp
df['ts'] = df['Date'].values.astype(np.float) / 10**9
# calculate a linear fit for ts col
x = np.linspace(df['ts'].iloc[0], df['ts'].iloc[-1], df['ts'].size)
df['ts_linfit'] = np.polyval(np.polyfit(x, df['ts'], 1), x)
# set a thresh and derive a mask that masks differences between
# fit and timestamp greater than thresh:
thresh = 1.2e6 # you might want to tweak this...
m = np.absolute(df['ts']-df['ts_linfit']) > thresh
# create new date col as copy of original
df['Date_filtered'] = df['Date']
# modify values that were caught in the mask
df.loc[m, 'Date_filtered'] = df['Date_filtered'][m].apply(lambda x: pd.Timestamp(x.year, x.day, x.month))
# also to posix timestamp
df['ts_filtered'] = df['Date_filtered'].values.astype(np.float) / 10**9
ax = df['ts'].plot(label='original')
ax = df['ts_filtered'].plot(label='filtered')
ax.legend()
While attempting to create a minimal reproducible example, I have actually solved my problem -- but I expect there is a more efficient and effective way to do what I'm trying to do...
# i first define a function to examine the dates
def disordered_muddle(date_series, future_first=True):
"""Check whether a series of dates is disordered or just muddled"""
disordered = []
muddle = []
dates = date_series
different_dates = pd.Series(dates.unique())
date = different_dates[0]
for i, d in enumerate(different_dates[1:]):
# we expect the date's dayofyear to decrease by one
if d.dayofyear!=date.dayofyear-1:
# unless the year is changing
if d.year!=date.year-1:
try:
# we check if the day and month are muddled
# if d.day > 12 this will cause an Exception
unmuddle = Timestamp(d.year,d.day,d.month)
if unmuddle.dayofyear==date.dayofyear-1:
muddle.append(d)
d = unmuddle
elif unmuddle.year==date.year-1:
muddle.append(d)
d = unmuddle
else:
disordered.append(d)
except:
disordered.append(d)
date=d
if len(disordered)==0 and len(muddle)==0:
return False
else:
return disordered, muddle
disorder, muddle = disordered_muddle(df['Date'])
# finally unmuddle the dates
date_correction = {}
for d in df['Date']:
if d in muddle:
date_correction[d] = Timestamp(d.year, d.day, d.month)
else:
date_correction[d] = Timestamp(d.year, d.month, d.day)
df['CorrectedDate'] = df['Date'].map(date_correction)
disordered_muddle(df['CorrectedDate'])
Related
I'm pulling data from an API and placing it into a Pandas dataframe. I want to then create a new df that includes only the rows that have today's date in. I know how to select between two static dates, but can't seem to filter by a 'today' timestamp.
from matplotlib import pyplot as plt
#Access API
r = requests.get('REMOVED')
x = r.json()
keys = x.keys()
old_df = pd.DataFrame(x['results'])
#set dataframe
df = old_df[['valid_from','valid_to','value_inc_vat']].copy()
df['valid_from'] = pd.to_datetime(df['valid_from'])
df['valid_to'] = pd.to_datetime(df['valid_to'])
#only today's rows
today = pd.Timestamp.today().date()
mask = (df['from'] == today)
df_today = df.loc[mask]```
Use Series.dt.date for compare by dates:
mask = (df['from'].dt.date == today)
df_today = df[mask]
I have DataFrame like below:
df = pd.DataFrame({"data" : ["25.01.2020", and many more other dates...]})
df["data"] = pd.to_datetime(df["data"], format = "%d%m%Y")
And I have a series of special dates like below:
special_date = pd.Series(pd.to_datetime(["16.01.2020",
"27.01.2020",
and many more other dates...], dayfirst=True))
And I need to calculate 2 more columns in this DataFrame:
col1 = number of weeks to the next special date
col2 = number of weeks after las special date
So I need results like below:
col1 = 1 because next special date after 25.01 is 27.01 so it is the same week
col2 = 2 because last special date before 25.01 is 16.01 so i is 2 weeks ago
*please be aware that I have many more dates, so code needs to work for more dates than only 2 special dates or only 1 data from df.
You can use broadcasting to create a matrix of time deltas and than calculate the minima for your new columns
import numpy as np, pandas as pd
df = pd.DataFrame({'data': pd.to_datetime(["01.01.2020","25.01.2020","20.02.2020"], dayfirst=True)})
s = pd.Series(pd.to_datetime(["16.01.2020","27.01.2020","08.02.2020","19.02.2020"], dayfirst=True))
delta = (s.to_numpy()[:,None] - df['data'].to_numpy()).astype('timedelta64[D]') / np.timedelta64(1, 'D')
n = np.min( delta, 0, where=delta> 0, initial=np.inf)
p = np.min(-delta, 0, where=delta<=0, initial=np.inf)
df['next'] = np.ceil(n/7) #consider np.floor
df['prev'] = np.ceil(p/7)
Alternatively to using the where argument you could perform the steps by hand:
n = delta.copy(); n[delta<=0] = np.inf; n = np.abs(np.min(n,0))
p = delta.copy(); p[delta> 0] = -np.inf; p = np.abs(np.min(-p,0))
I have below dataframe called "df" and calculating the sum by unique id called "Id".
Can anyone help me in optimizing the code i have tried.
import pandas as pd
from datetime import datetime, timedelta
df= {'Date':['2019-01-11 10:23:45','2019-01-09 10:23:45', '2019-01-11 10:27:45',
'2019-01-11 10:25:45', '2019-01-11 10:30:45', '2019-01-11 10:35:45',
'2019-02-09 10:25:45'],
'Id':['100','200','300','100','100', '100','200'],
'Amount':[200,400,330,100,300,200,500],
}
df= pd.DataFrame(df)
df["Date"] = pd.to_datetime(df['Date'])
You can try to use groupby, after this each adjust within sub-groupby not to the whole df
s = {}
for x , y in df.groupby(['Id','NCC']):
for i in y.index:
start_date = y['Date'][i] - timedelta(seconds=300)
end_date = y['Date'][i]
mask = (y['Date'] >= start_date) & (y['Date'] < end_date)
count = y.loc[mask]
count = count.loc[(y['Sys'] == 1)]
if len(count) == 0:
s.update({i : 0})
else:
s.update({i : count['Amount'].sum()})
df['New']=pd.Series(s)
If the original data frame has 2 million rows, it would probably be faster to convert the 'Date' column to an index and sort it. Then you can sub select each 5-minute interval:
df = df.set_index('Date').sort_index()
df['Sum_Amt'] = 0
for end in df.index:
start = end - pd.Timedelta('5min')
current_window = df[start : end] # data frame with 5-minute look-back
sum_amt = <calc logic applied to `current_window` goes here>
df.at[end, 'Sum_Amt'] = sum_amt
print(current_window)
print()
I'm not following the logic for calculating Sum_Amt, so I left that out.
I'm making a function to calculate the time difference between two durations using Pandas.
The function is:
def time_calc(dur1, dur2):
date1 = pd.to_datetime(pd.Series(dur2))
date2 = pd.to_datetime(pd.Series(dur1))
df = pd.DataFrame(dict(ID = ids, DUR1 = date2, DUR2 = date1))
df1 = pd.DataFrame(dict(ID = ids, Duration1 = date2, Duration2 = date1))
df1['Duration1'] = df['DUR1'].dt.strftime('%H:%M:%S.%f')
df1['Duration2'] = df['DUR2'].dt.strftime('%H:%M:%S.%f')
cols = df.columns.tolist()
cols = ['ID', 'DUR1', 'DUR2']
df = df[cols]
df['diff_seconds'] = df['DUR2'] - df['DUR1']
df['diff_seconds'] = df['diff_seconds']/np.timedelta64(1,'s')
df['TimeDelta'] = df['diff_seconds'].apply(lambda d: str(datetime.timedelta(seconds=abs(d))))
df3 = df1.merge(df, on='ID')
cols = df3.columns.tolist()
cols = ['ID', 'Duration1', 'Duration2', 'TimeDelta', 'diff_seconds']
df3 = df3[cols]
print(df3)
The math is: Duration2-Duration1=TimeDelta
The function does it nicely:
Duration1 Duration2 TimeDelta diff_seconds
00:00:23.999891 00:00:25.102076 0:00:01.102185 1.102185
00:00:43.079173 00:00:44.621481 0:00:01.542308 1.542308
But when Duration2 < Duration1 we have a negative diff_seconds, but TimeDelta is still positive:
Duration1 Duration2 TimeDelta diff_seconds
00:05:03.744332 00:04:58.008081 0:00:05.736251 -5.736251
So what I need my function to do is to convert TimeDelta to negative value like this:
Duration1 Duration2 TimeDelta diff_seconds
00:05:03.744332 00:04:58.008081 -0:00:05.736251 -5.736251
I suppose that I need to convert 'TimeDelta' in another way, but all my attempts were useless.
I'll be very thankful if somebody will help me with this.
Thanks in advance!
I've solved this issue.
Made one by one timestamp picking logic and pass timestamps to 'time_convert' function
df['diff_seconds'] = df['DUR2'] - df['DUR1']
df['diff_seconds'] = df['diff_seconds']/np.timedelta64(1,'s')
for i in df['diff_seconds']:
df['TimeDelta'] = time_convert(i)
And the time_convert function just appends "-" to formatted timestamp if the seconds were negative:
def time_convert(d):
if d > 0:
lst.append(str(datetime.timedelta(seconds=d)))
else:
lst.append('-' + str(datetime.timedelta(seconds=abs(d))))
And then, I've just created new data frame using lst, and merged all together
df_t = pd.DataFrame(dict(ALERTS = alerts, TimeDelta = lst))
df_f = df_t.merge(df3, on='ID')
Hope this will help somebody.
I am trying to loop over a dataframe and apply a customer function, however my date column keeps either corrupting, or adds brackets to each element
Does anyone know what I am doing wrong?
import numpy as np
import string
import random
# This is a the custom function I use
def summarise_dummy(x):
d = {}
date_index = x['groups_2'] == max(x['groups_2'])
d['date'] = x['date'][date_index] # do something with date
d['y'] = x['y'][date_index] # do something with y
return pd.Series(d, index=['date', 'y']) # return a series
# Generate some dummy data
todays_date = datetime.datetime.now().date()
date = pd.date_range(todays_date-datetime.timedelta(10), periods=10, freq='D')
columns = ['y']
data = [random.randint(0,10) for i in range(0,10)]
df = pd.DataFrame(data, columns=columns)
df['date'] = date
random.choice(string.letters)
df['date'] = pd.to_datetime(df['date'])
df['groups_1'] = list(np.random.choice(list(string.ascii_lowercase[0:5]), 10))
df['groups_2'] = list(np.random.choice(list(string.ascii_lowercase[0:2]), 10))
# ***
#df.loc[:,'date'] = df.loc[:,'date'].dt.strftime('%Y-%m-%d')
# Apply the function for each group_1
grouped = df.groupby(['groups_1'])
summarised = grouped.apply(summarise_dummy)
# Upon expecting the date column, they are all Nat. However if you uncomment *** (above)
# and re-run, dates are returned?
summarised['date']
# But when I finally run with *** un-commented and convert my output to a json, date has []'s in it's series
summarised_json = summarised.to_json(orient='records')
What final output are you looking to get?
Does it work if you change pd.Series to pd.DataFrame within def summarise_dummy(x), setting the date and y along the columns.
import numpy as np
import string
import random
import pandas as pd
import datetime
# This is a the custom function I use
def summarise_dummy(x):
d = {}
date_index = x['groups_2'] == max(x['groups_2'])
d['date'] = x['date'][date_index] # do something with date
d['y'] = x['y'][date_index] # do something with y
return pd.DataFrame(d, columns=['date', 'y']) # return a series
# Generate some dummy data
date = pd.date_range(datetime.datetime.now().date() - datetime.timedelta(10), periods=10, freq='D')
print(date)
columns = ['y']
data = [random.randint(0, 10) for i in range(0, 10)]
df = pd.DataFrame(data, columns=columns)
df['date'] = date
random.choice(string.ascii_letters)
# df['date'] = pd.to_datetime(df['date'])
df['groups_1'] = list(np.random.choice(list(string.ascii_lowercase[0:5]), 10))
df['groups_2'] = list(np.random.choice(list(string.ascii_lowercase[0:2]), 10))
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
print(df)
# Apply the function for each group_1
grouped = df.groupby(['groups_1'])
summarised = grouped.apply(summarise_dummy)
print(summarised)
# Upon expecting the date column, they are all Nat. However if you uncomment *** (above)
# and re-run, dates are returned?
# But when I finally run with *** un-commented and convert my output to a json, date has []'s in it's series
summarised_json = summarised.to_json(orient='records')
print(summarised_json)
After apply:
date y
groups_1
a 9 2018-08-21 0
b 6 2018-08-18 7
c 4 2018-08-16 0
7 2018-08-19 5
8 2018-08-20 1
d 1 2018-08-13 6
3 2018-08-15 8
e 5 2018-08-17 1
After to_json:
[{"date":"2018-08-21","y":0},{"date":"2018-08-18","y":7},{"date":"2018-08-16","y":0},{"date":"2018-08-19","y":5},{"date":"2018-08-20","y":1},{"date":"2018-08-13","y":6},{"date":"2018-08-15","y":8},{"date":"2018-08-17","y":1}]
Additionally, you can configure the json format with orient.