I'm making a function to calculate the time difference between two durations using Pandas.
The function is:
def time_calc(dur1, dur2):
    """Print a comparison table of two duration series with their signed difference.

    NOTE(review): relies on a module-level `ids` sequence for the ID column —
    TODO confirm it is aligned with dur1/dur2.
    """
    date1 = pd.to_datetime(pd.Series(dur2))
    date2 = pd.to_datetime(pd.Series(dur1))
    df = pd.DataFrame(dict(ID=ids, DUR1=date2, DUR2=date1))
    df1 = pd.DataFrame(dict(ID=ids, Duration1=date2, Duration2=date1))
    # Human-readable H:M:S.microsecond strings for display.
    df1['Duration1'] = df['DUR1'].dt.strftime('%H:%M:%S.%f')
    df1['Duration2'] = df['DUR2'].dt.strftime('%H:%M:%S.%f')
    df = df[['ID', 'DUR1', 'DUR2']]
    # Signed difference in seconds (negative when DUR2 < DUR1).
    df['diff_seconds'] = (df['DUR2'] - df['DUR1']) / np.timedelta64(1, 's')
    # Fix: keep the sign in the formatted delta — prefix '-' for negatives.
    # The original applied abs() unconditionally, so TimeDelta was always positive.
    df['TimeDelta'] = df['diff_seconds'].apply(
        lambda d: ('-' if d < 0 else '') + str(datetime.timedelta(seconds=abs(d))))
    df3 = df1.merge(df, on='ID')
    df3 = df3[['ID', 'Duration1', 'Duration2', 'TimeDelta', 'diff_seconds']]
    print(df3)
The math is: Duration2-Duration1=TimeDelta
The function does it nicely:
Duration1 Duration2 TimeDelta diff_seconds
00:00:23.999891 00:00:25.102076 0:00:01.102185 1.102185
00:00:43.079173 00:00:44.621481 0:00:01.542308 1.542308
But when Duration2 < Duration1 we have a negative diff_seconds, but TimeDelta is still positive:
Duration1 Duration2 TimeDelta diff_seconds
00:05:03.744332 00:04:58.008081 0:00:05.736251 -5.736251
So what I need my function to do is to convert TimeDelta to negative value like this:
Duration1 Duration2 TimeDelta diff_seconds
00:05:03.744332 00:04:58.008081 -0:00:05.736251 -5.736251
I suppose that I need to convert 'TimeDelta' in another way, but all my attempts were useless.
I'll be very thankful if somebody will help me with this.
Thanks in advance!
I've solved this issue.
I added logic that picks the timestamps one by one and passes each of them to the 'time_convert' function.
# Signed difference in seconds between the two duration columns.
df['diff_seconds'] = df['DUR2'] - df['DUR1']
df['diff_seconds'] = df['diff_seconds'] / np.timedelta64(1, 's')
# time_convert() appends each formatted value to the module-level `lst`;
# assign the collected list to the column once, after the loop.
# (The original assigned time_convert's None return to the whole column
# on every iteration, wiping it out.)
for i in df['diff_seconds']:
    time_convert(i)
df['TimeDelta'] = lst
And the time_convert function just appends "-" to formatted timestamp if the seconds were negative:
def time_convert(d):
    """Append `d` (seconds, possibly negative) to the module-level `lst`
    as an H:MM:SS.ffffff string, with a '-' prefix for negative values.

    NOTE(review): side-effect API — relies on a module-level list `lst`
    existing before the call.
    """
    if d >= 0:  # was `d > 0`: zero now formats as '0:00:00', not '-0:00:00'
        lst.append(str(datetime.timedelta(seconds=d)))
    else:
        lst.append('-' + str(datetime.timedelta(seconds=abs(d))))
And then, I've just created new data frame using lst, and merged all together
# Build a frame from the collected TimeDelta strings and merge it back.
# NOTE(review): df_t has columns ALERTS/TimeDelta but the merge key is 'ID' —
# this raises KeyError unless ALERTS actually holds the IDs; verify.
df_t = pd.DataFrame(dict(ALERTS = alerts, TimeDelta = lst))
df_f = df_t.merge(df3, on='ID')
Hope this will help somebody.
Related
I've the following code:
def excel_date(date1):
    """Convert a datetime to an Excel serial day number (1900 date system).

    Excel's epoch is 1899-12-30; the fractional part encodes the time of
    day. A literal 0 input is passed through as 0.0.
    """
    epoch = datetime.datetime(1899, 12, 30)
    if date1 == 0:
        return 0.0
    elapsed = date1 - epoch
    return float(elapsed.days) + float(elapsed.seconds) / 86400
df3['SuperID'] = df3['Break_date'].apply(excel_date)
# Fix: str(series) stringifies the WHOLE Series (index and all) into one
# string; convert per-element instead. Cast through int first so the
# serial has no trailing '.0' (gives 'TSLA44462', not 'TSLA44462.0').
df3['SuperID2'] = df3['ticker'] + df3['SuperID'].astype(int).astype(str)
Where I use a date to insert in date1 and I get a number from the excel date function.
My ticker and SuperID fields are OK:
I want to concatenate both and get TSLA44462 BUT it's concatenating the whole series if I use str() or .astype(str) in my SuperID column.
The column types:
Here is my solution, if I understood your problem correctly:
import pandas as pd

# Example frame: a float column, a name column and an unrelated column.
data = {
    "Col1": [1.0, 2.0, 3.0, 4.4],
    "Col2": ["Michel", "Sardou", "Paul", "Jean"],
    "Other Col": [2, 3, 5, 2],
}
df = pd.DataFrame(data)
# Drop the decimals, render as text, and join with the name column.
df["Concat column"] = df["Col1"].astype(int).astype(str).str.cat(df["Col2"])
# Select the row whose concatenated key matches.
df[df["Concat column"] == "1Michel"]
or
# Same example data, filtered on both columns at once instead of
# building a concatenated key first.
df = pd.DataFrame({
    "Col1": [1.0, 2.0, 3.0, 4.4],
    "Col2": ["Michel", "Sardou", "Paul", "Jean"],
    "Other Col": [2, 3, 5, 2],
})
df[(df["Col1"] == 1) & (df["Col2"] == "Michel")]
After some hours of investigation and the help of comments the way to work with series, integers, floats and strings which worked for me is this:
def excel_date(date1):
    """Return the Excel serial date (1899-12-30 epoch) for `date1`.

    The integer part counts days since the epoch; the fraction is the
    time of day. A non-datetime 0 maps to the epoch itself (0.0).
    """
    origin = datetime.datetime(1899, 12, 30)
    delta = (date1 - origin) if date1 != 0 else datetime.timedelta(0)
    return float(delta.days) + float(delta.seconds) / 86400
First of all I convert float to integer to avoid decimals. int(x) is not feasible for series, so you better use .astype(int) which works fine.
df3['SuperID'] = df3['Break_date'].apply(excel_date).astype(int)
After that, convert everything to char with char.array and not str(x) or .astype. You then just need to sum columns using .astype(str) to get the desired result.
# Concatenate ticker and serial element-wise via numpy chararrays.
# NOTE(review): np.char.array expects string-like input — SuperID must
# already be string-convertible here; plain
# df3['ticker'] + df3['SuperID'].astype(str) would be simpler — verify.
a = np.char.array(df3['ticker'].values)
b = np.char.array(df3['SuperID'].values)
df3['SuperID2'] = (a + b).astype(str)
Hope this help to others working with series.
regards
I have the following dataframe containing start & end dates of available slots (called 'operability_df_x' in the code):
I'm trying to calculate how many hours are available per month, let's call it the operability ratio. The output is the following (called 'g' in the code):
The current code I wrote is the following, however since I'm new to pandas & python I get the impression there is a lot of redundancy. Is there an easier way to achieve my goal directly without writing 30 lines of code ? Thanks in advance for any piece of advice
# Split any availability interval that straddles a month boundary, then
# aggregate the available hours per calendar month.
# NOTE(review): indentation reconstructed from a flattened paste — verify
# the nesting of the window-append against the original notebook.
for i in operability_df_x.index :
    # Month/year of this interval's start.
    a = operability_df_x['Start Date'].loc[i].month
    b = operability_df_x['Start Date'].loc[i].year
    # First instant of the following month.
    # NOTE(review): a+1 raises ValueError for December starts — confirm the
    # data never contains a December start date.
    c = datetime.datetime(b,a+1,1,0,0,0)
    # The following condition is in case there is an interval that stretches in two months :
    window = chain_seq_glob[(chain_seq_glob['Start Date'] < c) & (chain_seq_glob['End Date'] > c)].reset_index(drop=True)
    if not window.empty :
        if (operability_df_x.iloc[i].equals(window.iloc[0])):
            # Truncate the current row at the month boundary and restart
            # the spilled-over remainder at that boundary.
            operability_df_x['End Date'].loc[i] = c
            window['Start Date'].loc[0] = c
            # NOTE(review): DataFrame.append was removed in pandas 2.x —
            # pd.concat is the replacement; verify target pandas version.
            operability_df_x = operability_df_x.append(window)
# Interval length in (fractional) hours.
operability_df_x['Duration [Hours]'] = (operability_df_x['End Date'] - operability_df_x['Start Date'] ) / pd.to_timedelta(1, unit ='h')
operability_df_x = operability_df_x[['Start Date','End Date', 'Duration [Hours]']]
operability_df_x = operability_df_x.sort_values(by="Start Date").reset_index(drop=True)
operability_df_x.to_csv('Operab.csv')
### Final operability table:
# Sum the available hours per month (pd.Grouper labels with the month end).
g = operability_df_x.set_index('Start Date').groupby(pd.Grouper(freq="M")).sum().reset_index(drop=False)
g['Year/Month'] = g['Start Date'].apply(lambda x: x.strftime('%Y-%m'))
g = g.reindex(columns=['Year/Month','Duration [Hours]','Start Date'])
g.columns = ['Year/Month','Total Available Hours','End month']
# Day number of the month-end label * 24 = total hours in that month.
g['Total Monthly Hours'] = (g['End month'].apply(lambda x: int(x.strftime('%d'))))*24
g['Operability ratio'] = g['Total Available Hours'] / g['Total Monthly Hours']
# NOTE(review): the positional axis argument to drop is deprecated — prefer axis=1.
g = g.drop(['End month','Total Monthly Hours'], 1)
Let's consider an example dataframe:
# Example data: four availability slots, two of which cross into February.
starts = pd.to_datetime(['20200103', '20200104', '20200123', '20200205'])
ends = pd.to_datetime(['20200105', '20200107', '20200203', '20200209'])
df = pd.DataFrame({'start': starts, 'end': ends})
Let's also define a utility function:
def intervals(row):
    """Split the [row.start, row.end] interval at month boundaries.

    Returns a list of (start, end) timestamp pairs, one per calendar month
    the row touches, so hours can later be attributed to the correct month.
    Generalizes the original two-way split: a row spanning more than two
    months now yields one pair per month instead of mislabeling the excess.
    """
    def next_month_start(ts):
        # First instant of the month after `ts` (time-of-day zeroed).
        if ts.month == 12:
            return ts.replace(year=ts.year + 1, month=1, day=1,
                              hour=0, minute=0, second=0, microsecond=0)
        return ts.replace(month=ts.month + 1, day=1,
                          hour=0, minute=0, second=0, microsecond=0)

    pieces = []
    cur = row.start
    while (cur.year, cur.month) != (row.end.year, row.end.month):
        boundary = next_month_start(cur)
        pieces.append((cur, boundary))
        cur = boundary
    pieces.append((cur, row.end))
    return pieces
Now, let's use it to get a list of intervals (either one or two) for each row depending on how many months the row spans:
df['intervals'] = df.apply(intervals, axis=1)
Now, let's explode this list so that each interval has a row of its own:
# Flatten: one row per (start, end) tuple, then rebuild the two columns.
df = df.explode('intervals')['intervals']
df = pd.DataFrame(df.tolist(), columns=['start', 'end'])
Let's add a column we'll use later for grouping:
df['month'] = df['start'].dt.strftime('%Y-%m')
And one for the number of hours between start and end:
df['hours'] = (df['end'] - df['start']).astype('timedelta64[h]')
I'm sure there is a better way to get the total number of hours for a given month. I'm doing it by adding two separate columns for the beginning of the current month and the beginning of the next month. Then, I add yet another column to store the difference between the two:
# Beginning of the month each interval starts in (zero the full time part;
# the original forgot minute=0 and microsecond=0).
df['month_start'] = df['start'].apply(
    lambda d: d.replace(day=1, hour=0, minute=0, second=0, microsecond=0))
# Beginning of the following month. `month % 12 + 1` rolls Dec -> Jan;
# the original `(d.month+1) % 12` produced month 0 for November, which
# makes Timestamp.replace raise ValueError.
df['month_end'] = df['month_start'].apply(
    lambda d: d.replace(year=d.year + (d.month == 12), month=d.month % 12 + 1))
# Total hours in the month, used as the ratio denominator.
df['total_hours'] = (df['month_end'] - df['month_start']).astype('timedelta64[h]')
Finally, perform the group-by and aggregate:
# Per-month totals: sum the available hours; total_hours is constant
# within a month, so 'first' suffices.
df = df.groupby('month').agg({'hours': 'sum', 'total_hours': 'first'})
df['ratio'] = df['hours'] / df['total_hours']
There are lots of built-in utility functions for dates I am not familiar with so I'm sure some of the stages could be substituted for an idiomatic expression but this works and is quite readable.
I have below dataframe called "df" and calculating the sum by unique id called "Id".
Can anyone help me in optimizing the code i have tried.
import pandas as pd
from datetime import datetime, timedelta

# Example data: event timestamps, account ids and amounts.
raw = {'Date':['2019-01-11 10:23:45','2019-01-09 10:23:45', '2019-01-11 10:27:45',
               '2019-01-11 10:25:45', '2019-01-11 10:30:45', '2019-01-11 10:35:45',
               '2019-02-09 10:25:45'],
       'Id':['100','200','300','100','100', '100','200'],
       'Amount':[200,400,330,100,300,200,500],
       }
df = pd.DataFrame(raw)
# Parse the string timestamps into a proper datetime column.
df["Date"] = pd.to_datetime(df['Date'])
You can try using groupby; after that, each adjustment is applied within its sub-group rather than to the whole df.
# Rolling 5-minute look-back sum of 'Amount' per (Id, NCC) group.
# NOTE(review): 'NCC' and 'Sys' are not columns of the example df above —
# confirm they exist in the real data.
s = {}
for x , y in df.groupby(['Id','NCC']):
    for i in y.index:
        # Window: the 300 seconds strictly before this row's timestamp.
        start_date = y['Date'][i] - timedelta(seconds=300)
        end_date = y['Date'][i]
        mask = (y['Date'] >= start_date) & (y['Date'] < end_date)
        count = y.loc[mask]
        # Keep only window rows flagged Sys == 1.
        count = count.loc[(y['Sys'] == 1)]
        if len(count) == 0:
            s.update({i : 0})
        else:
            s.update({i : count['Amount'].sum()})
# Map the per-row sums back to the frame by index.
df['New']=pd.Series(s)
If the original data frame has 2 million rows, it would probably be faster to convert the 'Date' column to an index and sort it. Then you can sub select each 5-minute interval:
# Sort by time so each 5-minute look-back becomes a cheap index slice.
df = df.set_index('Date').sort_index()
df['Sum_Amt'] = 0
for end in df.index:
    start = end - pd.Timedelta('5min')
    current_window = df[start : end] # data frame with 5-minute look-back
    # Placeholder, not runnable as-is — replace with the real aggregation.
    sum_amt = <calc logic applied to `current_window` goes here>
    df.at[end, 'Sum_Amt'] = sum_amt
    print(current_window)
    print()
I'm not following the logic for calculating Sum_Amt, so I left that out.
I tried the following code.
The result1 is filtered by a given date, but the result2 isn't filtered.
How can I filter by date in a function?
import pandas as pd
# CSV of the labor-force series hosted on a GitHub gist.
over20='https://gist.githubusercontent.com/shinokada/dfcdc538dedf136d4a58b9bcdcfc8f18/raw/d1db4261b76af67dd67c00a400e373c175eab428/LNS14000024.csv'
# Network read: fetches the CSV over HTTPS.
df_over20 = pd.read_csv(over20)
# `display` is the IPython/Jupyter rich renderer (notebook context assumed).
display(df_over20)
# String comparison works here only because DATE is ISO-formatted (YYYY-MM-DD).
result1=df_over20[df_over20['DATE']>='1972-01-01']
display(result1)
def changedate(item):
    """Parse item['DATE'] to datetimes and return the rows from 1972-01-01 on.

    Converts the DATE column of the passed frame in place (same side effect
    as before). Fix: return the *filtered* frame — the original built the
    boolean selection `item[item['DATE'] >= start]` but discarded it and
    returned the unfiltered frame.
    """
    item['DATE'] = pd.to_datetime(item['DATE'])
    start = pd.to_datetime('1972-01-01')
    return item[item['DATE'] >= start]
# Run the DATE conversion/filter helper on the gist dataframe and render it.
result2=changedate(df_over20)
display(result2)
In my experience I would make the Date column the index by running:
# Promote DATE to the index, then drop the now-redundant column.
# Fix: the original used curly "smart quotes" (“DATE”), which are not
# valid Python string delimiters and raise SyntaxError.
df.index = df["DATE"]
df.drop("DATE", inplace=True, axis=1)
Try to use the index column
# Build the cutoff and slice on the datetime index.
# Fixes: smart quotes were invalid syntax, and datetime() takes numeric
# components (year, month, day), not an ISO string.
date = DT.datetime(2020, 4, 1)
x = df[df.index > date]
You can also use the following command to make sure your index is a datetime index
df.index = pd.to_datetime( df.index )
You should not compare datetimes as raw strings; doing so can give bad results.
Please use the following instead.
import datetime
def compare (date1,date2):
    """Three-way comparison of two ISO-format date/time strings.

    Returns 1 if date1 is later, 0 if equal, -1 if earlier — comparing
    parsed timestamps rather than the raw strings.
    """
    t1 = datetime.datetime.fromisoformat(date1).timestamp()
    t2 = datetime.datetime.fromisoformat(date2).timestamp()
    if t1 > t2:
        return 1
    if t1 == t2:
        return 0
    return -1
I have a million-row time-series dataframe, in which some of the values in the Date column have muddled day/month values.
How do I efficiently unmuddle them without also ruining those that are correct?
# this creates a dataframe with muddled dates
import pandas as pd
import numpy as np
from pandas import Timestamp

start = Timestamp(2013,1,1)
dates = pd.date_range(start, periods=942)[::-1]
# Swap day and month whenever the day could pass for a month (day < 13);
# dates with day >= 13 are left untouched.
muddler = {
    d: Timestamp(d.year, d.day, d.month) if d.day < 13 else Timestamp(d.year, d.month, d.day)
    for d in dates
}
df = pd.DataFrame({'Date': dates})
df['Date'] = df['Date'].map(muddler)
# now what? (assuming I don't know how the dates are muddled)
An option might be to calculate a fit for the timestamps and modify those that deviate from the fit greater than a certain threshold. Example:
# Detect muddled day/month entries by fitting a line to the POSIX
# timestamps and flagging values that deviate far from the fit.
import pandas as pd
import numpy as np

start = pd.Timestamp(2013,1,1)
dates = pd.date_range(start, periods=942)[::-1]
muddler = {}
for d in dates:
    if d.day < 13:
        muddler[d] = pd.Timestamp(d.year, d.day, d.month)
    else:
        muddler[d] = pd.Timestamp(d.year, d.month, d.day)
df = pd.DataFrame()
df['Date'] = dates
df['Date'] = df['Date'].map(muddler)
# convert date col to posix timestamp
# Fix: np.float was removed in NumPy 1.20+/2.0 — use the concrete np.float64.
# NOTE(review): casting datetime64 directly to float may warn/raise on newer
# NumPy; .view('int64') is the robust route — verify target NumPy version.
df['ts'] = df['Date'].values.astype(np.float64) / 10**9
# calculate a linear fit for ts col
x = np.linspace(df['ts'].iloc[0], df['ts'].iloc[-1], df['ts'].size)
df['ts_linfit'] = np.polyval(np.polyfit(x, df['ts'], 1), x)
# set a thresh and derive a mask that masks differences between
# fit and timestamp greater than thresh:
thresh = 1.2e6 # you might want to tweak this...
m = np.absolute(df['ts']-df['ts_linfit']) > thresh
# create new date col as copy of original
df['Date_filtered'] = df['Date']
# modify values that were caught in the mask (swap day/month back)
df.loc[m, 'Date_filtered'] = df['Date_filtered'][m].apply(lambda x: pd.Timestamp(x.year, x.day, x.month))
# also to posix timestamp
df['ts_filtered'] = df['Date_filtered'].values.astype(np.float64) / 10**9
ax = df['ts'].plot(label='original')
ax = df['ts_filtered'].plot(label='filtered')
ax.legend()
While attempting to create a minimal reproducible example, I have actually solved my problem -- but I expect there is a more efficient and effective way to do what I'm trying to do...
# i first define a function to examine the dates
def disordered_muddle(date_series, future_first=True):
    """Check whether a series of dates is disordered or just muddled.

    Walks consecutive unique dates (expected to descend one day at a
    time) and classifies each break: dates that would fit the sequence
    after swapping day and month are 'muddled'; the rest are 'disordered'.

    Returns False when the series is clean, otherwise a tuple
    (disordered, muddle) of the offending timestamps.

    NOTE(review): `future_first` is currently unused — TODO confirm intent.
    """
    disordered = []
    muddle = []
    different_dates = pd.Series(date_series.unique())
    date = different_dates[0]
    for d in different_dates[1:]:
        # we expect the date's dayofyear to decrease by one
        if d.dayofyear != date.dayofyear - 1:
            # unless the year is changing
            if d.year != date.year - 1:
                try:
                    # check whether day and month are muddled;
                    # d.day > 12 makes Timestamp raise ValueError here
                    unmuddle = Timestamp(d.year, d.day, d.month)
                    if unmuddle.dayofyear == date.dayofyear - 1:
                        muddle.append(d)
                        d = unmuddle
                    elif unmuddle.year == date.year - 1:
                        muddle.append(d)
                        d = unmuddle
                    else:
                        disordered.append(d)
                except ValueError:  # was a bare except: only the swap can fail
                    disordered.append(d)
        date = d
    if len(disordered) == 0 and len(muddle) == 0:
        return False
    return disordered, muddle
# Classify the dates, then rebuild a corrected column.
# NOTE(review): disordered_muddle returns False (not a 2-tuple) when the
# series is clean — this unpacking would raise TypeError in that case.
disorder, muddle = disordered_muddle(df['Date'])
# finally unmuddle the dates
date_correction = {}
for d in df['Date']:
    # NOTE(review): `d in muddle` is an O(len(muddle)) list scan per row —
    # converting muddle to a set would be faster on a million rows.
    if d in muddle:
        date_correction[d] = Timestamp(d.year, d.day, d.month)
    else:
        date_correction[d] = Timestamp(d.year, d.month, d.day)
df['CorrectedDate'] = df['Date'].map(date_correction)
# Re-check: expect False (clean) after the correction.
disordered_muddle(df['CorrectedDate'])