Converting imported dates from a .txt file to date format in Python

I have issues with converting dates in an imported .txt file and I wonder what I'm doing wrong.
I import the data by:
df_TradingMonthlyDates = pd.read_csv(TradingMonthlyDates, dtype=str, sep=',') # header=True,
and it looks like the following table (the dates represent the start/end of each month and the column header is Date):
Date
0 2008-12-30
1 2008-12-31
2 2009-01-01
3 2009-01-02
4 2009-01-29
.. ...
557 2020-06-29
558 2020-06-30
559 2020-07-01
560 2020-07-02
561 2020-07-30
.. ...
624 2021-11-30
625 2021-12-01
626 2021-12-02
627 2021-12-30
628 2021-12-31
[629 rows x 1 columns]
<class 'pandas.core.frame.DataFrame'>
I then calculate today's date:
df_EndDate = datetime.now().date()
I'm trying to apply the data above in this function to get the closest date before a given date (given date = today's date in my case):
# https://stackoverflow.com/questions/32237862/find-the-closest-date-to-a-given-date
def nearest(items, pivot):
    return min([i for i in items if i < pivot], key=lambda x: abs(x - pivot))
date_output = nearest(df_TradingMonthlyDates, df_EndDate)
# date_output should be = 2020-07-02 given today's date of 2020-07-12
The error messages I receive indicate that df_TradingMonthlyDates is not in date format, so I have tried to convert the dataframe to datetime format, but I can't make it work.
What I have tried to convert the data to date format:
# df_TradingMonthlyDates["Date"] = pd.to_datetime(df_TradingMonthlyDates["Date"], format="%Y-%m-%d")
# df_TradingMonthlyDates = datetime.strptime(df_TradingMonthlyDates, "%Y-%m-%d").date()
# df_TradingMonthlyDates['Date'] = df_TradingMonthlyDates['Date'].apply(lambda x: pd.to_datetime(x[0], format="%Y-%m-%d"))
# df_TradingMonthlyDates = df_TradingMonthlyDates.iloc[1:]
# print(df_TradingMonthlyDates)
# df_TradingMonthlyDates = datetime.strptime(str(df_TradingMonthlyDates), "%Y-%m-%d").date()
# for line in split_source[1:]: # skip the first line
Code:
import pandas as pd
from datetime import datetime
# Version 1
TradingMonthlyDates = "G:/MonthlyDates.txt"
# Import file where all the first/end month date exists
df_TradingMonthlyDates = pd.read_csv(TradingMonthlyDates, dtype=str, sep=',') # header=True,
print(df_TradingMonthlyDates)
# https://community.dataquest.io/t/datetime-and-conversion/213425
# df_TradingMonthlyDates["Date"] = pd.to_datetime(df_TradingMonthlyDates["Date"], format="%Y-%m-%d")
# df_TradingMonthlyDates = datetime.strptime(df_TradingMonthlyDates, "%Y-%m-%d").date()
# df_TradingMonthlyDates['Date'] = df_TradingMonthlyDates['Date'].apply(lambda x: pd.to_datetime(x[0], format="%Y-%m-%d"))
# df_TradingMonthlyDates = df_TradingMonthlyDates.iloc[1:]
# print(df_TradingMonthlyDates)
# df_TradingMonthlyDates = datetime.strptime(str(df_TradingMonthlyDates), "%Y-%m-%d").date()
# for line in split_source[1:]: # skip the first line # maybe header is the problem
print(type(df_TradingMonthlyDates))
df_TradingMonthlyDates = df_TradingMonthlyDates.datetime.strptime(df_TradingMonthlyDates, "%Y-%m-%d")
df_TradingMonthlyDates = df_TradingMonthlyDates.time()
print(df_TradingMonthlyDates)
df_EndDate = datetime.now().date()
print(type(df_EndDate))
# https://stackoverflow.com/questions/32237862/find-the-closest-date-to-a-given-date
def nearest(items, pivot):
    return min([i for i in items if i < pivot], key=lambda x: abs(x - pivot))
date_output = nearest(df_TradingMonthlyDates, df_EndDate)
The error messages differ depending on how I tried to convert the data type, but I read them all as saying that my date conversion was not successful:
df_TradingMonthlyDates = df_TradingMonthlyDates.datetime.strptime(df_TradingMonthlyDates, "%Y-%m-%d")
Traceback (most recent call last):
File "g:/till2.py", line 25, in <module>
df_TradingMonthlyDates = df_TradingMonthlyDates.datetime.strptime(df_TradingMonthlyDates, "%Y-%m-%d")
File "C:\Users\ID\AppData\Roaming\Python\Python38\site-packages\pandas\core\generic.py", line 5274, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'datetime'
df_TradingMonthlyDates["Date"] = pd.to_datetime(df_TradingMonthlyDates["Date"], format="%Y-%m-%d")
Traceback (most recent call last):
File "g:/till2.py", line 40, in <module>
date_output = nearest(df_TradingMonthlyDates, df_EndDate)
File "g:/till2.py", line 38, in nearest
return min([i for i in items if i < pivot], key=lambda x: abs(x - pivot))
File "g:/till2.py", line 38, in <listcomp>
return min([i for i in items if i < pivot], key=lambda x: abs(x - pivot))
TypeError: '<' not supported between instances of 'str' and 'datetime.date'

Edit: Added Method 3, which might be the easiest, using .loc and then .iloc.
You could take a slightly different approach (Method 1 or Method 2 below) by taking the absolute minimum of the difference between today's date and the data, but a key thing you weren't doing was wrapping pd.to_datetime() around the datetime.date object df_EndDate to turn it into a pandas datetime that can be compared against your Date column. Both sides have to be the same datetime type in order to be compared.
Method 1:
import pandas as pd
import datetime as dt
df_TradingMonthlyDates = pd.DataFrame({'Date': {'0': '2008-12-30',
                                                '1': '2008-12-31',
                                                '2': '2009-01-01',
                                                '3': '2009-01-02',
                                                '4': '2009-01-29',
                                                '557': '2020-06-29',
                                                '558': '2020-06-30',
                                                '559': '2020-07-01',
                                                '560': '2020-07-02',
                                                '561': '2020-07-30',
                                                '624': '2021-11-30',
                                                '625': '2021-12-01',
                                                '626': '2021-12-02',
                                                '627': '2021-12-30',
                                                '628': '2021-12-31'}})
df_TradingMonthlyDates['Date'] = pd.to_datetime(df_TradingMonthlyDates['Date'])
df_TradingMonthlyDates['EndDate'] = pd.to_datetime(dt.datetime.now().date())
df_TradingMonthlyDates['diff'] = (df_TradingMonthlyDates['Date'] - df_TradingMonthlyDates['EndDate'])
a=min(abs(df_TradingMonthlyDates['diff']))
df_TradingMonthlyDates = df_TradingMonthlyDates.loc[(df_TradingMonthlyDates['diff'] == a)
                                                    | (df_TradingMonthlyDates['diff'] == -a)]
df_TradingMonthlyDates
output 1:
Date EndDate diff
560 2020-07-02 2020-07-11 -9 days
If you don't want the extra columns and just want the date, assign the results to variables (plain Series) rather than to new columns:
Method 2:
d = pd.to_datetime(df_TradingMonthlyDates['Date'])
t = pd.to_datetime(dt.datetime.now().date())
e = (d-t)
a=min(abs(e))
df_TradingMonthlyDates = df_TradingMonthlyDates.loc[(e == a) | (e == -a)]
df_TradingMonthlyDates
output 2:
Date
560 2020-07-02
Method 3:
df_TradingMonthlyDates['Date'] = pd.to_datetime(df_TradingMonthlyDates['Date'])
date_output = df_TradingMonthlyDates.sort_values('Date') \
    .loc[df_TradingMonthlyDates['Date'] <=
         pd.to_datetime(dt.datetime.now().date())] \
    .iloc[-1, :]
date_output
output 3:
Date 2020-07-02
Name: 560, dtype: datetime64[ns]
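Alternatively, if you want to keep your original nearest() helper, the same point applies: convert both sides to the same datetime type, and pass the Date column (not the whole DataFrame) into the function. A minimal sketch of that, assuming the Date column parses with the %Y-%m-%d format:
import pandas as pd
from datetime import datetime
df_TradingMonthlyDates['Date'] = pd.to_datetime(df_TradingMonthlyDates['Date'], format='%Y-%m-%d')
today = pd.to_datetime(datetime.now().date())  # a pandas Timestamp, comparable with the Date column
def nearest(items, pivot):
    # keep only dates strictly before the pivot, then take the closest one
    return min([i for i in items if i < pivot], key=lambda x: abs(x - pivot))
# iterate over the column (a Series of Timestamps), not the whole DataFrame;
# iterating the DataFrame itself yields column names (strings), which is what caused the '<' TypeError
date_output = nearest(df_TradingMonthlyDates['Date'], today)
print(date_output)  # e.g. Timestamp('2020-07-02 00:00:00') for a run date of 2020-07-12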

Related

Time difference between two timedate columns without considering Non-business hours

I want to calculate the difference between two time columns without considering non-business hours. I have used pyholidays, which worked totally fine. But even when I define starttime and endtime for the business duration, the result still includes non-business hours, as shown in the attached photos.
for index, row in df.iterrows():
    first = row['New']
    second = row['Assigned']
    third = row['In Progress']
    if pd.notnull(second):
        starttime = (8,0,0)
        endtime = (17,0,0)
        holidaylist = pyholidays.Germany()
        unit = 'hour'
        row['AP'] = businessDuration(first, second, holidaylist=holidaylist, unit=unit)
    else:
        starttime = (8,0,0)
        endtime = (17,0,0)
        holidaylist = pyholidays.Germany()
        unit = 'hour'
        row['AP'] = businessDuration(first, third, holidaylist=holidaylist, unit=unit)
    ap.append(row['AP'])
[Screenshot: DataFrame]
[Screenshot: printed result]
Thank you for your suggestion. I have tried your method and I have also defined a calendar instance. Later I was getting a 'relativedelta' error, which I somehow solved via 'dateutil'. Now I am at the final stage of computing the business-hour difference between two columns.
de_holidays = pyholidays.Germany()
cal = Calendar(holidays=de_holidays, weekdays=['Saturday', 'Sunday'])
df['rp'] = df.apply(lambda row: compute_bizhours_diff(row['Resolved'], row['Pending'], cal=cal, biz_open_time=time(8, 0, 0), biz_close_time=time(17, 0, 0)), axis=1)
Now I am getting an error about the month number, which cannot be NaN. I have also attached photos of the errors.
[Screenshots: Pic1, Pic2 (error messages)]
I do not know if this works, but try this:
# == Imports needed ===========================
from __future__ import annotations
from typing import Any
import pandas as pd
import holidays as pyholidays
from datetime import time
from bizdays import Calendar
from dateutil.relativedelta import relativedelta
# == Functions ==================================
def is_null_dates(*dates: Any) -> bool:
    """Determine whether objects are valid dates.

    Parameters
    ----------
    dates : Any
        Variables to check whether they hold a valid date, or not.

    Returns
    -------
    bool
        True, if at least one informed value is not a date.
        False otherwise.
    """
    for date in dates:
        if pd.isna(pd.to_datetime(date, errors='coerce')):
            return True
    return False


def compute_bizhours_diff(
    start_date: str | pd.Timestamp,
    end_date: str | pd.Timestamp,
    biz_open_time: datetime.time | None = None,
    biz_close_time: datetime.time | None = None,
    cal: bizdays.Calendar | None = None,
) -> float:
    """Compute the number of business hours between two dates.

    Parameters
    ----------
    start_date : str | pd.Timestamp
        The first date.
    end_date : str | pd.Timestamp
        The final date.
    biz_open_time : datetime.time | None
        The beginning hour/minute of a business day.
    biz_close_time : datetime.time | None
        The ending hour/minute of a business day.
    cal : bizdays.Calendar | None
        The calendar object used to figure out the number of days between `start_date`
        and `end_date` that are not holidays. If None, consider every day as a business day,
        except Saturdays, or Sundays.

    Returns
    -------
    float
        The total number of business hours between `start_date`, and `end_date`.

    Examples
    --------
    >>> import holidays as pyholidays
    >>> from datetime import time
    >>> from bizdays import Calendar
    >>> # 2022-09-07 is a national holiday in Brazil, therefore only
    >>> # the hours between 2022-09-08 09:00:00, and 2022-09-08 15:48:00
    >>> # should be considered. This should equal 6.8 hours.
    >>> start_date = pd.to_datetime('2022-09-07 15:55:00')
    >>> end_date = pd.to_datetime('2022-09-08 15:48:00')
    >>> BR_holiday_list = pyholidays.BR(years={start_date.year, end_date.year}, state='RJ')
    >>> cal = Calendar(holidays=BR_holiday_list, weekdays=['Saturday', 'Sunday'])
    >>> print(compute_bizhours_diff(start_date, end_date, cal=cal))
    6.8
    >>> # Both dates in the next example are holidays, therefore, the result should be 0.0
    >>> start_date = pd.to_datetime('2022-09-07 15:55:00')
    >>> end_date = pd.to_datetime('2022-09-07 15:48:00')
    >>> print(compute_bizhours_diff(start_date, end_date, cal=cal))
    0.0
    >>> # What if the end_date preceeds start_date by mistake?
    >>> # In such cases, we switch start_date to end_date, and vice-versa.
    >>> start_date = pd.to_datetime('2022-09-02 00:00:00')
    >>> end_date = pd.to_datetime('2022-09-01 15:55:00')
    >>> print(compute_bizhours_diff(start_date, end_date, cal=cal))
    2.0833333333333335
    >>> # What if the start_date, and end_date begin and finish on the same day, but they both have timestamps that end before
    >>> # or after the business hours?
    >>> # In such cases, the total number of hours is equal to 0.0
    >>> start_date = pd.to_datetime('2022-09-02 00:00:00')
    >>> end_date = pd.to_datetime('2022-09-02 8:00:00')
    >>> print(compute_bizhours_diff(start_date, end_date, cal=cal))
    0.0
    """
    if is_null_dates(start_date, end_date):
        return pd.NA
    if biz_open_time is None:
        biz_open_time = time(9, 0, 0)
    if biz_close_time is None:
        biz_close_time = time(18, 0, 0)
    if cal is None:
        cal = Calendar(weekdays=['Saturday', 'Sunday'])

    open_delta = relativedelta(hour=biz_open_time.hour, minute=biz_open_time.minute)
    end_delta = relativedelta(hour=biz_close_time.hour, minute=biz_close_time.minute)

    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    _end_date = max(start_date, end_date)
    _start_date = min(start_date, end_date)
    start_date = _start_date
    end_date = _end_date

    start_date = (
        start_date if cal.isbizday(start_date) else cal.following(start_date) + open_delta
    )
    end_date = (
        end_date if cal.isbizday(end_date) else cal.preceding(end_date) + end_delta
    )
    if end_date < start_date:
        return 0.00

    start_date_biz = max(start_date, start_date + open_delta)
    end_first_day = start_date_biz + end_delta
    end_date_biz = min(
        end_date,
        end_date + end_delta
    )
    start_last_day = end_date_biz + open_delta

    if start_last_day > end_date:
        end_date_biz = start_last_day
    if end_first_day < start_date:
        end_first_day = start_date_biz

    if end_first_day.date() == end_date_biz.date():
        return (end_date_biz - start_date_biz).seconds / 3600

    return (
        (end_first_day - start_date_biz).seconds
        + (end_date_biz - start_last_day).seconds
        + (
            max((len(list(cal.seq(start_date, end_date))) - 2), 0)
            * (end_first_day - (start_date + open_delta)).seconds
        )
    ) / 3600
Before running the preceding code, you need to install the following packages, if you do not already have them:
pip install holidays bizdays
Link to both packages' documentation:
bizdays
python-holidays
Examples
Here is how you can use compute_bizhours_diff:
import pandas as pd
import holidays as pyholidays
from datetime import time
from bizdays import Calendar
# OPTIONAL: define custom start, and end to your business hours.
biz_open_time = time(9, 0, 0)
biz_close_time = time(18, 0, 0)
# Define your start, and end dates.
start_date = pd.to_datetime('2022-09-07 04:48:00')
end_date = pd.to_datetime('2022-09-10 15:55:00')
# Create a list of holidays, and create a Calendar instance.
BR_holiday_list = pyholidays.BR(years={start_date.year, end_date.year}, state='RJ')
# For German holidays, you can use something like:
German_holiday_list = pyholidays.Germany(years={start_date.year, end_date.year})
# Define the Calendar instance. Here, we use the German holidays, excluding Saturday, and Sunday from weekdays.
cal = Calendar(holidays=German_holiday_list, weekdays=['Saturday', 'Sunday'])
# Finally, compute the total number of working hours between your two dates:
compute_bizhours_diff(start_date, end_date, cal=cal)
# Returns: 27.0
You can also use the function with pandas dataframes, using apply:
df['working_hours_delta'] = df.apply(lambda row: compute_bizhours_diff(row[START_DATE_COLNAME], row[END_DATE_COLNAME], cal=cal), axis=1)
Notes
The function compute_bizhours_diff is far from perfect. Before using it in any production environment, or for any serious use case, I strongly recommend refactoring it.
Edit
I made some changes to the original answer, to account for instances where start_date, or end_date have null or invalid representations of dates.
Using the example dataframe from your question it now runs fine:
de_holidays = pyholidays.Germany()
cal = Calendar(holidays=de_holidays, weekdays=['Saturday', 'Sunday'])
df = pd.DataFrame(
    {
        'Assigned': [None, '2022-07-28 10:53:00', '2022-07-28 18:08:00', None, '2022-07-29 12:56:00'],
        'In Progress': ['2022-08-01 10:53:00', '2022-08-02 09:32:00', '2022-07-29 12:08:00', '2022-08-02 10:23:00', '2022-07-29 14:54:00'],
        'New': ['2022-07-27 15:01:00', '2022-07-28 10:09:00', '2022-07-28 13:37:00', '2022-07-29 00:12:00', '2022-07-29 09:51:00'],
    }
).apply(pd.to_datetime)
df['rp'] = df.apply(
    lambda row: compute_bizhours_diff(
        row['Assigned'], row['In Progress'], cal=cal, biz_open_time=time(8, 0, 0), biz_close_time=time(17, 0, 0)
    ), axis=1
)
print(df)
# Prints:
# Assigned In Progress New rp
# 0 NaT 2022-08-01 10:53:00 2022-07-27 15:01:00 <NA>
# 1 2022-07-28 10:53:00 2022-08-02 09:32:00 2022-07-28 10:09:00 25.65
# 2 2022-07-28 18:08:00 2022-07-29 12:08:00 2022-07-28 13:37:00 4.133333
# 3 NaT 2022-08-02 10:23:00 2022-07-29 00:12:00 <NA>
# 4 2022-07-29 12:56:00 2022-07-29 14:54:00 2022-07-29 09:51:00 1.966667

KeyError: During handling of the above exception, another exception occurred

I have this error:
KeyError: 'id_cont'
During handling of the above exception, another exception occurred:
<ipython-input-11-4604edb9a0b7> in generateID(self, outputMode, data_df)
84
85 if outputMode.getModeCB() == CONST_MODE_CONT:
---> 86 data_df['id_cont'] = data_df.apply(lambda row:row['product_name']+'-'+row['hour_local'],axis=1)
87 #data_df['id_cont'] = data_df.apply(lambda row:row['equipement']+'-'+row['product_name']+'-'+row['hour_shift'].strftime('%Y-%m-%d %H:%M:%S'),axis=1)
88 else:
/dataiku/dss_data/code-envs/python/Python3_6/lib/python3.6/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
2936 else:
2937 # set column
-> 2938 self._set_item(key, value)
2939
2940 def _setitem_slice(self, key, value):
ValueError: Wrong number of items passed 149, placement implies 1
Adding this line brings up this error; I think it's a data type problem:
data_df['id_cont'] = data_df.apply(lambda row:row['product_name']+'-'+row['hour_shift'].strftime('%Y-%m-%d %H:%M:%S'),axis=1)
hour_shift is a datetime, while product_name and equipment are of object dtype.
I think the reason you're getting this error is that data_df is an empty dataframe: no rows satisfy the condition data_df['hour_local'].isin(target_hours), so all hour_shift values are NaT and every row is dropped at data_df = data_df.dropna(subset=['hour_shift']). You can test this by using sample data whose hour_local values satisfy the condition versus sample data whose values don't.
Satisfies the condition:
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
data_df = pd.DataFrame({'local_time': [datetime.strptime("08:30:00",'%H:%M:%S'), datetime.strptime("08:24:00",'%H:%M:%S')], 'product_name': ['A', 'B']})
delta = timedelta(minutes=5)
# Start time
start_time = datetime.strptime("08:20:00",'%H:%M:%S')
cur_time = start_time
target_hours = []
while cur_time.date() <= start_time.date():
    target_hours.append(cur_time.time())
    cur_time += delta
data_df['hour_local'] = pd.to_datetime(data_df["local_time"].astype(str)).dt.time
data_df = data_df.drop(columns=['hour_shift'], errors='ignore')
data_df.loc[data_df['hour_local'].isin(target_hours),'hour_shift'] = data_df['local_time']
data_df = data_df.sort_values(by=['local_time'])
data_df['hour_shift'] = data_df['hour_shift'].ffill()
data_df = data_df.dropna(subset=['hour_shift'])
# This will print dataframe with one row
print(data_df)
data_df['id_cont'] = data_df.apply(lambda row:row['product_name']+'- '+row['hour_shift'].strftime('%Y-%m-%d %H:%M:%S'),axis=1)
print(data_df)
Does not satisfy the condition:
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
# NOTE: no data satisfy the below condition
data_df = pd.DataFrame({'local_time': [datetime.strptime("08:31:00",'%H:%M:%S'), datetime.strptime("08:24:00",'%H:%M:%S')], 'product_name': ['A', 'B']})
delta = timedelta(minutes=5)
# Start time
start_time = datetime.strptime("08:20:00",'%H:%M:%S')
cur_time = start_time
target_hours = []
while cur_time.date() <= start_time.date():
    target_hours.append(cur_time.time())
    cur_time += delta
data_df['hour_local'] = pd.to_datetime(data_df["local_time"].astype(str)).dt.time
data_df = data_df.drop(columns=['hour_shift'], errors='ignore')
data_df.loc[data_df['hour_local'].isin(target_hours),'hour_shift'] = data_df['local_time']
data_df = data_df.sort_values(by=['local_time'])
data_df['hour_shift'] = data_df['hour_shift'].ffill()
data_df = data_df.dropna(subset=['hour_shift'])
# This will print empty dataframe
print(data_df)
data_df['id_cont'] = data_df.apply(lambda row:row['product_name']+'- '+row['hour_shift'].strftime('%Y-%m-%d %H:%M:%S'),axis=1)
One way I think you can avoid this error is to add a check so that the apply line only runs if the dataframe is not empty:
if len(data_df):
    data_df['id_cont'] = data_df.apply(lambda row: row['product_name']+'- '+row['hour_shift'].strftime('%Y-%m-%d %H:%M:%S'), axis=1)
print(data_df)
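A further option (a sketch only, assuming hour_shift really holds datetime64 values) is to build the column with vectorized string concatenation and Series.dt.strftime, which returns an empty result on an empty frame instead of raising:
# vectorized alternative: no row-wise apply needed
# assumes hour_shift has a datetime64 dtype; if the frame can end up empty with an
# all-NaN hour_shift column, keep the emptiness guard above as well
data_df['id_cont'] = data_df['product_name'] + '- ' + data_df['hour_shift'].dt.strftime('%Y-%m-%d %H:%M:%S')
print(data_df)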

Error with pandas.dt while extracting year from a date

The data in test.csv are like this:
TIMESTAMP POLYLINE
0 1408039037 [[-8.585676,41.148522],[-8.585712,41.148639],[...
1 1408038611 [[-8.610876,41.14557],[-8.610858,41.145579],[-...
2 1408038568 [[-8.585739,41.148558],[-8.58573,41.148828],[-...
3 1408039090 [[-8.613963,41.141169],[-8.614125,41.141124],[...
4 1408039177 [[-8.619903,41.148036],[-8.619894,41.148036]]
.. ... ...
315 1419171485 [[-8.570196,41.159484],[-8.570187,41.158962],[...
316 1419170802 [[-8.613873,41.141232],[-8.613882,41.141241],[...
317 1419172121 [[-8.6481,41.152536],[-8.647461,41.15241],[-8....
318 1419171980 [[-8.571699,41.156073],[-8.570583,41.155929],[...
319 1419171420 [[-8.574561,41.180184],[-8.572248,41.17995],[-...
[320 rows x 2 columns]
I read them from the csv file in this way:
train = pd.read_csv("path/train.csv",engine='python',error_bad_lines=False)
So I have these timestamps in Unix format. I want to convert them to UTC time and then extract the year, month, day and so on.
This is the code for the conversion from Unix timestamp to UTC date time:
train["TIMESTAMP"] = [float(time) for time in train["TIMESTAMP"]]
train["data_time"] = [datetime.datetime.fromtimestamp(time, datetime.timezone.utc) for time in train["TIMESTAMP"]]
To extract year and other information I do this:
train["year"] = train["data_time"].dt.year
train["month"] = train["data_time"].dt.month
train["day"] = train["data_time"].dt.day
train["hour"] = train["data_time"].dt.hour
train["min"] = train["data_time"].dt.minute
But I obtain this error when the execution arrives at the extraction point:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-30-d2249cabe965> in <module>()
67 train["TIMESTAMP"] = [float(time) for time in train["TIMESTAMP"]]
68 train["data_time"] = [datetime.datetime.fromtimestamp(time, datetime.timezone.utc) for time in train["TIMESTAMP"]]
---> 69 train["year"] = train["data_time"].dt.year
70 train["month"] = train["data_time"].dt.month
71 train["day"] = train["data_time"].dt.day
2 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/accessors.py in __new__(cls, data)
478 return PeriodProperties(data, orig)
479
--> 480 raise AttributeError("Can only use .dt accessor with datetimelike values")
AttributeError: Can only use .dt accessor with datetimelike values
I also read a lot of similar discussions but I can't figure out why I get this error.
Edited:
So the train["TIMESTAMP"] data are like this:
1408039037
1408038611
1408039090
Then I do this with this data:
train["TIMESTAMP"] = [float(time) for time in train["TIMESTAMP"]]
train["data_time"] = [datetime.datetime.fromtimestamp(time, datetime.timezone.utc) for time in train["TIMESTAMP"]]
train["year"] = train["data_time"].dt.year
train["month"] = train["data_time"].dt.month
train["day"] = train["data_time"].dt.day
train["hour"] = train["data_time"].dt.hour
train["min"] = train["data_time"].dt.minute
train = train[["year", "month", "day", "hour","min"]]
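A hedged note on the error above: the "Can only use .dt accessor with datetimelike values" message indicates that data_time ended up as an object column holding plain Python datetime objects rather than pandas' datetime64 dtype, which is what .dt requires. A minimal sketch of a vectorized conversion that keeps the column datetimelike (assuming the timestamps are in seconds):
import pandas as pd
# convert the Unix timestamps (seconds) straight to a timezone-aware datetime64 column
train["data_time"] = pd.to_datetime(train["TIMESTAMP"], unit="s", utc=True)
train["year"] = train["data_time"].dt.year  # .dt works because the dtype is now datetime64[ns, UTC]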

How do I compare the product dates to today's date and create a new column where it indicates whether they have expired or not?

RM    RACK    CROP    Towers    Transplant date    harvest    harvest date
GR1   R1a     LLQ     7         2021-4-03          21         2021-4-24
O: overdue
H: need to clear by today
N: not due yet
Today's date: 2021-04-29
This is my code:
import pandas as pd
import time
from datetime import date
df = pd.read_csv("CGA Towers Spatial.csv")
growth = {'LLQ': 21, 'AZK': 24, 'JER': 21, 'MUS': 35, 'CHA': 28, 'SOR': 28, 'ICE': 35, 'TMD': 21}
df['HARVEST'] = df['CROP'].apply(lambda x:growth[x])
df['HARVEST DATE'] = df['TRANSPLANT DATE'] = pd.to_datetime(df['TRANSPLANT DATE'])
df['HARVEST DATE'] = df.apply(lambda x: x['TRANSPLANT DATE'] + pd.offsets.DateOffset(days=x['HARVEST']), 1)
df.to_csv('CGA Towers Spatial.csv',index=False)
df.set_index('RM', inplace=True)
pd.set_option('display.max_rows', 258)
current_date = time.strptime(pd.to_datetime(date.today()).strftime('%d/%m/%Y'), '%d/%m/%Y')
for index in df.index:
    date_check = time.strptime(pd.to_datetime(df.loc[index, 'HARVEST DATE']).strftime('%d/%m/%Y'), '%d/%m/%Y')
    if current_date >= date_check:
        df.loc[index, 'check'] = 'H'
print(df)
This is the error.
Traceback (most recent call last):
File "/Users/kohjiayu/Desktop/PycharmProjects/towers/tower_add_harvest.py", line 21, in
date_check = time.strptime(pd.to_datetime(df.loc[index, 'HARVEST DATE']).strftime('%d/%m/%Y'), '%d/%m/%Y')
File "/Users/kohjiayu/.conda/envs/towers/lib/python3.9/site-packages/pandas/core/generic.py", line 5465, in getattr
return object.getattribute(self, name)
AttributeError: 'Series' object has no attribute 'strftime'
I would like to add a new column which indicates whether my product has expired, needs to be cleared today, or is not due yet, but I have no idea how to read these dates from the csv and add the indications. I've tried, but there is an error. Please help, I'm new to Python.
import time
df = pd.read_csv(file)
df.loc[(df['manufacture date'] == time.strftime('%Y-%m-%d')), 'expired/ due today/ not yet'] = 'H'
df.loc[(df['manufacture date'] < time.strftime('%Y-%m-%d')),'expired/ due today/ not yet'] = 'O'
df.loc[(df['manufacture date'] > time.strftime('%Y-%m-%d')),'expired/ due today/ not yet'] = 'N'
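One caveat on the snippet above: time.strftime('%Y-%m-%d') produces a string, so those comparisons only behave as intended if the date column is also a zero-padded YYYY-MM-DD string. A hedged sketch that compares real datetimes instead and writes the O/H/N flags in one pass (the column name 'HARVEST DATE' is taken from the code above; the np.select approach is an assumption, not part of the original attempt):
import numpy as np
import pandas as pd
today = pd.Timestamp.today().normalize()      # today's date at midnight
harvest = pd.to_datetime(df['HARVEST DATE'])  # make sure the column is datetime64
df['check'] = np.select(
    [harvest < today, harvest == today],      # O: overdue, H: needs to be cleared today
    ['O', 'H'],
    default='N',                              # N: not due yet
)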

Parse Timestamp String with Quarter to Python datetime

I am searching for a way to parse a rather unusual timestamp string into a Python datetime object. The problem is that this string includes the corresponding quarter, which does not seem to be supported by the datetime.strptime function. The format of the string is YYYY/qq/mm/dd/HH/MM, e.g. 1970/Q1/01/01/00/00. I am looking for a function which allows me to parse strings in such a format, including a validity check that the quarter is correct for the date.
Question: Datetime String with Quarter to Python datetime
This implements an OOP solution which extends Python datetime with a new directive: %Q.
Possible values: Q1|Q2|Q3|Q4, for example:
data_string = '1970/Q1/01/01/00/00'
# '%Y/%Q/%m/%d/%H/%M'
Note: This depends on the internal module _strptime and its TimeRE class, and may fail if the internal implementation changes!
from datetime import datetime
class Qdatetime(datetime):
    re_compile = None

    @classmethod
    def _strptime(cls):
        import _strptime
        _class = _strptime.TimeRE
        if not 'strptime_compile' in _class.__dict__:
            setattr(_class, 'strptime_compile', getattr(_class, 'compile'))
            setattr(_class, 'compile', cls.compile)

    def compile(self, format):
        import _strptime
        self = _strptime._TimeRE_cache
        # Add directive %Q
        if not 'Q' in self:
            self.update({'Q': r"(?P<Q>Q[1-4])"})
        Qdatetime.re_compile = self.strptime_compile(format)
        return Qdatetime.re_compile

    def validate(self, quarter):
        # 1970, 1, 1 is the lowest date used in timestamp
        month = [1, 4, 7, 10][quarter - 1]
        day = [31, 30, 30, 31][quarter - 1]
        q_start = datetime(self.year, month, 1).timestamp()
        q_end = datetime(self.year, month + 2, day).timestamp()
        dtt = self.timestamp()
        return dtt >= q_start and dtt <= q_end

    @property
    def quarter(self): return self._quarter

    @quarter.setter
    def quarter(self, data):
        found_dict = Qdatetime.re_compile.match(data).groupdict()
        self._quarter = int(found_dict['Q'][1])

    @property
    def datetime(self):
        return datetime(self.year, self.month, self.day,
                        hour=self.hour, minute=self.minute, second=self.second)

    def __str__(self):
        return 'Q{} {}'.format(self.quarter, super().__str__())

    @classmethod
    def strptime(cls, data_string, _format):
        cls._strptime()
        dt = super().strptime(data_string, _format)
        dt.quarter = data_string
        if not dt.validate(dt.quarter):
            raise ValueError("time data '{}' does not match quarter 'Q{}'"
                             .format(data_string, dt.quarter))
        return dt
Usage:
for data_string in ['1970/Q1/01/01/00/00',
                    '1970/Q3/12/31/00/00',
                    '1970/Q2/05/05/00/00',
                    '1970/Q3/07/01/00/00',
                    '1970/Q4/12/31/00/00',
                    ]:
    try:
        d = Qdatetime.strptime(data_string, '%Y/%Q/%m/%d/%H/%M')
    except ValueError as e:
        print(e)
    else:
        print(d, d.datetime)
Output:
Q1 1970-01-01 00:00:00 1970-01-01 00:00:00
time data '1970/Q3/12/31/00/00' does not match quarter 'Q3'
Q2 1970-05-05 00:00:00 1970-05-05 00:00:00
Q3 1970-07-01 00:00:00 1970-07-01 00:00:00
Q4 1970-12-31 00:00:00 1970-12-31 00:00:00
Tested with Python: 3.6 - verified with Python 3.8 source
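If you do not need a real %Q directive, a much simpler sketch (my own suggestion, assuming the fixed YYYY/Qn/mm/dd/HH/MM layout) is to split the quarter out of the string, parse the remainder with strptime, and validate the quarter against the parsed month:
from datetime import datetime
def parse_with_quarter(s: str) -> datetime:
    parts = s.split('/')
    quarter = int(parts[1][1])  # 'Q1' -> 1
    dt = datetime.strptime('/'.join(parts[:1] + parts[2:]), '%Y/%m/%d/%H/%M')
    if (dt.month - 1) // 3 + 1 != quarter:
        raise ValueError(f"time data {s!r} does not match quarter Q{quarter}")
    return dt
print(parse_with_quarter('1970/Q1/01/01/00/00'))  # 1970-01-01 00:00:00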
