How to compare date from csv(string) to actual date - python

filenameA ="ApptA.csv"
filenameAc = "CheckoutA.csv"
def checkouttenantA():
global filenameA
global filenameAc
import csv
import datetime
with open(filenameA, 'r') as inp, open(filenameAc, 'a' , newline = "") as out:
my_writer = csv.writer(out)
for row in csv.reader(inp):
my_date= datetime.date.today()
string_date = my_date.strftime("%d/%m/%Y")
if row[5] <= string_date:
my_writer.writerow(row)
Dates are saved in format %d/%m/%Y in an excel file on column [5]. I am trying to compare dates in csv file with actual date, but it is only comparing the %d part. I assume it is because dates are in string format.

Ok so there are a few improvements to make as well, which I'll put as an edit to this, but you're converting todays date to a string with strftime() and comparing the two strings, you should be converting the string date from the csv file to a datetime object and comparing those instead.
I'll add plenty of comments to try and explain the code and the reasoning behind it.
# imports should go at the top
import csv
# notice we are importing datetime from datetime (we are importing the `datetime` type from the module datetime
import from datetime import datetime
# try to avoid globals where possible (they're not needed here)
def check_dates_in_csv(input_filepath):
''' function to load csv file and compare dates to todays date'''
# create a list to store the rows which meet our criteria
# appending the rows to this will make a list of lists (nested list)
output_data = []
# get todays date before loop to avoid calling now() every line
# we only need this once and it'll slow the loop down calling it every row
todays_date = datetime.now()
# open your csv here using the function argument
with open(input_filepath, output_filepath) as csv_file:
reader = csv.reader(csv_file)
# iterate over the rows and grab the date in each row
for row in reader:
string_date = row[5]
# convert the string to a datetime object
csv_date = datetime.strptime(string_date, '%d/%m/%Y')
# compare the dates and append if it meets the criteria
if csv_date <= todays_date:
output_data.append(row)
# function should only do one thing, compare the dates
# save the output after
return output_data
# then run the script here
# this comparison is basically the entry point of the python program
# this answer explains it better than I could: https://stackoverflow.com/questions/419163/what-does-if-name-main-do
if __name__ == "__main__":
# use our new function to get the output data
output_data = check_dates_in_csv("input_file.csv")
# save the data here
with open("output.csv", "w") as output_file:
writer = csv.writer(output_file)
writer.writerows(output_data)

I would recommend to use Pandas for such tasks:
import pandas as pd
filenameA ="ApptA.csv"
filenameAc = "CheckoutA.csv"
today = pd.datetime.today()
df = pd.read_csv(filenameA, parse_dates=[5])
df.loc[df.iloc[:, 5] <= today].to_csv(filenameAc, index=False)

Related

Combine date and time from two separate columns in a csvreader generator object

I have a CSV file like this:
2021-08-09 15:50:44 38962 part-00000-6baa0883-5212-49f7-9ba2-63a352211fdd-c000.snappy.parquet
2021-08-09 16:50:44 38962 part-00000-6baa0883-5212-49f7-9ba2-63a352211fdd-c000.snappy.parquet
I'd like to extract all the timestamps into one list so that I can perform the evaluation function below (ie evaluating if check_timestamps_updated is true).
Problem is also taking the date into account, not just the time. What's the most efficient way of combining the two separate columns (date and time) from the csvreader object so that it can be compared with control_time?
from datetime import datetime as dt
control_time = str(str(dt.now()))
reader = csv.reader(results, delimiter=" ")
time_column = list(zip(*reader))[1]
check_timestamps_updated = all(i >= control_time for i in time_column)
As far as I understand what you want to do can be implemented as below,
import csv
from datetime import datetime as dt
check_timestamps_updated = True
control_time = dt.now().timestamp()
with open('example.csv', newline='\n') as f:
reader = csv.reader(f, delimiter=" ")
for line in reader:
date = dt.strptime(f'{line[0]} {line[1]}', '%Y-%m-%d %H:%M:%S').timestamp()
if date >= control_time:
check_timestamps_updated = False
print(check_timestamps_updated)
You asked the most efficient way to merge two columns but I think it depends on what you are mentioning as efficiency. If the csv file is too big and if there is a chance to have a memory problem what I implemented above can work without an issue. But if you are mentioning time this is still a good one.

Determine date columns in a csv file with python

I have simple *.csv file where some of the columns are dates of the format mm/dd/yy. Here is an example:
$ cat somefile.csv
05/09/15,8,Apple,05/09/15
06/10/15,5,Banana,06/10/12
05/11/18,4,Carrot,09/03/18
02/09/15,2,Apple,01/09/15
I want to easily determine if a column only contains valid dates,
but I find myself struggling with counting '/' and counting characters. Surely there is some simple way of doing it right?
EDIT (Answer from #RahulAgarwal)
Here's my script (which still doesn't work :(( )
###########
# IMPORTS #
###########
import csv
import sys
import numpy
from dateutil.parser import parse
###########################
# [1] Open input csv file #
###########################
myfile=open("input4.csv","r")
myreader = csv.reader(myfile)
############################
# [2] read header csv file #
############################
for myline in myreader:
myheader=myline
break
####################################################################
# [3] read and put in ds only data originating in specific columns #
####################################################################
for myline in myreader:
for myColIndex in range(len(myline)):
if (parse(myline[myColIndex])):
print("column = {0}".format(myColIndex))
######################
# [4] Close csv file #
######################
myfile.close()
You can try below to check for valid dates:
from dateutil.parser import parse
parse("05/09/15")
You can use a set to keep track of columns seen in the file and a set of columns that didn't parse successfully as a valid date, then the difference between those two is columns that did parse as date, eg:
import csv
from datetime import datetime
with open('yourfile.csv') as fin:
seen_columns = set()
invalid_columns = set()
for row in csv.reader(fin):
for colno, col in enumerate(row, 1):
# We've seen it contains a non-date - don't try and parse it again
if colno in invalid_columns:
continue
# Make a note we've seen column N
seen_columns.add(colno)
# Try and see if we can parse it to the desired date format
try:
datetime.strptime(col, '%m/%d/%y')
# Nope - we couldn't... not a date - so don't both checking again
except ValueError:
invalid_columns.add(colno)
# Columns containing dates are those we've seen that
# didn't fail to parse as a date...
valid_columns = seen_columns - invalid_columns
You could use the strptime method of the datetime object:
from datetime import datetime
def isDateValid(date, pattern = "%d/%m/%y"):
try:
datetime.strptime(date, pattern)
return True
except ValueError:
return False
The strptime method raises a ValueError if the string doesn't match the pattern.
EDIT:
to let this work:
from datetime import datetime
def isDateValid(date, pattern = "%d/%m/%y"):
try:
datetime.strptime(date, pattern)
return True
except ValueError:
return False
# load file
with open("filename.csv") as f:
# split file into lines
lines = f.readlines()
# replace new-line character
lines = [x.replace("\n", "") for x in lines]
# extract the header
header = lines[0]
# extract rows
rows = lines[1:]
# loop over every row
for rowNumber, row in enumerate(rows, 1):
# split row into the seperate columns
columns = line.split(",")
# setting default value for every row
gotValidDate = False
# loop over every column
for column in columns:
# check if the column got a valid date
if isDateValid(column):
gotValidDate = True
# if at least one out of all columns in that row got a valid date
# the row number gets printed
if gotValidDate:
print(f"Row {rowNumber} got at least one valid date")
(Code is written in Python 3.7)

how to remove DD in YYYYMMDD in python

I need to remove the day in date and I tried to use datetime.strftime and datetime.strptime but it couldn't work. I need to create a tuple of 2 items(date,price) from a nested list but I need to change the date format first.
here's part of the code:
def get_data(my_csv):
with open("my_csv.csv", "r") as csv_file:
csv_reader = csv.reader(csv_file, delimiter = (','))
next(csv_reader)
data = []
for line in csv_reader:
data.append(line)
return data
def get_monthly_avg(data):
oldformat = '20040819'
datetimeobject = datetime.strptime(oldformat,'%y%m%d')
newformat = datetime.strftime('%y%m ')
You miss print with date formats. 'Y' has to be capitalized.
from datetime import datetime
# use datetime to convert
def strip_date(data):
d = datetime.strptime(data,'%Y%m%d')
return datetime.strftime(d,'%Y%m')
data = '20110513'
print (strip_date(data))
# or just cut off day (2 last symbols) from date string
print (data[:6])
The first variant is better because you can verify that string is in proper date format.
Output:
201105
201105
You didnt specify any code, but this might work:
date = functionThatGetsDate()
date = date[0:6]

Python - Read only the time from a .csv Datetime string column, then convert time to UTC

In a nutshell, I have a program that opens a .csv file, reads the .csv file, and then merges a column with datetime string data to a new .csv file. However, before the program merges the column to the new file, I first need to read only the time from the datetime string, then convert the time to UTC and then merge it to the new .csv file.
Since the data is being stored in the .csv file, and when retrieved it comes out as a string like:
"1/28/2016 3:52:49 PM"
How do I read only the 3:52:49 and make it 35249, then convert it to UTC time, before storing the time as a new column in the new .csv file?
In case you need my code:
import os
import csv
import datetime as dt
from os import listdir
from os.path import join
import matplotlib.pyplot as plt
#get the list of files in mypath and store in a list
mypath = 'C:/Users/Alan Cedeno/Desktop/Test_Folder/'
onlycsv = [f for f in listdir(mypath) if '.csv' in f]
#print out all the files with it's corresponding index
for i in range(len(onlycsv)):
print(i,onlycsv[i])
#prompt the user to select the files
option = input('please select file1 by number: ')
option2 = input('please select file2 by number: ')
#build out the full paths of the files and open them
fullpath1 = join(mypath, onlycsv[option])
fullpath2 = join(mypath, onlycsv[option2])
#create third new.csv file
root, ext = os.path.splitext(fullpath2)
output = root + '-new.csv'
with open(fullpath1) as r1, open(fullpath2) as r2, open(output, 'a') as w:
writer = csv.writer(w)
merge_from = csv.reader(r1)
merge_to = csv.reader(r2)
# skip 3 lines of headers
for _ in range(3):
next(merge_from)
for _ in range(1):
next(merge_to)
for merge_from_row, merge_to_row in zip(merge_from, merge_to):
# insert from col 0 as to col 0
merge_to_row.insert(1, merge_from_row[2])
# replace from col 1 with to col 3
#merge_to_row[0] = merge_from_row[2]
# delete merge_to rows 5,6,7 completely
#del merge_to_row[5:8]
writer.writerow(merge_to_row)
The datetime library is what you're looking for: https://docs.python.org/2/library/datetime.html
>>> from datetime import datetime
>>> dt = datetime.strptime("21/11/06 16:30", "%d/%m/%y %H:%M")
>>> dt
datetime.datetime(2006, 11, 21, 16, 30)
use dt.strftime(format) to put the date into the format you need
-- below is the solution to your question, above should hopefully link you to some resources if you need to do some other date manipulations
How do I convert local time to UTC in Python?
>>> def local_to_utc(t):
... secs = time.mktime(t)
... return time.gmtime(secs)
>>> a = local_to_utc(dt.timetuple())
we take the result, and pass it back, and dump it out in the desired format
>>> datetime.fromtimestamp(time.mktime(a)).strftime("%H:%m:%S")

Remove specific rows from CSV file in python

I am trying to remove rows with a specific ID within particular dates from a large CSV file.
The CSV file contains a column [3] with dates formatted like "1962-05-23" and a column with identifiers [2]: "ddd:011232700:mpeg21:a00191".
Within the following date range:
01-01-1951 to 12-31-1951
07-01-1962 to 12-31-1962
01-01 to 09-30-1963
7-01 to 07-31-1965
10-01 to 10-31-1965
04-01-1966 to 11-30-1966
01-01-1969 to 12-31-1969
01-01-1970 to 12-31-1989
I want to remove rows that contain the ID ddd:11*
I think I have to create a variable that contains both the date range and the ID. And look for these in every row, but I'm very new to python so I'm not sure what would be an eloquent way to do this.
This is what I have now. -CODE UPDATED
import csv
import collections
import sys
import re
from datetime import datetime
csv.field_size_limit(sys.maxsize)
dateranges = [("01-01-1951","12-31-1951"),("07-01-1962","12-31-1962")]
dateranges = list(map(lambda dr: tuple(map(lambda x: datetime.strptime(x,"%m-%d-%Y"),dr)),dateranges))
def datefilter(x):
x = datetime.strptime(x,"%Y-%m-%d")
for r in dateranges:
if r[0]<=x and r[1]>=x: return True
return False
writer = csv.writer(open('filtered.csv', 'wb'))
for row in csv.reader('my_file.csv', delimiter='\t'):
if datefilter(row[3]):
if not row[2].startswith("dd:111"):
writer.writerow(row)
else:
writer.writerow(row)
writer.close()
I'd recommend using pandas: it's great for filtering tables. Nice and readable.
import pandas as pd
# assumes the csv contains a header, and the 2 columns of interest are labeled "mydate" and "identifier"
# Note that "date" is a pandas keyword so not wise to use for column names
df = pd.read_csv(inputFilename, parse_dates=[2]) # assumes mydate column is the 3rd column (0-based)
df = df[~df.identifier.str.contains('ddd:11')] # filters out all rows with 'ddd:11' in the 'identifier' column
# then filter out anything not inside the specified date ranges:
df = df[((pd.to_datetime("1951-01-01") <= df.mydate) & (df.mydate <= pd.to_datetime("1951-12-31"))) |
((pd.to_datetime("1962-07-01") <= df.mydate) & (df.mydate <= pd.to_datetime("1962-12-31")))]
df.to_csv(outputFilename)
See Pandas Boolean Indexing
Here is how I would approach that, but it may not be the best method.
from datetime import datetime
dateranges = [("01-01-1951","12-31-1951"),("07-01-1962","12-31-1962")]
dateranges = list(map(lambda dr: tuple(map(lambda x: datetime.strptime(x,"%m-%d-%Y"),dr)),dateranges))
def datefilter(x):
# The date format is different here to match the format of the csv
x = datetime.strptime(x,"%Y-%m-%d")
for r in dateranges:
if r[0]<=x and r[1]>=x: return True
return False
with open(main_file, "rb") as fp:
root = csv.reader(fp, delimiter='\t')
result = collections.defaultdict(list)
for row in root:
if datefilter(row[3]):
# use a regular expression or any other means to filter on id here
if row[2].startswith("dd:111"): #code to remove item
What I have done is create a list of tuples of your date ranges (for brevity, I only put 2 ranges in it), and then I convert those into datetime objects.
I have used maps for doing that in one line: first loop over all tuples in that list, applying a function which loops over all entries in that tuple and converts to a date time, using the tuple and list functions to get back to the original structure. Doing it the long way would look like:
dateranges2=[]
for dr in dateranges:
dateranges2.append((datetime.strptime(dr[0],"%m-%d-%Y"),datetime.strptime(dr[1],"%m-%d-%Y"))
dateranges = dateranges2
Notice that I just convert each item in the tuple into a datetime, and add the tuples to the new list, replacing the original (which I don't need anymore).
Next, I create a datefilter function which takes a datestring, converts it to a datetime, and then loops over all the ranges, checking if the value is in the range. If it is, we return True (indicating this item should be filtered), otherwise return False if we have checking all ranges with no match (indicating that we don't filter this item).
Now you can check out the id using any method that you want once the date has matched, and remove the item if desired. As your example is constant in the first few characters, we can just use the string startswith function to check the id. If it is more complex, we could use a regex.
My kinda approach workds like this -
import csv
import re
import datetime
field_id = 'ddd:11'
d1 = datetime.date(1951,1,01) #change the start date
d2 = datetime.date(1951,12,31) #change the end date
diff = d2 - d1
date_list = []
for i in range(diff.days + 1):
date_list.append((d1 + datetime.timedelta(i)).isoformat())
with open('mwevers_example_2016.01.02-07.25.55.csv','rb') as csv_file:
reader = csv.reader(csv_file)
for row in reader:
for date in date_list:
if row[3] == date:
print row
var = re.search('\\b'+field_id,row[2])
if bool(var) == True:
print 'olalala'#here you can make a function to copy those rows into another file or any list
import csv
import sys
import re
from datetime import datetime
csv.field_size_limit(sys.maxsize)
field_id = 'ddd:11'
dateranges = [("1951-01-01", "1951-12-31"),
("1962-07-01", "1962-12-31"),
("1963-01-01", "1963-09-30"),
("1965-07-01", "1965-07-30"),
("1965-10-01", "1965-10-31"),
("1966-04-01", "1966-11-30"),
("1969-01-01", "1989-12-31")
]
dateranges = list(map(lambda dr:
tuple(map(lambda x:
datetime.strptime(x, "%Y-%m-%d"), dr)),
dateranges))
def datefilter(x):
x = datetime.strptime(x, "%Y-%m-%d")
for r in dateranges:
if r[0] <= x and r[1] >= x:
return True
return False
output = []
with open('my_file.csv', 'r') as f:
reader = csv.reader(f, delimiter='\t', quotechar='"')
next(reader)
for row in reader:
if datefilter(row[4]):
var = re.search('\\b'+field_id, row[3])
if bool(var) == False:
output.append(row)
else:
output.append(row)
with open('output.csv', 'w') as outputfile:
writer = csv.writer(outputfile, delimiter='\t', quotechar='"')
writer.writerows(output)

Categories

Resources