I have a simple *.csv file where some of the columns are dates in the format mm/dd/yy. Here is an example:
$ cat somefile.csv
05/09/15,8,Apple,05/09/15
06/10/15,5,Banana,06/10/12
05/11/18,4,Carrot,09/03/18
02/09/15,2,Apple,01/09/15
I want to easily determine if a column only contains valid dates,
but I find myself struggling with counting '/' characters and counting string lengths. Surely there is a simple way of doing it right?
EDIT (answer from @RahulAgarwal)
Here's my script (which still doesn't work :(( )
###########
# IMPORTS #
###########
import csv
import sys
import numpy
from dateutil.parser import parse

###########################
# [1] Open input csv file #
###########################
myfile = open("input4.csv", "r")
myreader = csv.reader(myfile)

############################
# [2] read header csv file #
############################
for myline in myreader:
    myheader = myline
    break

####################################################################
# [3] read and put in ds only data originating in specific columns #
####################################################################
for myline in myreader:
    for myColIndex in range(len(myline)):
        if (parse(myline[myColIndex])):
            print("column = {0}".format(myColIndex))

######################
# [4] Close csv file #
######################
myfile.close()
You can try below to check for valid dates:
from dateutil.parser import parse
parse("05/09/15")
You can use a set to keep track of the columns seen in the file and a set of the columns that didn't parse successfully as a valid date; the difference between those two sets is the columns that did parse as dates, e.g.:
import csv
from datetime import datetime

with open('yourfile.csv') as fin:
    seen_columns = set()
    invalid_columns = set()
    for row in csv.reader(fin):
        for colno, col in enumerate(row, 1):
            # We've seen it contains a non-date - don't try and parse it again
            if colno in invalid_columns:
                continue
            # Make a note we've seen column N
            seen_columns.add(colno)
            # Try and see if we can parse it to the desired date format
            try:
                datetime.strptime(col, '%m/%d/%y')
            # Nope - we couldn't... not a date - so don't bother checking again
            except ValueError:
                invalid_columns.add(colno)

# Columns containing dates are those we've seen that
# didn't fail to parse as a date...
valid_columns = seen_columns - invalid_columns
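For the sample file at the top, every value in column 2 (8, 5, 4, 2) and column 3 (Apple, Banana, ...) fails to parse with '%m/%d/%y', so valid_columns ends up as {1, 4}.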
You could use the strptime method of the datetime object:
from datetime import datetime

def isDateValid(date, pattern="%m/%d/%y"):
    try:
        datetime.strptime(date, pattern)
        return True
    except ValueError:
        return False
The strptime method raises a ValueError if the string doesn't match the pattern.
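For example, with the mm/dd/yy data from the question:

isDateValid("05/09/15")    # True
isDateValid("Apple")       # False
isDateValid("05/09/2015")  # False - a four-digit year doesn't match %y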
EDIT: to make this work for your whole file:
from datetime import datetime

def isDateValid(date, pattern="%m/%d/%y"):
    try:
        datetime.strptime(date, pattern)
        return True
    except ValueError:
        return False

# load file
with open("filename.csv") as f:
    # split file into lines
    lines = f.readlines()

# replace new-line character
lines = [x.replace("\n", "") for x in lines]
# extract the header
header = lines[0]
# extract rows
rows = lines[1:]

# loop over every row
for rowNumber, row in enumerate(rows, 1):
    # split row into the separate columns
    columns = row.split(",")
    # setting default value for every row
    gotValidDate = False
    # loop over every column
    for column in columns:
        # check if the column got a valid date
        if isDateValid(column):
            gotValidDate = True
    # if at least one out of all columns in that row got a valid date
    # the row number gets printed
    if gotValidDate:
        print(f"Row {rowNumber} got at least one valid date")
(Code is written in Python 3.7)
Related
I am working on converting a BLF file into a tab-separated file. I am able to extract all the useful information from the file into a list, as shown below. I want to calculate the difference between consecutive timestamp values in one column. Please find my code so far:
import can
import csv
import datetime
import pandas as pd

filename = open('C:\\Users\\shraddhasrivastav\\Downloads\\BLF File\\output.csv', "w")
log = can.BLFReader('C:\\Users\\shraddhasrivastav\\Downloads\\BLF File\\test.blf')
# print ("We are here!")
log_output = []
for msg in log:
    msg = str(msg).split()
    # print (msg)
    data_list = msg[7:(7 + int(msg[6]))]
    log_output_entry = [(msg[1]), msg[3], msg[6], " ".join(data_list), msg[-1]]
    log_output_entry.insert(1, 'ID=')
    test_entry = " \t ".join(log_output_entry)  # join the list and remove string quotes in the csv file
    filename.write(test_entry + '\n')

df = pd.DataFrame(log_output)
df.columns = ['Timestamp', 'ID', 'DLC', 'Channel']
filename.close()  # Close the file outside the loop
The output I am getting so far is below:
Under my first column, I want the difference between consecutive timestamp values (e.g. the 2nd row's timestamp minus the 1st row's, the 3rd row's minus the 2nd row's, and so on). What should I add to my code to achieve this? Below is a screenshot of how I want my file's Timestamp field to look (calculating the difference between consecutive rows).
(screenshot: desired Timestamp column showing consecutive differences)
You can use pandas.DataFrame.shift:
df['Time Delta'] = df['Timestamp'] - df['Timestamp'].shift(periods=1, axis=0)
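Equivalently, Series.diff() computes the same consecutive-row difference in one call:

df['Time Delta'] = df['Timestamp'].diff()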
Keep in mind that the text file you currently write seems to have a variable number of columns per row, so it might be hard to load it directly into pandas. Maybe the following would work:
import can
import csv
import datetime
import pandas as pd

# filename = open('C:\\Users\\shraddhasrivastav\\Downloads\\BLF File\\output.csv', "w")
log = can.BLFReader('C:\\Users\\shraddhasrivastav\\Downloads\\BLF File\\test.blf')
# print ("We are here!")
log_output = []
for msg in log:
    msg = str(msg).split()
    # print (msg)
    data_list = msg[7:(7 + int(msg[6]))]
    # five fields per entry: timestamp, id, dlc, data bytes, channel
    log_output_entry = [msg[1], msg[3], msg[6], " ".join(data_list), msg[-1]]
    assert len(log_output_entry) == 5
    log_output.append(log_output_entry)
    # test_entry = " \t ".join(log_output_entry)  # join the list and remove string quotes in the csv file
    # filename.write(test_entry + '\n')

df = pd.DataFrame(log_output)
df.columns = ['Timestamp', 'ID', 'DLC', 'Data', 'Channel']
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
df['Time Delta'] = df['Timestamp'] - df['Timestamp'].shift(periods=1, axis=0)
df.to_csv('C:\\Users\\shraddhasrivastav\\Downloads\\BLF File\\output_df.csv')
# filename.close()  # Close the file outside the loop
filenameA = "ApptA.csv"
filenameAc = "CheckoutA.csv"

def checkouttenantA():
    global filenameA
    global filenameAc
    import csv
    import datetime
    with open(filenameA, 'r') as inp, open(filenameAc, 'a', newline="") as out:
        my_writer = csv.writer(out)
        for row in csv.reader(inp):
            my_date = datetime.date.today()
            string_date = my_date.strftime("%d/%m/%Y")
            if row[5] <= string_date:
                my_writer.writerow(row)
Dates are saved in the format %d/%m/%Y in a CSV file in column [5]. I am trying to compare the dates in the file with today's date, but the comparison only seems to consider the %d part. I assume this is because the dates are in string format.
OK, so there are a few improvements to make as well, which I'll put as an edit to this, but the core issue is that you're converting today's date to a string with strftime() and comparing the two strings. You should instead convert the string date from the CSV file to a datetime object and compare those.
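A quick illustration of why comparing the strings misbehaves:

from datetime import datetime

# lexicographic string comparison goes character by character,
# so the day digits dominate regardless of month and year
"01/12/2021" < "02/01/2020"   # True - wrong for dates

# comparing datetime objects gives the correct answer
datetime.strptime("01/12/2021", "%d/%m/%Y") < datetime.strptime("02/01/2020", "%d/%m/%Y")   # False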
I'll add plenty of comments to try and explain the code and the reasoning behind it.
# imports should go at the top
import csv
# notice we are importing datetime from datetime (we are importing the `datetime` type from the module datetime)
from datetime import datetime

# try to avoid globals where possible (they're not needed here)
def check_dates_in_csv(input_filepath):
    ''' function to load csv file and compare dates to todays date'''
    # create a list to store the rows which meet our criteria
    # appending the rows to this will make a list of lists (nested list)
    output_data = []
    # get todays date before loop to avoid calling now() every line
    # we only need this once and it'll slow the loop down calling it every row
    todays_date = datetime.now()
    # open your csv here using the function argument
    with open(input_filepath) as csv_file:
        reader = csv.reader(csv_file)
        # iterate over the rows and grab the date in each row
        for row in reader:
            string_date = row[5]
            # convert the string to a datetime object
            csv_date = datetime.strptime(string_date, '%d/%m/%Y')
            # compare the dates and append if it meets the criteria
            if csv_date <= todays_date:
                output_data.append(row)
    # function should only do one thing, compare the dates
    # save the output after
    return output_data

# then run the script here
# this comparison is basically the entry point of the python program
# this answer explains it better than I could: https://stackoverflow.com/questions/419163/what-does-if-name-main-do
if __name__ == "__main__":
    # use our new function to get the output data
    output_data = check_dates_in_csv("input_file.csv")
    # save the data here
    with open("output.csv", "w") as output_file:
        writer = csv.writer(output_file)
        writer.writerows(output_data)
I would recommend using Pandas for such tasks:
import pandas as pd

filenameA = "ApptA.csv"
filenameAc = "CheckoutA.csv"

today = pd.Timestamp.today()
df = pd.read_csv(filenameA, dayfirst=True, parse_dates=[5])
df.loc[df.iloc[:, 5] <= today].to_csv(filenameAc, index=False)
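Since the dates are day-first (%d/%m/%Y), parse_dates on its own may read them month-first; the dayfirst=True flag above guards against that. An equivalent sketch converts the column explicitly after loading:

df.iloc[:, 5] = pd.to_datetime(df.iloc[:, 5], format="%d/%m/%Y")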
Below is some code written to open a CSV file. Its values are stored like this:
03/05/2017 09:40:19,21.2,35.0
03/05/2017 09:40:27,21.2,35.0
03/05/2017 09:40:38,21.1,35.0
03/05/2017 09:40:48,21.1,35.0
This is just a snippet of code I use in a real-time plotting program, which fully works, but the arrays getting so big is unclean. Normally, new values get added to the CSV while the program is running, and the arrays get very long. Is there a way to avoid exploding arrays like this?
If you run the program (you will have to make a CSV with those values too), you will see my problem.
from datetime import datetime
import time

y = []  # temperature
t = []  # time object
h = []  # humidity

def readfile():
    readFile = open('document.csv', 'r')
    sepFile = readFile.read().split('\n')
    readFile.close()
    for idx, plotPair in enumerate(sepFile):
        if plotPair in '. ':
            # skip '.' or space
            continue
        if idx > 1:  # to skip the first line
            xAndY = plotPair.split(',')
            time_string = xAndY[0]
            time_string1 = datetime.strptime(time_string, '%d/%m/%Y %H:%M:%S')
            t.append(time_string1)
            y.append(float(xAndY[1]))
            h.append(float(xAndY[2]))
    print([y])

while True:
    readfile()
    time.sleep(2)
This is the output I get:
[[21.1]]
[[21.1, 21.1]]
[[21.1, 21.1, 21.1]]
[[21.1, 21.1, 21.1, 21.1]]
[[21.1, 21.1, 21.1, 21.1, 21.1]]
Any help is appreciated.
You can use Python's deque if you also want to limit the total number of entries you wish to keep. It produces a list which features a maximum length. Once the list is full, any new entries push the oldest entry off the start.
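A quick illustration of the maxlen behaviour:

from collections import deque

d = deque(maxlen=3)
for i in range(5):
    d.append(i)

print(d)  # deque([2, 3, 4], maxlen=3) - the oldest entries fell off the front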
The reason your list is growing is that you need to re-read your file up to the point of your last entry before continuing to add new entries. Assuming your timestamps are unique, you can use takewhile() to do this; it reads entries until a condition is met.
from itertools import takewhile
from collections import deque
from datetime import datetime
import csv
import time

max_length = 1000  # keep this many entries
t = deque(maxlen=max_length)  # time object
y = deque(maxlen=max_length)  # temperature
h = deque(maxlen=max_length)  # humidity

def read_file():
    with open('document.csv', newline='') as f_input:
        csv_input = csv.reader(f_input)
        header = next(csv_input)  # skip over the header line
        # If there are existing entries, read until the last read item is found again
        if len(t):
            list(takewhile(lambda row: datetime.strptime(row[0], '%d/%m/%Y %H:%M:%S') != t[-1], csv_input))
        for row in csv_input:
            print(row)
            t.append(datetime.strptime(row[0], '%d/%m/%Y %H:%M:%S'))
            y.append(float(row[1]))
            h.append(float(row[2]))

while True:
    read_file()
    print(t)
    time.sleep(1)
Also, it is easier to work with the entries using Python's built-in csv library, which reads each row's values into a list. As you have a header row, read it in using next() before starting the loop.
I've written a script that takes a large Excel spreadsheet, strips away unwanted columns and rows that contain zero values in particular columns, and then saves the result out to a CSV. The piece I'm stuck on is that I'm also trying to remove rows that have missing cells. The way I was trying this was:
for each_row in row_list:
    if not all(map(len, each_row)):
        continue
    else:
        UICData.append(row_list)
But this isn't working correctly, as I'm getting the error:
File "/Users/kenmarold/PycharmProjects/sweetCrude/Work/sweetCrude.py", line 56, in PrepareRawData
    if not all(map(len, each_row)) :
TypeError: 'float' object is not iterable
I'm not exactly sure how to resolve this. What's the way forward? I've also attached the full script below.
#!/usr/bin/env python3
import os
import sqlite3
import csv
import unicodecsv
from datetime import date
from xlrd import open_workbook, xldate_as_tuple
from xlwt import Workbook

orig_xls = 'data/all_uic_wells_jun_2016.xls'
temp_xls = 'data/temp.xls'
new_csv = 'data/gh_ready_uic_well_data.csv'
temp_csv = 'data/temp.csv'
input_worksheet_index = 0  # XLS Sheet Number
output_workbook = Workbook()
output_worksheet = output_workbook.add_sheet('Sweet Crude')
lat_col_index = 13
long_col_index = 14

#### SELECT AND FORMAT DATA
def PrepareRawData(inputFile, tempXLSFile, tempCSVFile, outputFile):
    # 0 = API#          # 7 = Approval Date
    # 1 = Operator      # 13 = Latitude
    # 2 = Operator ID   # 14 = Longitude
    # 3 = Well Type     # 15 = Zone
    keep_columns = [0, 1, 2, 3, 7, 13, 14, 15]
    with open_workbook(inputFile) as rawUICData:
        UICSheet = rawUICData.sheet_by_index(input_worksheet_index)
        UICData = []
        for each_row_index in range(1, UICSheet.nrows - 1, 1):
            row_list = []
            lat_num = UICSheet.cell_value(each_row_index, lat_col_index)  # Get Lat Values
            long_num = UICSheet.cell_value(each_row_index, long_col_index)  # Get Long Values
            if lat_num != 0.0 and long_num != 0.0:  # Find Zero Lat/Long Values
                for each_column_index in keep_columns:
                    cell_value = UICSheet.cell_value(each_row_index, each_column_index)
                    cell_type = UICSheet.cell_type(each_row_index, each_column_index)
                    if cell_type == 3:
                        date_cell = xldate_as_tuple(cell_value, rawUICData.datemode)
                        date_cell = date(*date_cell[0:3]).strftime('%m/%d/%Y')
                        row_list.append(date_cell)
                    else:
                        row_list.append(cell_value)
                for each_row in row_list:
                    if not all(map(len, each_row)):
                        continue
                    else:
                        UICData.append(row_list)
                # CreateDB(row_list)  # Send row data to Database

    for each_list_index, output_list in enumerate(UICData):
        for each_element_index, element in enumerate(output_list):
            output_worksheet.write(each_list_index, each_element_index, element)
    output_workbook.save(tempXLSFile)

    #### RUN XLS-CSV CONVERSION
    workbook = open_workbook(tempXLSFile)
    sheet = workbook.sheet_by_index(input_worksheet_index)
    fh = open(outputFile, 'wb')
    csv_out = unicodecsv.writer(fh, encoding='utf-8')
    for each_row_number in range(sheet.nrows):
        csv_out.writerow(sheet.row_values(each_row_number))
    fh.close()

    #### KILL TEMP FILES
    filesToRemove = [tempXLSFile]
    for each_file in filesToRemove:
        os.remove(each_file)
    print("Raw Data Conversion Ready for Grasshopper")

# ---------------------------------------------------
PrepareRawData(orig_xls, temp_xls, temp_csv, new_csv)
# ---------------------------------------------------
This is a dirty patch.
for each_row in row_list:
    if not isinstance(each_row, list):
        each_row = [each_row]
    if not any(map(len, each_row)):
        continue
    UICData.append(row_list)
EDIT: If the any/map/len still raises, then I would try a different route to check whether the row is empty.
Also, I'm not sure why you are appending the entire row_list and not the current row; I changed it to appending each_row.
Option 1
for each_row in row_list:
    if not each_row:
        continue
    UICData.append(each_row)
Option 2
keep_data = [arow for arow in row_list if arow]  # Or w/e logic. This will be faster.
UICData.append(keep_data)
Your row_list contains a set of values, for example:
[1.01, 75, 3.56, ...]
When you call for each_row in row_list:, you assign a float value to each_row for every iteration of the loop.
You then try to do this:
if not all(map(len, each_row)):
Python's map function expects an iterable as the second argument, and tries to iterate over it to apply the function len to each item. You can't iterate a float.
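A minimal reproduction of the error:

list(map(len, 1.01))
# TypeError: 'float' object is not iterable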
I'm not entirely sure what you are trying to do here, but if you want to check that none of the items in your row_list are None or an empty string, then you could do:
if None not in row_list and '' not in row_list:
    UICData.append(row_list)
Your overall objective appears to be to copy selected columns from all rows of one sheet of an Excel XLS file to a CSV file. Each output row must contain only valid cells, for some definition of "valid".
As you have seen, using map() is not a good idea; it's only applicable if all the fields are text. You should apply tests depending generally on the datatype and specifically on the individual column.
Once you have validated the items in the row, you are in a position to output the data. You have chosen a path which (1) builds a list of all output rows (2) uses xlwt to write to a temp XLS file (3) uses xlrd to read the temp file and unicodecsv to write a CSV file. Please consider avoiding all that; instead just use unicodecsv.writer.writerow(row_list)
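A minimal sketch of that simplification (build_row here is a hypothetical stand-in for your existing column selection and validation logic):

import unicodecsv
from xlrd import open_workbook

book = open_workbook('data/all_uic_wells_jun_2016.xls')
sheet = book.sheet_by_index(0)
with open('data/gh_ready_uic_well_data.csv', 'wb') as fh:
    csv_out = unicodecsv.writer(fh, encoding='utf-8')
    for row_index in range(1, sheet.nrows):
        row_list = build_row(sheet, row_index)  # hypothetical: your column picking + validation
        if row_list is not None:
            csv_out.writerow(row_list)  # write straight to CSV - no temp XLS round-trip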
I am trying to remove rows with a specific ID within particular dates from a large CSV file.
The CSV file contains a column [3] with dates formatted like "1962-05-23" and a column with identifiers [2]: "ddd:011232700:mpeg21:a00191".
Within the following date range:
01-01-1951 to 12-31-1951
07-01-1962 to 12-31-1962
01-01-1963 to 09-30-1963
07-01-1965 to 07-31-1965
10-01-1965 to 10-31-1965
04-01-1966 to 11-30-1966
01-01-1969 to 12-31-1969
01-01-1970 to 12-31-1989
I want to remove rows that contain the ID ddd:11*
I think I have to create a variable that contains both the date range and the ID, and look for these in every row, but I'm very new to Python, so I'm not sure what would be an elegant way to do this.
This is what I have now (code updated):
import csv
import collections
import sys
import re
from datetime import datetime

csv.field_size_limit(sys.maxsize)

dateranges = [("01-01-1951", "12-31-1951"), ("07-01-1962", "12-31-1962")]
dateranges = list(map(lambda dr: tuple(map(lambda x: datetime.strptime(x, "%m-%d-%Y"), dr)), dateranges))

def datefilter(x):
    x = datetime.strptime(x, "%Y-%m-%d")
    for r in dateranges:
        if r[0] <= x and r[1] >= x:
            return True
    return False

writer = csv.writer(open('filtered.csv', 'wb'))
for row in csv.reader('my_file.csv', delimiter='\t'):
    if datefilter(row[3]):
        if not row[2].startswith("dd:111"):
            writer.writerow(row)
    else:
        writer.writerow(row)
writer.close()
I'd recommend using pandas: it's great for filtering tables. Nice and readable.
import pandas as pd
# assumes the csv contains a header, and the 2 columns of interest are labeled "mydate" and "identifier"
# Note that "date" is a pandas keyword so not wise to use for column names
df = pd.read_csv(inputFilename, parse_dates=[2]) # assumes mydate column is the 3rd column (0-based)
df = df[~df.identifier.str.contains('ddd:11')] # filters out all rows with 'ddd:11' in the 'identifier' column
# then filter out anything not inside the specified date ranges:
df = df[((pd.to_datetime("1951-01-01") <= df.mydate) & (df.mydate <= pd.to_datetime("1951-12-31"))) |
        ((pd.to_datetime("1962-07-01") <= df.mydate) & (df.mydate <= pd.to_datetime("1962-12-31")))]
df.to_csv(outputFilename)
See Pandas Boolean Indexing
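If you need all eight ranges from the question, you could build the date mask in a loop instead of writing each pair out. A sketch, assuming the same mydate column name as above:

import pandas as pd

date_ranges = [("1951-01-01", "1951-12-31"),
               ("1962-07-01", "1962-12-31")]  # ...add the remaining ranges here

in_range = pd.Series(False, index=df.index)
for start, end in date_ranges:
    in_range |= df.mydate.between(start, end)  # between() is inclusive on both ends

df = df[in_range]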
Here is how I would approach that, but it may not be the best method.
from datetime import datetime

dateranges = [("01-01-1951", "12-31-1951"), ("07-01-1962", "12-31-1962")]
dateranges = list(map(lambda dr: tuple(map(lambda x: datetime.strptime(x, "%m-%d-%Y"), dr)), dateranges))

def datefilter(x):
    # The date format is different here to match the format of the csv
    x = datetime.strptime(x, "%Y-%m-%d")
    for r in dateranges:
        if r[0] <= x and r[1] >= x:
            return True
    return False
with open(main_file, "rb") as fp:
    root = csv.reader(fp, delimiter='\t')
    result = collections.defaultdict(list)
    for row in root:
        if datefilter(row[3]):
            # use a regular expression or any other means to filter on id here
            if row[2].startswith("dd:111"):  # code to remove item
What I have done is create a list of tuples of your date ranges (for brevity, I only put 2 ranges in it), and then I convert those into datetime objects.
I have used maps to do that in one line: first loop over all tuples in that list, applying a function which loops over all entries in the tuple and converts each to a datetime, using the tuple and list functions to get back to the original structure. Doing it the long way would look like this:
dateranges2 = []
for dr in dateranges:
    dateranges2.append((datetime.strptime(dr[0], "%m-%d-%Y"), datetime.strptime(dr[1], "%m-%d-%Y")))
dateranges = dateranges2
Notice that I just convert each item in the tuple into a datetime, and add the tuples to the new list, replacing the original (which I don't need anymore).
Next, I create a datefilter function which takes a date string, converts it to a datetime, and then loops over all the ranges, checking if the value is in the range. If it is, we return True (indicating this item should be filtered); otherwise we return False once we have checked all ranges with no match (indicating that we don't filter this item).
Now you can check out the id using any method that you want once the date has matched, and remove the item if desired. As your example is constant in the first few characters, we can just use the string startswith function to check the id. If it is more complex, we could use a regex.
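For example, with the identifier from the question (both checks are equivalent for this simple prefix):

import re

identifier = "ddd:011232700:mpeg21:a00191"

# simple prefix check
identifier.startswith("ddd:11")        # False - this id starts with "ddd:01"

# regex alternative, useful if the pattern grows more complex
bool(re.match(r"ddd:11", identifier))  # False as well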
My kind of approach works like this:
import csv
import re
import datetime

field_id = 'ddd:11'
d1 = datetime.date(1951, 1, 1)    # change the start date
d2 = datetime.date(1951, 12, 31)  # change the end date
diff = d2 - d1

date_list = []
for i in range(diff.days + 1):
    date_list.append((d1 + datetime.timedelta(i)).isoformat())

with open('mwevers_example_2016.01.02-07.25.55.csv') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        for date in date_list:
            if row[3] == date:
                print(row)
                var = re.search('\\b' + field_id, row[2])
                if var:
                    print('olalala')  # here you can make a function to copy those rows into another file or any list
import csv
import sys
import re
from datetime import datetime

csv.field_size_limit(sys.maxsize)

field_id = 'ddd:11'
dateranges = [("1951-01-01", "1951-12-31"),
              ("1962-07-01", "1962-12-31"),
              ("1963-01-01", "1963-09-30"),
              ("1965-07-01", "1965-07-31"),
              ("1965-10-01", "1965-10-31"),
              ("1966-04-01", "1966-11-30"),
              ("1969-01-01", "1989-12-31")]
dateranges = list(map(lambda dr:
                      tuple(map(lambda x:
                                datetime.strptime(x, "%Y-%m-%d"), dr)),
                      dateranges))

def datefilter(x):
    x = datetime.strptime(x, "%Y-%m-%d")
    for r in dateranges:
        if r[0] <= x and r[1] >= x:
            return True
    return False

output = []
with open('my_file.csv', 'r') as f:
    reader = csv.reader(f, delimiter='\t', quotechar='"')
    next(reader)
    for row in reader:
        if datefilter(row[4]):
            var = re.search('\\b' + field_id, row[3])
            if not var:
                output.append(row)
        else:
            output.append(row)

with open('output.csv', 'w') as outputfile:
    writer = csv.writer(outputfile, delimiter='\t', quotechar='"')
    writer.writerows(output)