Remove specific rows from CSV file in python - python

I am trying to remove rows with a specific ID within particular dates from a large CSV file.
The CSV file contains a column [3] with dates formatted like "1962-05-23" and a column with identifiers [2]: "ddd:011232700:mpeg21:a00191".
Within the following date range:
01-01-1951 to 12-31-1951
07-01-1962 to 12-31-1962
01-01-1963 to 09-30-1963
07-01-1965 to 07-31-1965
10-01-1965 to 10-31-1965
04-01-1966 to 11-30-1966
01-01-1969 to 12-31-1969
01-01-1970 to 12-31-1989
I want to remove rows that contain the ID ddd:11*
I think I have to create a variable that contains both the date range and the ID, and look for these in every row, but I'm very new to Python so I'm not sure what would be an elegant way to do this.
This is what I have now. -CODE UPDATED
import csv
import collections
import sys
import re
from datetime import datetime
csv.field_size_limit(sys.maxsize)
# Inclusive (start, end) windows, written month-day-year.
dateranges = [("01-01-1951", "12-31-1951"), ("07-01-1962", "12-31-1962")]
# Parse every bound once up front so datefilter() only compares datetimes.
dateranges = [
    tuple(datetime.strptime(bound, "%m-%d-%Y") for bound in window)
    for window in dateranges
]


def datefilter(x):
    """Return True when the date string *x* lies inside any date window.

    Args:
        x: A date formatted as ``YYYY-MM-DD`` (the format used in the CSV).

    Returns:
        True if the date falls within one of ``dateranges`` (both ends
        inclusive), otherwise False.

    Raises:
        ValueError: If *x* is not a valid ``YYYY-MM-DD`` date.
    """
    when = datetime.strptime(x, "%Y-%m-%d")
    return any(start <= when <= end for start, end in dateranges)
# Copy every row to filtered.csv except those that fall inside one of the
# date windows AND carry an ID starting with "ddd:11".
# csv.reader needs an open file (a bare filename string would be iterated
# character by character), and csv.writer objects have no close() method,
# so the with-block is what closes both files.
with open('my_file.csv', newline='') as source, \
        open('filtered.csv', 'w', newline='') as target:
    writer = csv.writer(target)
    for row in csv.reader(source, delimiter='\t'):
        if datefilter(row[3]) and row[2].startswith("ddd:11"):
            continue  # unwanted ID inside a filtered date window
        writer.writerow(row)

I'd recommend using pandas: it's great for filtering tables. Nice and readable.
import pandas as pd
# Assumes the CSV has a header row and that the two columns of interest are
# labelled "mydate" and "identifier".
# A column named "date" is best avoided; "mydate" is used here instead.
df = pd.read_csv(inputFilename, parse_dates=["mydate"])

# Rows whose date falls inside one of the listed windows (both ends inclusive).
in_window = (
    df.mydate.between(pd.Timestamp("1951-01-01"), pd.Timestamp("1951-12-31"))
    | df.mydate.between(pd.Timestamp("1962-07-01"), pd.Timestamp("1962-12-31"))
)
# Rows whose identifier starts with the unwanted prefix (missing IDs never match).
unwanted_id = df.identifier.str.startswith("ddd:11", na=False)

# Drop only the rows that match BOTH conditions; every other row is kept.
df = df[~(in_window & unwanted_id)]
# index=False keeps the output columns identical to the input columns.
df.to_csv(outputFilename, index=False)
See Pandas Boolean Indexing

Here is how I would approach that, but it may not be the best method.
from datetime import datetime
# Inclusive (start, end) windows; only two are listed here for brevity.
dateranges = [("01-01-1951", "12-31-1951"), ("07-01-1962", "12-31-1962")]
# Convert both ends of every window from text to datetime objects.
dateranges = [
    tuple(datetime.strptime(text, "%m-%d-%Y") for text in pair)
    for pair in dateranges
]


def datefilter(x):
    """Tell whether the date string *x* falls inside a filtered window.

    Args:
        x: A date string in the CSV's own format, ``YYYY-MM-DD``.

    Returns:
        True when the date lies within any window in ``dateranges`` (ends
        inclusive), meaning the row is a candidate for removal; False when
        no window matches.

    Raises:
        ValueError: If *x* cannot be parsed as ``YYYY-MM-DD``.
    """
    # The date format is different here to match the format of the csv
    moment = datetime.strptime(x, "%Y-%m-%d")
    for first, last in dateranges:
        if first <= moment <= last:
            return True
    return False
# Text mode with newline="" is what the csv module expects in Python 3.
with open(main_file, newline="") as fp:
    root = csv.reader(fp, delimiter='\t')
    result = collections.defaultdict(list)
    for row in root:
        if datefilter(row[3]):
            # use a regular expression or any other means to filter on id here
            if row[2].startswith("ddd:11"):
                # code to remove item: here the row is simply skipped
                continue
What I have done is create a list of tuples of your date ranges (for brevity, I only put 2 ranges in it), and then I convert those into datetime objects.
I have used maps for doing that in one line: first loop over all tuples in that list, applying a function which loops over all entries in that tuple and converts to a date time, using the tuple and list functions to get back to the original structure. Doing it the long way would look like:
# Long-hand equivalent of the nested map() call: convert both ends of every
# (start, end) pair from "MM-DD-YYYY" text into datetime objects.
dateranges2 = []
for dr in dateranges:
    dateranges2.append((datetime.strptime(dr[0], "%m-%d-%Y"),
                        datetime.strptime(dr[1], "%m-%d-%Y")))
# The text version is no longer needed, so replace it with the parsed list.
dateranges = dateranges2
Notice that I just convert each item in the tuple into a datetime, and add the tuples to the new list, replacing the original (which I don't need anymore).
Next, I create a datefilter function which takes a date string, converts it to a datetime, and then loops over all the ranges, checking if the value is in the range. If it is, we return True (indicating this item should be filtered); otherwise we return False once we have checked all ranges with no match (indicating that we don't filter this item).
Now you can check out the id using any method that you want once the date has matched, and remove the item if desired. As your example is constant in the first few characters, we can just use the string startswith function to check the id. If it is more complex, we could use a regex.

My approach works like this:
import csv
import re
import datetime
field_id = 'ddd:11'
d1 = datetime.date(1951, 1, 1)    # change the start date
d2 = datetime.date(1951, 12, 31)  # change the end date
diff = d2 - d1
# Every day from d1 to d2 inclusive, as ISO text ("YYYY-MM-DD").
date_list = [
    (d1 + datetime.timedelta(days=offset)).isoformat()
    for offset in range(diff.days + 1)
]
# A set gives a constant-time membership test per row instead of scanning
# the whole date list for every row of the file.
wanted_dates = set(date_list)
# \b anchors the ID at a word boundary; re.escape keeps the ID literal.
id_pattern = re.compile(r'\b' + re.escape(field_id))

with open('mwevers_example_2016.01.02-07.25.55.csv', newline='') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        if row[3] in wanted_dates:
            print(row)
            if id_pattern.search(row[2]):
                # here you can make a function to copy those rows into
                # another file or any list
                print('olalala')

import csv
import sys
import re
from datetime import datetime
csv.field_size_limit(sys.maxsize)
field_id = 'ddd:11'
# Inclusive (start, end) windows in which rows with the unwanted ID are
# removed. The July 1965 window runs through the 31st, and the adjacent
# 1969 and 1970-1989 windows are merged into one.
dateranges = [("1951-01-01", "1951-12-31"),
              ("1962-07-01", "1962-12-31"),
              ("1963-01-01", "1963-09-30"),
              ("1965-07-01", "1965-07-31"),
              ("1965-10-01", "1965-10-31"),
              ("1966-04-01", "1966-11-30"),
              ("1969-01-01", "1989-12-31"),
              ]
# Parse every bound once so datefilter() only has to compare datetimes.
dateranges = [
    tuple(datetime.strptime(bound, "%Y-%m-%d") for bound in window)
    for window in dateranges
]


def datefilter(x):
    """Return True when the date string *x* lies inside any date window.

    Args:
        x: A date formatted as ``YYYY-MM-DD``.

    Returns:
        True if the date falls within one of ``dateranges`` (both ends
        inclusive), otherwise False.

    Raises:
        ValueError: If *x* is not a valid ``YYYY-MM-DD`` date.
    """
    when = datetime.strptime(x, "%Y-%m-%d")
    return any(start <= when <= end for start, end in dateranges)
# Stream rows straight from the input to the output so that a large file
# never has to be held in memory.
id_pattern = re.compile(r'\b' + re.escape(field_id))
with open('my_file.csv', 'r', newline='') as f, \
        open('output.csv', 'w', newline='') as outputfile:
    reader = csv.reader(f, delimiter='\t', quotechar='"')
    writer = csv.writer(outputfile, delimiter='\t', quotechar='"')
    # Skip the header row (it is not copied to the output); the None default
    # keeps an empty input file from raising StopIteration.
    next(reader, None)
    for row in reader:
        # Drop a row only when it is inside a filtered date window AND its
        # ID matches; every other row is written unchanged.
        if datefilter(row[4]) and id_pattern.search(row[3]):
            continue
        writer.writerow(row)

Related

Determine date columns in a csv file with python

I have simple *.csv file where some of the columns are dates of the format mm/dd/yy. Here is an example:
$ cat somefile.csv
05/09/15,8,Apple,05/09/15
06/10/15,5,Banana,06/10/12
05/11/18,4,Carrot,09/03/18
02/09/15,2,Apple,01/09/15
I want to easily determine if a column only contains valid dates,
but I find myself struggling with counting '/' and counting characters. Surely there is some simple way of doing it right?
EDIT (Answer from #RahulAgarwal)
Here's my script (which still doesn't work :(( )
###########
# IMPORTS #
###########
import csv
import sys
import numpy
from dateutil.parser import parse
###########################
# [1] Open input csv file #
###########################
myfile=open("input4.csv","r")
myreader = csv.reader(myfile)
############################
# [2] read header csv file #
############################
for myline in myreader:
myheader=myline
break
####################################################################
# [3] read and put in ds only data originating in specific columns #
####################################################################
# Go through the rows still left in myreader and test every cell.
for myline in myreader:
for myColIndex in range(len(myline)):
# NOTE(review): this branches on the truthiness of parse()'s return value.
# dateutil's parse() presumably raises on text that is not a date rather than
# returning a falsy value, so a try/except would be needed -- confirm against
# the dateutil documentation.
if (parse(myline[myColIndex])):
print("column = {0}".format(myColIndex))
######################
# [4] Close csv file #
######################
myfile.close()
You can try below to check for valid dates:
from dateutil.parser import parse
parse("05/09/15")
You can use a set to keep track of columns seen in the file and a set of columns that didn't parse successfully as a valid date, then the difference between those two is columns that did parse as date, eg:
import csv
from datetime import datetime
with open('yourfile.csv', newline='') as fin:
    seen_columns = set()
    invalid_columns = set()
    for row in csv.reader(fin):
        for colno, col in enumerate(row, 1):
            # We've seen it contains a non-date - don't try and parse it again
            if colno in invalid_columns:
                continue
            # Make a note we've seen column N
            seen_columns.add(colno)
            # Try and see if we can parse it to the desired date format
            try:
                datetime.strptime(col, '%m/%d/%y')
            # Nope - we couldn't... not a date - so don't bother checking again
            except ValueError:
                invalid_columns.add(colno)

# Columns containing dates are those we've seen that
# didn't fail to parse as a date...
valid_columns = seen_columns - invalid_columns
You could use the strptime method of the datetime object:
from datetime import datetime
def isDateValid(date, pattern="%d/%m/%y"):
    """Report whether *date* matches the strptime *pattern*.

    Args:
        date: The text to check.
        pattern: A ``datetime.strptime`` format; defaults to day/month/
            two-digit-year.

    Returns:
        True if *date* parses with *pattern*, False if strptime rejects it.
    """
    try:
        datetime.strptime(date, pattern)
    except ValueError:
        return False
    return True
The strptime method raises a ValueError if the string doesn't match the pattern.
EDIT:
to let this work:
from datetime import datetime
def isDateValid(date, pattern="%d/%m/%y"):
    """Check a string against a strptime pattern.

    Args:
        date: The text to check.
        pattern: A ``datetime.strptime`` format string; the default expects
            day/month/two-digit-year.

    Returns:
        True when the text parses with the pattern, False when strptime
        raises ValueError for it.
    """
    try:
        datetime.strptime(date, pattern)
        return True
    except ValueError:
        return False
# load file
with open("filename.csv") as f:
    # split file into lines (splitlines() also drops the newline characters)
    lines = f.read().splitlines()

# extract the header
header = lines[0]
# extract rows
rows = lines[1:]
# loop over every row
for rowNumber, row in enumerate(rows, 1):
    # split row into the separate columns
    columns = row.split(",")
    # if at least one out of all columns in that row got a valid date
    # the row number gets printed
    if any(isDateValid(column) for column in columns):
        print(f"Row {rowNumber} got at least one valid date")
(Code is written in Python 3.7)

how to remove DD in YYYYMMDD in python

I need to remove the day in date and I tried to use datetime.strftime and datetime.strptime but it couldn't work. I need to create a tuple of 2 items(date,price) from a nested list but I need to change the date format first.
here's part of the code:
def get_data(my_csv):
    """Read a comma-separated file and return its data rows.

    Args:
        my_csv: Path of the CSV file to read.

    Returns:
        A list of rows, each a list of strings, with the first (header)
        line left out. An empty file yields an empty list.
    """
    # Open the path that was passed in rather than a fixed file name.
    with open(my_csv, "r", newline="") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  # skip the header line, if there is one
        return list(csv_reader)
# Work-in-progress attempt at dropping the day from a YYYYMMDD string; the
# `data` argument is not used yet and nothing is returned.
def get_monthly_avg(data):
oldformat = '20040819'
# NOTE(review): lowercase %y is the two-digit year directive; a four-digit
# year such as 2004 needs %Y.
datetimeobject = datetime.strptime(oldformat,'%y%m%d')
# NOTE(review): strftime is called here without the datetime object to
# format (datetimeobject is never passed in).
newformat = datetime.strftime('%y%m ')
You have a misprint in the date format: the 'Y' has to be capitalized.
from datetime import datetime
# use datetime to convert
def strip_date(data):
    """Convert a ``YYYYMMDD`` string to ``YYYYMM``.

    Parsing first (instead of slicing) verifies that *data* really is a
    valid date.

    Raises:
        ValueError: If *data* is not a valid ``YYYYMMDD`` date.
    """
    return datetime.strptime(data, '%Y%m%d').strftime('%Y%m')
data = '20110513'
print (strip_date(data))
# or just cut off day (2 last symbols) from date string
print (data[:6])
The first variant is better because you can verify that string is in proper date format.
Output:
201105
201105
You didn't post your code, but this might work:
date = functionThatGetsDate()
date = date[0:6]

How to compare date from csv(string) to actual date

filenameA ="ApptA.csv"
filenameAc = "CheckoutA.csv"
# Append to filenameAc every row of filenameA whose column 5 compares
# less than or equal to today's date rendered as "%d/%m/%Y" text.
def checkouttenantA():
global filenameA
global filenameAc
import csv
import datetime
with open(filenameA, 'r') as inp, open(filenameAc, 'a' , newline = "") as out:
my_writer = csv.writer(out)
for row in csv.reader(inp):
my_date= datetime.date.today()
string_date = my_date.strftime("%d/%m/%Y")
# NOTE(review): both operands are strings, so this is a character-by-character
# comparison that starts with the day digits, not a chronological one.
if row[5] <= string_date:
my_writer.writerow(row)
Dates are saved in format %d/%m/%Y in an excel file on column [5]. I am trying to compare dates in csv file with actual date, but it is only comparing the %d part. I assume it is because dates are in string format.
Ok so there are a few improvements to make as well, which I'll put as an edit to this, but you're converting todays date to a string with strftime() and comparing the two strings, you should be converting the string date from the csv file to a datetime object and comparing those instead.
I'll add plenty of comments to try and explain the code and the reasoning behind it.
# imports should go at the top
import csv
# notice we are importing datetime from datetime (we are importing the
# `datetime` type from the module datetime)
from datetime import datetime


# try to avoid globals where possible (they're not needed here)
def check_dates_in_csv(input_filepath):
    """Load a CSV file and return the rows dated today or earlier.

    Args:
        input_filepath: Path of the CSV file to read. Column 5 of every row
            must hold a date formatted as ``%d/%m/%Y``.

    Returns:
        A list of rows (each a list of strings) whose date is not later
        than the current date and time.

    Raises:
        ValueError: If a row's column 5 is not a ``%d/%m/%Y`` date.
        IndexError: If a row has fewer than six columns.
    """
    # create a list to store the rows which meet our criteria
    # appending the rows to this will make a list of lists (nested list)
    output_data = []
    # get todays date before loop to avoid calling now() every line
    # we only need this once and it'll slow the loop down calling it every row
    todays_date = datetime.now()
    # open your csv here using the function argument
    with open(input_filepath, newline='') as csv_file:
        reader = csv.reader(csv_file)
        # iterate over the rows and grab the date in each row
        for row in reader:
            string_date = row[5]
            # convert the string to a datetime object
            csv_date = datetime.strptime(string_date, '%d/%m/%Y')
            # compare the dates and append if it meets the criteria
            if csv_date <= todays_date:
                output_data.append(row)
    # function should only do one thing, compare the dates
    # save the output after
    return output_data
# then run the script here
# this comparison is basically the entry point of the python program
# this answer explains it better than I could: https://stackoverflow.com/questions/419163/what-does-if-name-main-do
if __name__ == "__main__":
    # use our new function to get the output data
    output_data = check_dates_in_csv("input_file.csv")
    # save the data here; newline="" stops the csv module from writing
    # blank lines between rows on Windows
    with open("output.csv", "w", newline="") as output_file:
        writer = csv.writer(output_file)
        writer.writerows(output_data)
I would recommend to use Pandas for such tasks:
import pandas as pd
filenameA = "ApptA.csv"
filenameAc = "CheckoutA.csv"
# pd.datetime is no longer available in pandas; pd.Timestamp.today() is the
# supported way to get the current date and time.
today = pd.Timestamp.today()
# dayfirst=True because the dates are stored day-first (%d/%m/%Y).
df = pd.read_csv(filenameA, parse_dates=[5], dayfirst=True)
# Keep only the rows whose date (column 5) is today or earlier, and write
# the dates back out in their original day-first format.
df.loc[df.iloc[:, 5] <= today].to_csv(filenameAc, index=False,
                                      date_format="%d/%m/%Y")

Python - Extract data from csvfile1 and write to csvfile2 based on values in columns

I have data stored in a csv file :
ID;Event;Date
ABC;In;05/01/2015
XYZ;In;05/01/2016
ERT;In;05/01/2014
... ... ...
ABC;Out;05/01/2017
First, I am trying to extract all rows where Event is "In" and save those rows in a new csv file. Here is the code I've tried so far:
[UPDATED : 05/18/2017]
# Copy the rows whose Event field is 'In' from csv_in to csv_out.
with open('csv_in', 'r') as f, open('csv_out','w') as f2:
fieldnames=['ID','Event','Date']
# NOTE(review): because fieldnames is passed explicitly, DictReader does not
# treat the file's first line as a header; it is read as a data row.
reader = csv.DictReader(f, delimiter=';', lineterminator='\n',
fieldnames=fieldnames)
wr = csv.DictWriter(f2,dialect='excel',delimiter=';',
lineterminator='\n',fieldnames=fieldnames)
rows = [row for row in reader if row['Event'] == 'In']
for row in rows:
# NOTE(review): writerows() expects an iterable of row dicts; given a single
# dict it iterates over that dict's keys (strings such as 'ID'), which matches
# the reported error about fields 'I', 'D'. writerow(row) is presumably what
# was intended here.
wr.writerows(row)
I am getting the following error : " ValueError: dict contains fields not in fieldnames: 'I', 'D'
[/UPDATED]
1/ Any thoughts on how to fix this ?
2/ Next step, how would you proceed to do a "lookup" on the ID (if exists several times as per ID "ABC") and extract the given "Date" value where Event is "Out"
output desired :
ID Date Exit date
ABC 05/01/2015 05/01/2017
XYZ 05/01/2016
ERT 05/01/2014
Thanks in advance for your input.
PS : can't use panda .. only standard lib.
you can interpret the raw csv with the standard library like so:
# read the whole input and split it into lines; the with-block closes the file
with open('csv_in.csv', 'r') as old_csv_file:
    oldcsv = old_csv_file.read().split('\n')
# this next part checks for events that are in: a line is kept when one of
# its ';'-separated fields is exactly 'In'
newcsv = [line for line in oldcsv if 'In' in line.split(';')]
with open('new_csv.csv', 'w') as new_csv_file:
    new_csv_file.writelines(line + '\n' for line in newcsv)
You would use the same method to do your look-up; you'd just change the keyword in that for loop. If the newly generated list contains more than one item, your ID occurs more than once; in that case, just modify the condition to include two keywords.
The error here is because you have not added a delimiter.
Syntax-
csv.DictReader(f, delimiter=';')
For Part 2.
import csv
import datetime
DATE_FORMAT = '%d/%m/%Y'

with open('csv_in', 'r', newline='') as f, \
        open('csv_out', 'w', newline='') as f2:
    reader = csv.DictReader(f, delimiter=';')
    wr = csv.writer(f2, dialect='excel', lineterminator='\n')
    # result maps each ID to {'IN': earliest entry date, 'OUT': latest exit
    # date}; a key is only present once such an event has been seen.
    result = {}
    for row in reader:
        when = datetime.datetime.strptime(row['Date'], DATE_FORMAT)
        dates = result.setdefault(row['ID'], {})
        if row['Event'] == 'In':
            # keep the earliest 'In' date seen for this ID
            dates['IN'] = min(dates.get('IN', when), when)
        else:
            # every other event counts as an exit; keep the latest one
            dates['OUT'] = max(dates.get('OUT', when), when)

    wr.writerow(['ID', 'Entry', 'Exit'])
    for row_id, dates in result.items():
        # format the results back to the desired representation; a missing
        # date is written as an empty cell
        entry = dates['IN'].strftime(DATE_FORMAT) if 'IN' in dates else None
        leave = dates['OUT'].strftime(DATE_FORMAT) if 'OUT' in dates else None
        wr.writerow([row_id, entry, leave])
This code should work just fine. I have tested it on a small input

Finding maximum values within a data set given restrictions

I have a task where I have been given a set of data as follows
Station1.txt sample #different sets of data for different no. stations
Date Temperature
19600101 46.1
19600102 46.7
19600103 99999.9 #99999 = not recorded
19600104 43.3
19600105 38.3
19600106 40.0
19600107 42.8
I am trying to create a function
display_maxs(stations, dates, data, start_date, end_date) which displays
a table of maximum temperatures for the given station/s and the given date
range. For example:
stations = load_stations('stations2.txt')
5
data = load_all_stations_data(stations)
dates = load_dates(stations)
display_maxs(stations, dates, data, '20021224','20021228' #these are date yyyy/mm/dd)
I have created functions for data
# Attempt at building a dict that maps each station to data read from its file.
def load_all_stations_data(stations):
data = {}
# NOTE(review): this line closes one more parenthesis than it opens.
file_list = ("Brisbane.txt", "Rockhampton.txt", "Cairns.txt", "Melbourne.txt", "Birdsville.txt", "Charleville.txt") )
for file_name in file_list:
# NOTE(review): `stations` is called like a function here, and file_name is
# not used to choose the file that gets opened.
file = open(stations(), 'r')
station = file_name.split()[0]
data[station] = []
for line in file:
# NOTE(review): two strip() calls leave `values` a string, so len(values) and
# values[1] below refer to characters -- presumably a split() was intended.
values = line.strip().strip(' ')
if len(values) == 2:
# Assignment replaces whatever data[station] held before.
data[station] = values[1]
file.close()
return data
functions for stations
# Attempt at collecting the second whitespace-separated field of every line
# of a station's file.
def load_all_stations_data(stations):
# NOTE(review): the `stations` argument is replaced by an empty list here,
# before stations[0] is read on the next line.
stations = []
f = open(stations[0] + '.txt', 'r')
stations = []
for line in f:
x = (line.split()[1])
x = x.strip()
# NOTE(review): `temp` is not defined anywhere in this function, and the list
# that is returned (`stations`) is never appended to.
temp.append(x)
f.close()
return stations
and functions for dates
def load_dates(stations):
    """Return the first field of every line in the first station's file.

    Args:
        stations: A sequence of station names; only ``stations[0]`` is
            used, and ``'.txt'`` is appended to it to form the file name.

    Returns:
        A list holding the first whitespace-separated field of each
        non-blank line, in file order. The first field of a header line,
        if the file has one, is included.
    """
    with open(stations[0] + '.txt', 'r') as f:
        # A blank line has no fields, so skip it instead of failing on [0].
        return [fields[0] for fields in map(str.split, f) if fields]
Now I just need help with creating the table which displays the max temp for any given date restrictions and calls the above functions with data, dates and station.
Not really sure what those functions are supposed to do, particularly as two of them seem to have the same name. Also there are many errors in your code.
file = open(stations(), 'r') here, you try to call stations as a function, but it seems to be a list.
station = file_name.split()[0] the files names have no space, so this has no effect. Did you mean split('.')?
values = line.strip().strip(' ') probably one of those strip should be split?
data[station] = values[1] overwrites data[station] in each iteration. You probably wanted to append the value?
temp.append(x) the variable temp is not defined; did you mean stations?
Also, instead of reading the dates and the values into two separate list, I suggest you create a list of tuples. This way you will only need a single function:
def get_data(filename):
    """Read ``date value`` lines from *filename* into a list of tuples.

    Args:
        filename: Path of a text file with one ``date value`` pair per
            line, separated by whitespace.

    Returns:
        A list of ``(int(date), float(value))`` tuples in file order. Lines
        that do not consist of exactly one integer and one number (header,
        empty lines, lines with extra text) are skipped.
    """
    data = []
    with open(filename) as f:
        for line in f:
            try:
                date, value = line.split()
                data.append((int(date), float(value)))
            except ValueError:
                # Wrong field count or non-numeric text: header, empty
                # lines, etc. Only this error is ignored.
                continue
    return data
If this is not an option, you might create a list of tuples by zipping the lists of dates and values, i.e. data = zip(dates, values). Then, you can use the max builtin function together with a list comprehension or generator expression for filtering the values between the dates and a special key function for sorting by the value.
def display_maxs(data, start_date, end_date, missing=99999):
    """Return the ``(date, value)`` pair with the highest value in a range.

    Args:
        data: An iterable of ``(date, value)`` pairs.
        start_date: First date to include (inclusive).
        end_date: Last date to include (inclusive).
        missing: Values at or above this number are treated as "not
            recorded" and ignored.

    Returns:
        The ``(date, value)`` pair whose value is largest among the pairs
        inside the date range.

    Raises:
        ValueError: If no pair falls inside the range with a usable value.
    """
    candidates = (
        (d, v) for (d, v) in data
        if start_date <= d <= end_date and v < missing
    )
    return max(candidates, key=lambda pair: pair[1])
print(display_maxs(get_data("Station1.txt"), 19600103, 19600106))
Use pandas. Reading in each text file takes a single function call, which handles comments, missing data (99999.9), and date parsing. The code below reads the files from a sequence of file names, strips comments, and converts 99999.9 to a "missing" value. It then selects the rows from the start date to the stop date for the requested station names (the file names minus their extensions) and takes the maximum of each (stored in maxdf).
import pandas as pd
import os
def load_all_stations_data(stations):
    """Load the stations defined in the sequence of file names.

    Args:
        stations: A sequence of file names, one text file per station.

    Returns:
        A DataFrame with one column per file, named after the file name
        minus its extension. Text after ``#`` is ignored and the value
        99999.9 is read as missing.
    """
    sers = []
    for fname in stations:
        # read_csv no longer accepts squeeze=True; squeezing the one-column
        # frame afterwards gives the same Series.
        ser = pd.read_csv(fname, sep=r'\s+', header=0, index_col=0,
                          comment='#', engine='python', parse_dates=True,
                          na_values=['99999.9']).squeeze("columns")
        ser.name = os.path.splitext(fname)[0]
        sers.append(ser)
    return pd.concat(sers, axis=1)
def get_maxs(df, startdate, stopdate, stations):
    """Get the stations and date range given, then get the max for each.

    Args:
        df: A DataFrame indexed by date with one column per station.
        startdate: First date to include (inclusive), e.g. ``'20021224'``.
        stopdate: Last date to include (inclusive).
        stations: The station (column) names to report on.

    Returns:
        A Series giving, for each requested station, the maximum value in
        the date range; missing values are skipped.
    """
    # list() matters: .loc would read a tuple as one multi-level key rather
    # than as several column names.
    return df.loc[startdate:stopdate, list(stations)].max(skipna=True)
Usage of the second function would be like so:
maxdf = get_maxs(df, '20021224','20021228', ("Rockhampton", "Cairns"))
If the #99999 = not recorded comment is not actually in your files, you can get rid of the engine='python' and comment='#' arguments:
def load_all_stations_data(stations):
    """Load the stations defined in the sequence of file names.

    Args:
        stations: A sequence of file names, one text file per station.

    Returns:
        A DataFrame with one column per file, named after the file name
        minus its extension. The value 99999.9 is read as missing.
    """
    sers = []
    for fname in stations:
        # read_csv no longer accepts squeeze=True; squeezing the one-column
        # frame afterwards gives the same Series.
        ser = pd.read_csv(fname, sep=r'\s+', header=0, index_col=0,
                          parse_dates=True,
                          na_values=['99999.9']).squeeze("columns")
        ser.name = os.path.splitext(fname)[0]
        sers.append(ser)
    return pd.concat(sers, axis=1)

Categories

Resources