How can I combining like-rows in CSV to fill missing data? - python

I have a CSV file that might look like:
id, value01, value02,
01, , 01b,
01, 01a, ,
02, , 02b,
02, 02a, 02b,
...
As you can see, I have duplicate rows where one of the duplicates (can be more than two) (determined as a duplicate by the id) has missing values, and the other duplicates contain other values missing.
I think someone who managed this CSV output wrote twice to the CSV rather than combine results and output once, so now I need to find a clean way to do this.
So far, my work is:
import csv
def combine_dups():
data = []
with open("file.csv", newline='', mode='r') as csvFile:
csvData = csv.DictReader(csvFile)
for row in csvData:
for lookahead in csvData:
# Check if lookahead and current row have matching ids
if row["id"] == lookahead["id"]:
# Loop through columns of row and lookahead
for col in row:
# If current row's column is blank, take value from lookahead
if row[col] == '' or row[col] is None:
row[col] = lookahead[col]
data.append(row) # Add new filled out, completed row
# Manage data to no longer contain excess duplicates
# Code here??
return data
This code isn't correct, as it as:
If loops through csvData for each row, rather than looping through all data after current row. This is easily solved using a for loop using an index, but I left that out for simplicity.
The row is filled in with the missing data, but this operation is done multiple times for the other values with identical id's. How can I avoid this?
Edit:
For clarity, the NEW csv should look like:
id, value01, value02,
01, 01a, 01b,
02, 02a, 02b,
...

Using the csv module.
Ex:
import csv
data = {}
with open(filename) as csvInFile:
csvData = csv.DictReader(csvInFile)
fieldnames = csvData.fieldnames
for row in csvData:
# Combine data and update.
data.setdefault(row['id'], dict()).update({k:v for k,v in row.items() if v.strip() and not data.get(row['id']).get(k)})
with open(filename, "w", newline='') as csvOutFile:
csvOutData = csv.DictWriter(csvOutFile, fieldnames=fieldnames)
csvOutData.writeheader()
csvOutData.writerows(data.values()) # Write data.
Output:
id, value01, value02
01, 01a, 01b
02, 02a, 02b

The could be solved by using a dictionary object to store the values. This csv would be traversed just once for each row.
You can refer to the code below:
import csv
c_dict = {}
with open("file.csv", newline='', mode='r') as csvFile:
csvData = csv.DictReader(csvFile)
for row in csvData:
if row['id'] not in c_dict.keys():
c_dict[row['id']] = {}
if (row['value01'] == '' or row['value01'] is None) or 'value01' in c_dict[row['id']].keys():
pass
else:
c_dict[row['id']]['value01'] = row['value01']
if (row['value02'] == '' or row['value02'] is None) or 'value02' in c_dict[row['id']].keys():
pass
else:
c_dict[row['id']]['value02'] = row['value02']
keys = ['id', 'value01', 'value02']
with open('output.csv', 'w', newline='') as output_file:
dict_writer = csv.writer(output_file)
dict_writer.writerow(keys)
for k, v in c_dict.items():
dict_writer.writerow([k, v['value01'], v['value02']])
Note: If you have more than these 2 columns, you may if-else clause in a loop

Related

How to delete a row in a CSV file if a cell is empty using Python

I want to go through large CSV files and if there is missing data I want to remove that row completely, This is only row specific so if there is a cell that = 0 or has no value then I want to remove the entire row. I want this to happen for all the columns so if any column has a black cell it should delete the row, and return the corrected data in a corrected csv.
import csv
with open('data.csv', 'r') as csvfile:
csvreader = csv.reader(csvfile)
for row in csvreader:
print(row)
if not row[0]:
print("12")
This is what I found and tried but it doesnt not seem to be working and I dont have any ideas about how to aproach this problem, help please?
Thanks!
Due to the way in which CSV reader presents rows of data, you need to know how many columns there are in the original CSV file. For example, if the CSV file content looks like this:
1,2
3,
4
Then the lists return by iterating over the reader would look like this:
['1','2']
['3','']
['4']
As you can see, the third row only has one column whereas the first and second rows have 2 columns albeit that one is (effectively) empty.
This function allows you to either specify the number of columns (if you know them before hand) or allow the function to figure it out. If not specified then it is assumed that the number of columns is the greatest number of columns found in any row.
So...
import csv
DELIMITER = ','
def valid_column(col):
try:
return float(col) != 0
except ValueError:
pass
return len(col.strip()) > 0
def fix_csv(input_file, output_file, cols=0):
if cols == 0:
with open(input_file, newline='') as indata:
cols = max(len(row) for row in csv.reader(indata, delimiter=DELIMITER))
with open(input_file, newline='') as indata, open(output_file, 'w', newline='') as outdata:
writer = csv.writer(outdata, delimiter=DELIMITER)
for row in csv.reader(indata, delimiter=DELIMITER):
if len(row) == cols:
if all(valid_column(col) for col in row):
writer.writerow(row)
fix_csv('original.csv', 'fixed.csv')
maybe like this
import csv
with open('data.csv', 'r') as csvfile:
csvreader = csv.reader(csvfile)
data=list(csvreader)
data=[x for x in data if '' not in x and '0' not in x]
you can then rewrite the the csv file if you like
Instead of using csv, you should use Pandas module, something like this.
import pandas as pd
df = pd.read_csv('file.csv')
print(df)
index = 1 #index of the row that you want to remove
df = df.drop(index)
print(df)
df.to_csv('file.csv')

Python - Extract data from csvfile1 and write to csvfile2 based on values in columns

I have data stored in a csv file :
ID;Event;Date
ABC;In;05/01/2015
XYZ;In;05/01/2016
ERT;In;05/01/2014
... ... ...
ABC;Out;05/01/2017
First, I am trying to extract all rows where Event is "In" and saves thoses rows in a new csv file. Here is the code i've tried so far:
[UPDATED : 05/18/2017]
with open('csv_in', 'r') as f, open('csv_out','w') as f2:
fieldnames=['ID','Event','Date']
reader = csv.DictReader(f, delimiter=';', lineterminator='\n',
fieldnames=fieldnames)
wr = csv.DictWriter(f2,dialect='excel',delimiter=';',
lineterminator='\n',fieldnames=fieldnames)
rows = [row for row in reader if row['Event'] == 'In']
for row in rows:
wr.writerows(row)
I am getting the following error : " ValueError: dict contains fields not in fieldnames: 'I', 'D'
[/UPDATED]
1/ Any thoughts on how to fix this ?
2/ Next step, how would you proceed to do a "lookup" on the ID (if exists several times as per ID "ABC") and extract the given "Date" value where Event is "Out"
output desired :
ID Date Exit date
ABC 05/01/2015 05/01/2017
XYZ 05/01/2016
ERT 05/01/2014
Thanks in advance for your input.
PS : can't use panda .. only standard lib.
you can interpret the raw csv with the standard library like so:
oldcsv=open('csv_in.csv','r').read().split('\n')
newcsv=[]
#this next part checks for events that are in
for line in oldcsv:
if 'In' in line.split(';'):
newcsv.append(line)
new_csv_file=open('new_csv.csv','w')
[new_csv_file.write(line+'\n') for line in newcsv]
new_csv_file.close()
you would use the same method to do your look-up, it's just that you'd change the keyword in that for loop, and if there's more than one item in the newly generated list you have more than one occurance of your ID, then just modify the condition to include two keywords
The error here is because you have not added a delimiter.
Syntax-
csv.DictReader(f, delimiter=';')
For Part 2.
import csv
import datetime
with open('csv_in', 'r') as f, open('csv_out','w') as f2:
reader = csv.DictReader(f, delimiter=';')
wr = csv.writer(f2,dialect='excel',lineterminator='\n')
result = {}
for row in reader:
if row['ID'] not in result:
# Assign Values if not in dictionary
if row['Event'] == 'In':
result[row['ID']] = {'IN' : datetime.datetime.strptime(row['Date'], '%d/%m/%Y') }
else:
result[row['ID']] = {'OUT' : datetime.datetime.strptime(row['Date'], '%d/%m/%Y') }
else:
# Compare dates with those present in csv.
if row['Event'] == 'In':
# if 'IN' is not present, use the max value of Datetime to compare
result[row['ID']]['IN'] = min(result[row['ID']].get('IN', datetime.datetime.max), datetime.datetime.strptime(row['Date'], '%d/%m/%Y'))
else:
# Similarly if 'OUT' is not present, use the min value of datetime to compare
result[row['ID']]['OUT'] = max(result[row['ID']].get('OUT', datetime.datetime.min), datetime.datetime.strptime(row['Date'], '%d/%m/%Y'))
# format the results back to desired representation
for v1 in result.values():
for k2,v2 in v1.items():
v1[k2] = datetime.datetime.strftime(v2, '%d/%m/%Y')
wr.writerow(['ID', 'Entry', 'Exit'])
for row in result:
wr.writerow([row, result[row].get('IN'), result[row].get('OUT')])
This code should work just fine. I have tested it on a small input

More pythonic way of iteratively assigning csv rows to dictionary values?

I have a CSV file, with columns holding specific values that I read into specific places in a dictionary, and rows separate instances of data that equal one full dictionary. I read in and then use this data to computer certain values, process some of the inputs, etc., for each row before moving on to the next row. My question is, if I have a header that specifics the names of the columns (Key1 versus Key 3A, etc.), can I use that information to avoid the somewhat draw out code I am currently using (below).
with open(input_file, 'rU') as controlFile:
reader = csv.reader(controlFile)
next(reader, None) # skip the headers
for row in reader:
# Grabbing all the necessary inputs
inputDict = {}
inputDict["key1"] = row[0]
inputDict["key2"] = row[1]
inputDict["key3"] = {}
inputDict["key3"].update({"A" : row[2]})
inputDict["key3"].update({"B" : row[3]})
inputDict["key3"].update({"C" : row[4]})
inputDict["key3"].update({"D" : row[5]})
inputDict["key3"].update({"E" : row[6]})
inputDict["Key4"] = {}
inputDict["Key4"].update({"F" : row[7]})
inputDict["Key4"].update({"G" : float(row[8])})
inputDict["Key4"].update({"H" : row[9]})
If you use a DictReader, you can improve your code a bit:
Create an object which operates like a regular reader but maps the
information read into a dict whose keys are given by the optional
fieldnames parameter. The fieldnames parameter is a sequence whose
elements are associated with the fields of the input data in order.
These elements become the keys of the resulting dictionary. If the
fieldnames parameter is omitted, the values in the first row of the
csvfile will be used as the fieldnames.
So, if we utilize that:
import csv
import string
results = []
mappings = [
[(string.ascii_uppercase[i-2], i) for i in range(2, 7)],
[(string.ascii_uppercase[i-2], i) for i in range(7, 10)]]
with open(input_file, 'rU') as control_file:
reader = csv.DictReader(control_file)
for row in reader:
row_data = {}
row_data['key1'] = row['key1']
row_data['key2'] = row['key2']
row_data['key3'] = {k:row[v] for k,v in mappings[0]}
row_data['key4'] = {k:row[v] for k,v in mappings[1]}
results.append(row_data)
yes you can.
import csv
with open(infile, 'rU') as infile:
reader = csv.DictReader(infile)
for row in reader:
print(row)
Take a look at this piece of code.
fields = csv_data.next()
for row in csv_data:
parsed_data.append(dict(zip(fields,row)))

Replace column in csv with modified column

I got a csv file with a couple of columns and a header containing 4 rows. The first column contains the timestamp. Unfortunately it also gives milliseconds, but whenever those are at 00, they are not given in the file. It looks like that:
"TOA5","CR1000","CR1000","E9048"
"TIMESTAMP","RECORD","BattV_Avg","PTemp_C_Avg"
"TS","RN","Volts","Deg C"
"","","Avg","Avg"
"2015-08-28 12:40:23.51",1,12.91,32.13
"2015-08-28 12:50:43.23",2,12.9,32.34
"2015-08-28 13:12:22",3,12.91,32.54
As I don't need the milliseconds, I want to get rid of those, as this makes further calculations containing time a bit complicated. My approach so far:
Extract first 20 digits in each row to get a format such as 2015-08-28 12:40:23
timestamp = []
with open(filepath) as f:
for _ in xrange(4): #skip 4 header rows
next(f)
for line in f:
time = line[1:20] #Get values for the current line
timestamp.append(time) #Add values to list
From here on I'm struggling on how to procede further. I want to exchange the first column in the csv file with the newly created timestamp list.
I tried creating a dictionary, but I don't know how to use the header caption in row 2 as the key:
d = {}
with open(filepath, 'rb') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
for col in csv_reader:
#use header info from row 2 as key here
This would import the whole csv file into a dict and I'd then change the TIMESTAMP entry in the dict with the timestamp list above. Is this even possible?
Or is there an easier approach on how to just change the first column in the csv with my new list so that my csv file in the end contains the timestamp just without the millisecond information?
So the first column in my csv should look like this:
"TOA5"
"TIMESTAMP"
"TS"
""
2015-08-28 12:40:23
2015-08-28 12:50:43
2015-08-28 13:12:22
This should do it and preserve the quoting:
with open(filepath1, 'rb') as fin, open(filepath2, 'wb') as fout:
reader = csv.reader(fin)
writer = csv.writer(fout, quoting=csv.QUOTE_NONNUMERIC)
for _ in xrange(4): # copy first 4 header rows
writer.writerow(next(reader))
for row in reader: # process data lines
row[0] = row[0][:19] # strip fractional seconds from first column
writer.writerow([row[0], int(row[1])] + map(float, row[2:]))
Since a csv.reader returns the columns of each row as a list of strings, it's necessary to convert any which contain numeric values into their actual int or float numeric value before they're written out to prevent them from being quoted.
I believe you can easily create a new csv from iterating over the original csv and replacing the timestamp as you want.
Example -
with open(filepath, 'rb') as csv_file, open('<new file>','wb') as outfile:
csv_reader = csv.reader(csv_file, delimiter=',')
csv_writer = csv.writer(outfile, delimiter=',')
for i, row in enumerate(csv_reader): #Enumerating as we only need to change rows after 3rd index.
if i <= 3:
csv_writer.writerow(row)
else:
csv_writer.writerow([row[0][1:20]] + row[1:])
I'm not entirely sure about how to parse your csv but I would do something of the sort:
time = time.split(".")[0]
so if it does have a millisecond it would get removed and if it doesn't nothing will happen.

How do create new column in csv file using python by shifting one row

I have CSV file like below. It is huge file with thousands of records.
input.csv
No;Val;Rec;CSR
0;10;1;1200
0;100;2;1300
0;100;3;1300
0;100;4;1400
0;10;5;1200
0;11;6;1200
I want to create output.csv file by adding new column "PSR" after 1st column "No". This column value depends on column "PSR" Value. For 1st row, "PSR" shall be zero. From next record on-wards, it depends on "CSR" value in previous row. If present and previous record CSR value is same, then "PSR" shall be zero. If not, PSR value shall have the previous CSR value. For exmple, Value of CSR in 2nd row is 1300 which is different to the value in 1st record ( it is 1200). So PSR value for 2nd row shall be 1200. Where in 2nd and 3rd row, CSR value is same. So PSR value for 3rd row shall be zero. So new value PSR depends on CSR value in present and previous field.
Output.csv
No;PCR;Val;Rec;CSR
0;0;10;1;1200
0;1200;100;2;1300
0;0;100;3;1300
0;1300;100;4;1400
0;1400;10;5;1200
0;0;11;6;1200
My Approach:
Use csv.reader and iterate over the objects in a list. Copy 5th column to 2nd column in list. Shift it one row down.
Then check the values in 2nd and 5th column (PCR and CSR), if both values are same. Replace the PCR value with zero.
I have problem in getting 1st step coded. I am able to duplicate the column but not able to shift it. Also 2nd step is quite straightforward.
Also, I am not sure whether this approach is correct Any pointers/recommendation would be really helpful.
Note: I am not able to install Pandas on CentOS. So help without this module would be better.
My Code:
with open('input.csv', 'r') as input, open('output.csv', 'w') as output:
reader = csv.reader(input, delimiter = ';')
writer = csv.writer(output, delimiter = ';')
mylist = []
header = next(reader)
mylist.append(header)
for rec in reader:
mylist.append(rec)
rec.insert(1, rec[3])
mylist.append(rec)
writer.writerows(mylist)
If your open to non-python solutions then awk could be a good option:
awk 'NR==1{$2="PSR;"$2}NR>1{$2=($4==a?0";"$2:+a";"$2);a=$4}1' FS=';' OFS=';' file
No;PSR;Val;Rec;CSR
0;0;10;1;1200
0;1200;100;2;1300
0;0;100;3;1300
0;1300;100;4;1400
0;1400;10;5;1200
0;0;11;6;1200
Awk is distributed with pretty much all Linux distributions and was designed exactly for this kind of task. It will blaze through your file. Add a redirection to the end > output.csv to save the output in a file.
A simple python approach using the same logic:
#!/usr/bin/env python
last = "0"
with open('input.csv') as csv:
print next(csv).strip().replace(';', ';PSR;', 1)
for line in csv:
field = line.strip().split(';')
if field[3] == last: field.insert(1, "0")
else: field.insert(1, last)
last = field[4]
print ';'.join(field)
Produces the same output:
$ python parse.py
No;PSR;Val;Rec;CSR
0;0;10;1;1200
0;1200;100;2;1300
0;0;100;3;1300
0;1300;100;4;1400
0;1400;10;5;1200
0;0;11;6;1200
Again just redirect the output to save it:
$ python parse.py > output.csv
Just code it as you explained it. Store the previous CSR and refer to it on the next loop through; just be sure to update it.
import csv
with open('input.csv', 'r') as input, open('output.csv', 'w') as output:
reader = csv.reader(input, delimiter = ';')
writer = csv.writer(output, delimiter = ';')
mylist = []
header = next(reader)
mylist.append(header)
mylist.insert(1,'PCR')
prev_csr = 0
for rec in reader:
rec.insert(1,prev_csr)
mylist.append(rec)
prev_csr = rec[4]
writer.writerows(mylist)
with open('input.csv', 'r') as input, open('output.csv', 'w') as output:
reader = csv.reader(input, delimiter = ';')
writer = csv.writer(output, delimiter = ';')
header = next(reader)
header.insert(1, 'PCR')
writer.writerow(header)
prevRow = next(reader)
prevRow.insert(1, '0')
writer.writerow(prevRow)
for row in reader:
if prevRow[-1] == row[-1]:
val = '0'
else:
val = prevRow[-1]
row.insert(1,val)
prevRow = row
writer.writerow(row)
Or, even easier using the DictReader and DictWriter capabilities of csv:
input_header = ['No','Val','Rec','CSR']
output_header = ['No','PCR','Val','Rec','CSR']
with open('input.csv', 'rb') as in_file, open('output.csv', 'wb') as out_file:
in_reader, out_writer = DictReader(in_file, input_header, delemeter =';'), DictWriter(out_file, output_header, delemeter =';')
in_reader.next() # skip the header
out_writer.writeheader() # place the output header
last_csr = None
for row in in_reader():
current_csr = row['CSR']
row['PCR'] = last_csr if current_csr != last_csr else 0
last_csr = current_csr
out_writer.writerow(row)

Categories

Resources