file handling python - vlookup

file handling python - vlookup - python

source.csv as follows.
AB;CD
a;1;
b;2;
c;3;
target.csv as follows.
DE;FG;HI
1;e;1;
2;a;2;
3;f;3;
I need to do a vlookup using file handling mechanisms in python.
So need to update column 'FG' of 'target.csv' by looking up the column 'AB' of 'source.csv' and update with 'CD' column value of 'source.csv'.
So my desired output is like below.
DE;FG;HI
1;e;1;
2;1;2; #a is replaced with 1
3;f;3;
Without using pandas or any other method how I can approach this.
Below is how I approached this.
with open('D:/target.csv', "w+", encoding="utf-8") as Tgt_csvFile:
with open('D:/source.csv', "r", encoding="utf-8") as Src_csvFile:
for line in Src_csvFile:
fields = line.split(";")
x = fields[0]
for line_1 in Tgt_csvFile:
fields_1 = line_1.split(";")
y = fields[1]
if y == x:
update # not sure how to do this
else:
keep as it is
Appreciate on the support

This will solve your particular problem, but if the number of input/output columns changes you will need to adjust the logic accordingly.
It's also worth noting the trailing ; on each non-header row of your csv file will cause most packages to assume there is an extra column. I don't think you want that.
# Read in input, creating a dict where key is column 1 and value is column 2
with open('source.csv', mode='r') as infile:
reader = csv.reader(infile, delimiter=';')
s = {x[0]:x[1] for x in reader}
print(s)
# If column 2 is a key in dict s update with value from dict
output = []
with open('target.csv', mode='r') as infile:
reader = csv.reader(infile, delimiter=';')
for row in reader:
if row[1] in s.keys():
row[1] = s[row[1]]
output.append(row)
# Output to csv
with open('output.csv', mode='w', newline='') as outfile:
writer = csv.writer(outfile, delimiter=';')
writer.writerows(output)

Here is my suggestion:
with open('D:/source.csv', "r", encoding="utf-8") as Src_csvFile:
l=Src_csvFile.readlines()
d={}
for i in l[1:]:
x=i.split(';')
d[x[0]]=x[1]
with open('D:/target.csv', "r", encoding="utf-8") as Tgt_csvFile:
m=Tgt_csvFile.readlines()
for i in range(1,len(m)):
x=m[i].split(';')
if x[1] in d:
x[1]=d.get(x[1])
m[i]=';'.join(x)
with open('D:/target.csv', "w", encoding="utf-8") as Tgt_csvFile:
Tgt_csvFile.writelines(m)
Output:
DE;FG;HI
1;e;1;
2;1;2;
3;f;3;

Related

Compare two CSV files and write difference in the same file as an extra column in python

Hey intelligent community,
I need a little bit of help because i think i don't see the the wood in the trees.
i have to CSV files that look like this:
Name,Number
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.2
Name,Number
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.5
I would like to compare both files and than write any changes like this:
Name,Number,Changes
AAC;2.2.3
AAF;2.4.4
ZCX;5.5.5;change: 3.5.2
So on every line when there is a difference in the number, i want to add this as a new column at the end of the line.
The Files are formated the same but sometimes have a new row so thats why i think i have to map the keys.
I come this far but now iam lost in my thoughts:
Python 3.10.9
import csv
Reading the first csv and set mapping
with open('test1.csv', 'r') as csvfile:
reader= csv.reader(csvfile)
rows = list(reader)
file1_dict = {row[1]: row[0] for row in rows}
Reading the second csv and set mapping
with open('test2.csv', 'r') as csvfile:
reader= csv.reader(csvfile)
rows = list(reader)
file2_dict = {row[1]: row[0] for row in rows}
comparing the keys and find the diff
for k in test1_dict:
if test1_dict[k] != test2:dict[k]
test1_dict[k] = test2_dict[k]
for row in rows:
if row[1] == k:
row.append(test2_dict[k])
#write the csv (not sure how to add the word "change:")
with open('test1.csv', 'w', newline ='') as csvfile:
writer = csv.writer(csvfile)
writer.writerows(rows)
If i try this, i don't get a new column, it just "updates" the csv file with the same columns.
For example this code gives me the diff row but i'am not able to just add it to existing file and row.
with open('test1.csv') as fin1:
with open('test2.csv') as fin2:
read1 = csv.reader(fin1)
read2 = csv.reader(fin2)
diff_rows = (row1 for row1, row2 in zip(read1, read2) if row1 != row2)
with open('test3.csv', 'w') as fout:
writer = csv.writer(fout)
writer.writerows(diff_rows)
Does someone have any tips or help for my problem? I read many answers on here but can't figure it out.
Thanks alot.
#bigkeefer
Thanks for your answer, i tried to change it for the delimiter ; but it gives an "list index out of range error".
with open('test3.csv', 'r') as file1:
reader = csv.reader(file1, delimiter=';')
rows = list(reader)[1:]
file1_dict = {row[0]: row[1] for row in rows}
with open('test4.csv', 'r') as file2:
reader = csv.reader(file2, delimiter=';')
rows = list(reader)[1:]
file2_dict = {row[0]: row[1] for row in rows}
new_file = ["Name;Number;Changes\n"]
with open('output.csv', 'w') as nf:
for key, value in file1_dict.items():
if value != file2_dict[key]:
new_file.append(f"{key};{file2_dict[key]};change: {value}\n")
else:
new_file.append(f"{key};{value}\n")
nf.writelines(new_file)

You will need to adapt this to overwrite your first file etcetera, as you mentioned above, but I've left it like this for your testing purposes. Hopefully this will help you in some way.
I've assumed you've actually got the headers above in each file. If not, remove the slicing on the list creations, and change the new_file variable assignment to an empty list ([]).
with open('f1.csv', 'r') as file1:
reader = csv.reader(file1, delimiter=";")
rows = list(reader)[1:]
file1_dict = {row[0]: row[1] for row in rows if row}
with open('f2.csv', 'r') as file2:
reader = csv.reader(file2, delimiter=";")
rows = list(reader)[1:]
file2_dict = {row[0]: row[1] for row in rows if row}
new_file = ["Name,Number,Changes\n"]
for key, value in file1_dict.items():
if value != file2_dict[key]:
new_file.append(f"{key};{file2_dict[key]};change: {value}\n")
else:
new_file.append(f"{key};{value}\n")
with open('new.csv', 'w') as nf:
nf.writelines(new_file)

Beginner deleting columns from CSV (no pandas)

I've just started coding, I'm trying to remove certain columns from a CSV for a project, we aren't supposed to use pandas. For instance, one of the fields I have to delete is called DwTm, but there's about 15 columns I have to get rid of; I only want the first few, Here's what I've gotten:
import csv
FTemp = "D:/tempfile.csv"
FOut = "D:/NewFile.csv"
with open(FTemp, 'r') as csv_file:
csv_reader = csv.reader(csv_file)
with open(FOut, 'w') as new_file:
fieldnames = ['Stn_Name', 'Lat', 'Long', 'Prov', 'Tm']
csv_writer = csv.DictWriter(new_file, fieldnames=fieldnames)
for line in csv_reader:
del line['DwTm']
csv_writer.writerow(line)
When I run this, I get the error
del line['DwTm']
TypeError: list indices must be integers or slices, not str
This is the only method I've found to almost work without using pandas. Any ideas?

The easiest way around this is to use a DictReader to read the file. Like DictWriter, which you are using to write the file, DictReader uses dictionaries for rows, so your approach of deleting keys from the old row then writing to the new file will work as you expect.
import csv
FTemp = "D:/tempfile.csv"
FOut = "D:/NewFile.csv"
with open(FTemp, 'r') as csv_file:
# Adjust the list to be have the correct order
old_fieldnames = ['Stn_Name', 'Lat', 'Long', 'Prov', 'Tm', 'DwTm']
csv_reader = csv.DictReader(csv_file, fieldnames=old_fieldnames)
with open(FOut, 'w') as new_file:
fieldnames = ['Stn_Name', 'Lat', 'Long', 'Prov', 'Tm']
csv_writer = csv.DictWriter(new_file, fieldnames=fieldnames)
for line in csv_reader:
del line['DwTm']
csv_writer.writerow(line)

Below
import csv
# We only want to read the 'department' field
# We are not interested in 'name' and 'birthday month'
# Make sure the list items are in ascending order
NON_INTERESTING_FIELDS_IDX = [2,0]
rows = []
with open('example.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
for row in csv_reader:
for idx in NON_INTERESTING_FIELDS_IDX:
del row[idx]
rows.append(','.join(row))
with open('example_out.csv','w') as out:
for row in rows:
out.write(row + '\n')
example.csv
name,department,birthday month
John Smith,Accounting,November
Erica Meyers,IT,March
example_out.csv
department
Accounting
IT

It's possible to simultaneously open the file to read from and the file to write to. Let's say you know the indices of the columns you want to keep, say, 0,2, and 4:
good_cols = (0,2,4)
with open(Ftemp, 'r') as fin, open(Fout, 'w') as fout:
for line in fin:
line = line.rstrip() #clean up newlines
temp = line.split(',') #make a list from the line
data = [temp[x] for x in range(len(temp)) if x in good_cols]
fout.write(','.join(data) + '\n')
The list comprehension (data) pulls only the columns you want to keep out of each row and immediately writes line-by-line to your new file, using the join method (plus tacking on an endline for each new row).
If you only know the names of the fields you want to keep/remove it's a bit more involved, you have to extract the indices from the first line of the csv file, but it's not much more difficult.

Add a new column to a csv file in python

I am trying to add a column to a csv file that combines strings from two other columns. Whenever I try this I either get an output csv with only the new column or an output with all of the original data and not the new column.
This is what I have so far:
with open(filename) as csvin:
readfile = csv.reader(csvin, delimiter=',')
with open(output, 'w') as csvout:
writefile = csv.writer(csvout, delimiter=',', lineterminator='\n')
for row in readfile:
result = [str(row[10]) + ' ' + str(row[11])]
writefile.writerow(result)
Any help would be appreciated.

No input to test, but try this. Your current approach doesn't include the existing data for each row that already exists in your input data. extend will take the list that represents each row and then add another item to that list... equivalent to adding a column.
import csv
with open(filename) as csvin:
readfile = csv.reader(csvin, delimiter=',')
with open(output, 'w') as csvout:
writefile = csv.writer(csvout, delimiter=',', lineterminator='\n')
for row in readfile:
row.extend([str(row[10]) + ' ' + str(row[11])])
writefile.writerow(row)

I assume that glayne wants to combine column 10 and 11 into one.
In my approach, I concentrate on how to transform a single row first:
def transform_row(input_row):
output_row = input_row[:]
output_row[10:12] = [' '.join(output_row[10:12])]
return output_row
Once tested to make sure that it works, I can move on to replace all rows:
with open('data.csv') as inf, open('out.csv', 'wb') as outf:
reader = csv.reader(inf)
writer = csv.writer(outf)
writer.writerows(transform_row(row) for row in reader)
Note that I use the writerows() method to write multiple rows in one statement.

Below code snippet combines strings in column 10 and column 11 in each row and add that to the end of the each row
import csv
input = 'test.csv'
output= 'output.csv'
with open(input, 'rb') as csvin:
readfile = csv.reader(csvin, delimiter=',')
with open(output, 'wb') as csvout:
writefile = csv.writer(csvout, delimiter=',', lineterminator='\n')
for row in readfile:
result = row + [row[10]+row[11]]
writefile.writerow(result)

Merge CSVs in Python with different columns

I have hundreds of large CSV files that I would like to merge into one. However, not all CSV files contain all columns. Therefore, I need to merge files based on column name, not column position.
Just to be clear: in the merged CSV, values should be empty for a cell coming from a line which did not have the column of that cell.
I cannot use the pandas module, because it makes me run out of memory.
Is there a module that can do that, or some easy code?

The csv.DictReader and csv.DictWriter classes should work well (see Python docs). Something like this:
import csv
inputs = ["in1.csv", "in2.csv"] # etc
# First determine the field names from the top line of each input file
# Comment 1 below
fieldnames = []
for filename in inputs:
with open(filename, "r", newline="") as f_in:
reader = csv.reader(f_in)
headers = next(reader)
for h in headers:
if h not in fieldnames:
fieldnames.append(h)
# Then copy the data
with open("out.csv", "w", newline="") as f_out: # Comment 2 below
writer = csv.DictWriter(f_out, fieldnames=fieldnames)
for filename in inputs:
with open(filename, "r", newline="") as f_in:
reader = csv.DictReader(f_in) # Uses the field names in this file
for line in reader:
# Comment 3 below
writer.writerow(line)
Comments from above:
You need to specify all the possible field names in advance to DictWriter, so you need to loop through all your CSV files twice: once to find all the headers, and once to read the data. There is no better solution, because all the headers need to be known before DictWriter can write the first line. This part would be more efficient using sets instead of lists (the in operator on a list is comparatively slow), but it won't make much difference for a few hundred headers. Sets would also lose the deterministic ordering of a list - your columns would come out in a different order each time you ran the code.
The above code is for Python 3, where weird things happen in the CSV module without newline="". Remove this for Python 2.
At this point, line is a dict with the field names as keys, and the column data as values. You can specify what to do with blank or unknown values in the DictReader and DictWriter constructors.
This method should not run out of memory, because it never has the whole file loaded at once.

For those of us using 2.7, this adds an extra linefeed between records in "out.csv". To resolve this, just change the file mode from "w" to "wb".

The solution by #Aaron Lockey, which is the accepted answer has worked well for me except, there were no headers for the file. The out put had no headers and only the row data. Each column was without headings (keys). So I inserted following:
writer.writeheader()
and it worked perfectly fine for me! So now the entire code appears like this:
import csv
inputs = ["in1.csv", "in2.csv"] # etc
# First determine the field names from the top line of each input file
fieldnames = []
for filename in inputs:
with open(filename, "r", newline="") as f_in:
reader = csv.reader(f_in)
headers = next(reader)
for h in headers:
if h not in fieldnames:
fieldnames.append(h)
# Then copy the data
with open("out.csv", "w", newline="") as f_out:
writer = csv.DictWriter(f_out, fieldnames=fieldnames)
writer.writeheader() #this is the addition.
for filename in inputs:
with open(filename, "r", newline="") as f_in:
reader = csv.DictReader(f_in) # Uses the field names in this file
for line in reader:
writer.writerow(line)

You can use the pandas module to do this pretty easily. This snippet assumes all your csv files are in the current folder.
import pandas as pd
import os
all_csv = [file_name for file_name in os.listdir(os.getcwd()) if '.csv' in file_name]
li = []
for filename in all_csv:
df = pd.read_csv(filename, index_col=None, header=0, parse_dates=True, infer_datetime_format=True)
li.append(df)
frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv('melted_csv.csv', index=False)

I've faced a situation where not only the number of columns are different, but also some column names are missing. For this kind of situation and obviously for your case, this code snippet can help you :)
The tricky part is naming the columns which have no names and adding them to dictionary. The read_csv_file function is playing the main role here.
def read_csv_file(csv_file_path):
headers = []
data = []
with open(csv_file_path, 'r') as f:
csv_reader = csv.reader(f)
rows = []
for i, row in enumerate(csv_reader):
if i == 0:
for j in range(len(row)):
if row[j].strip() == "":
col_name = f"col-{j+1}"
else:
col_name = row[j]
if col_name not in headers:
headers.append(col_name)
else:
rows.append(row)
if len(row) > len(headers):
for j in range(len(row)):
if j+1 > len(headers):
col_name = f"col-{j+1}"
if col_name not in headers:
headers.append(col_name)
for i, row in enumerate(rows):
row_data = {}
for j in range(len(headers)):
if len(row) > j:
row_data[headers[j]] = row[j]
else:
row_data[headers[j]] = ''
data.append(row_data)
return headers, data
def write_csv_file(file_path, rows):
if len(rows) > 0:
headers = list(rows[0].keys())
with open(file_path, 'w', newline='', encoding='UTF8') as f:
writer = csv.DictWriter(f, fieldnames=headers)
writer.writeheader()
writer.writerows(rows)
# The list of the csv file paths which will be merged
files_to_be_merged = [
'file-1.csv',
'file-2.csv',
'file-3.csv'
]
# Read and store all the file data in new_file_data
final_headers = []
new_file_data = []
for f1 in files_to_be_merged:
single_file_data = read_csv_file(f1)
for h in single_file_data[0]:
if h not in final_headers:
final_headers.append(h)
new_file_data += single_file_data[1]
# Add the missing keys to the dictionaries
for d in new_file_data:
for h in final_headers:
if d.get(h) is None:
d[h] = ""
# Write a new file
target_file_name = 'merged_file.csv'
write_csv_file(target_file_name, new_file_data)

How do create new column in csv file using python by shifting one row

I have CSV file like below. It is huge file with thousands of records.
input.csv
No;Val;Rec;CSR
0;10;1;1200
0;100;2;1300
0;100;3;1300
0;100;4;1400
0;10;5;1200
0;11;6;1200
I want to create output.csv file by adding new column "PSR" after 1st column "No". This column value depends on column "PSR" Value. For 1st row, "PSR" shall be zero. From next record on-wards, it depends on "CSR" value in previous row. If present and previous record CSR value is same, then "PSR" shall be zero. If not, PSR value shall have the previous CSR value. For exmple, Value of CSR in 2nd row is 1300 which is different to the value in 1st record ( it is 1200). So PSR value for 2nd row shall be 1200. Where in 2nd and 3rd row, CSR value is same. So PSR value for 3rd row shall be zero. So new value PSR depends on CSR value in present and previous field.
Output.csv
No;PCR;Val;Rec;CSR
0;0;10;1;1200
0;1200;100;2;1300
0;0;100;3;1300
0;1300;100;4;1400
0;1400;10;5;1200
0;0;11;6;1200
My Approach:
Use csv.reader and iterate over the objects in a list. Copy 5th column to 2nd column in list. Shift it one row down.
Then check the values in 2nd and 5th column (PCR and CSR), if both values are same. Replace the PCR value with zero.
I have problem in getting 1st step coded. I am able to duplicate the column but not able to shift it. Also 2nd step is quite straightforward.
Also, I am not sure whether this approach is correct Any pointers/recommendation would be really helpful.
Note: I am not able to install Pandas on CentOS. So help without this module would be better.
My Code:
with open('input.csv', 'r') as input, open('output.csv', 'w') as output:
reader = csv.reader(input, delimiter = ';')
writer = csv.writer(output, delimiter = ';')
mylist = []
header = next(reader)
mylist.append(header)
for rec in reader:
mylist.append(rec)
rec.insert(1, rec[3])
mylist.append(rec)
writer.writerows(mylist)

If your open to non-python solutions then awk could be a good option:
awk 'NR==1{$2="PSR;"$2}NR>1{$2=($4==a?0";"$2:+a";"$2);a=$4}1' FS=';' OFS=';' file
No;PSR;Val;Rec;CSR
0;0;10;1;1200
0;1200;100;2;1300
0;0;100;3;1300
0;1300;100;4;1400
0;1400;10;5;1200
0;0;11;6;1200
Awk is distributed with pretty much all Linux distributions and was designed exactly for this kind of task. It will blaze through your file. Add a redirection to the end > output.csv to save the output in a file.
A simple python approach using the same logic:
#!/usr/bin/env python
last = "0"
with open('input.csv') as csv:
print next(csv).strip().replace(';', ';PSR;', 1)
for line in csv:
field = line.strip().split(';')
if field[3] == last: field.insert(1, "0")
else: field.insert(1, last)
last = field[4]
print ';'.join(field)
Produces the same output:
$ python parse.py
No;PSR;Val;Rec;CSR
0;0;10;1;1200
0;1200;100;2;1300
0;0;100;3;1300
0;1300;100;4;1400
0;1400;10;5;1200
0;0;11;6;1200
Again just redirect the output to save it:
$ python parse.py > output.csv

Just code it as you explained it. Store the previous CSR and refer to it on the next loop through; just be sure to update it.
import csv
with open('input.csv', 'r') as input, open('output.csv', 'w') as output:
reader = csv.reader(input, delimiter = ';')
writer = csv.writer(output, delimiter = ';')
mylist = []
header = next(reader)
mylist.append(header)
mylist.insert(1,'PCR')
prev_csr = 0
for rec in reader:
rec.insert(1,prev_csr)
mylist.append(rec)
prev_csr = rec[4]
writer.writerows(mylist)

with open('input.csv', 'r') as input, open('output.csv', 'w') as output:
reader = csv.reader(input, delimiter = ';')
writer = csv.writer(output, delimiter = ';')
header = next(reader)
header.insert(1, 'PCR')
writer.writerow(header)
prevRow = next(reader)
prevRow.insert(1, '0')
writer.writerow(prevRow)
for row in reader:
if prevRow[-1] == row[-1]:
val = '0'
else:
val = prevRow[-1]
row.insert(1,val)
prevRow = row
writer.writerow(row)

Or, even easier using the DictReader and DictWriter capabilities of csv:
input_header = ['No','Val','Rec','CSR']
output_header = ['No','PCR','Val','Rec','CSR']
with open('input.csv', 'rb') as in_file, open('output.csv', 'wb') as out_file:
in_reader, out_writer = DictReader(in_file, input_header, delemeter =';'), DictWriter(out_file, output_header, delemeter =';')
in_reader.next() # skip the header
out_writer.writeheader() # place the output header
last_csr = None
for row in in_reader():
current_csr = row['CSR']
row['PCR'] = last_csr if current_csr != last_csr else 0
last_csr = current_csr
out_writer.writerow(row)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

file handling python - vlookup - python

Related

Compare two CSV files and write difference in the same file as an extra column in python

Beginner deleting columns from CSV (no pandas)

Add a new column to a csv file in python

Merge CSVs in Python with different columns

How do create new column in csv file using python by shifting one row

Categories

Resources