Compare two csv files in python and retain headers of changes - python

I'm trying to compare two CSV files in Python and output only the columns that differ, together with their headers. What I have so far writes out every column of each changed line instead of just the columns with differences:
import csv
# Load both files completely; every entry keeps its trailing line break.
with open('firstfile.csv') as first_handle:
    file1 = list(first_handle)
with open('secondfile.csv') as second_handle:
    file2 = list(second_handle)

# Write the header of the first file, then every line of the second file
# that has no character-for-character match anywhere in the first file.
with open('results.csv', 'w') as outFile:
    outFile.write(file1[0])
    outFile.writelines(line for line in file2 if line not in file1)

I think this code resolves your problem
import sys
delimiter = '\t'  # Column delimiter of both input files and of the output


def _split_row(line):
    """Split one raw line into cells.

    Only the line break is removed: a plain ``strip()`` would also eat
    leading/trailing tabs and thereby drop empty first/last cells.
    """
    return line.rstrip('\r\n').split(delimiter)


with open('file1.csv', 'r') as f1:
    file1 = f1.readlines()
with open('file2.csv', 'r') as f2:
    file2 = f2.readlines()

if not file1 or not file2:
    print('One of the files is empty, nothing to compare')
    sys.exit(-1)

headers_of_first_file = _split_row(file1[0])
headers_of_second_file = _split_row(file2[0])

# Everything below assumes both files have the same set of columns (their
# order may differ). Remove this check only if you add handling for columns
# that exist in just one of the files.
different_headers = set(headers_of_first_file).symmetric_difference(headers_of_second_file)
if different_headers:
    print('Files have difference in headers: ', different_headers)
    sys.exit(-1)

# Build map {header: set of all values found in that column of the first file}.
first_file_map = {header: set() for header in headers_of_first_file}
for row in file1[1:]:
    if not row.strip():
        continue  # blank line
    # zip() ignores surplus cells of a row that is longer than the header.
    for header, cell in zip(headers_of_first_file, _split_row(row)):
        first_file_map[header].add(cell)

# A column counts as changed when the second file contains a value that never
# occurs in the same-named column of the first file. Columns are looked up by
# header name, so a different column order in the second file is fine.
changed_headers = set()
for row in file2[1:]:
    if not row.strip():
        continue  # blank line
    for header, cell in zip(headers_of_second_file, _split_row(row)):
        if cell not in first_file_map[header]:
            changed_headers.add(header)

# Report the changed columns in the order they appear in the second file.
result = [header for header in headers_of_second_file if header in changed_headers]
with open('results.csv', 'w') as out_file:
    out_file.write(delimiter.join(result))
UPD files example:
Column1 Column2 Column3 Column5 Column4
1 2 3 5 4
10 20 30 50 40
Column1 Column2 Column3 Column4 Column5
11 2 3 4 5
10 10 30 40 50
'\t' is delimiter

import csv
def compareList(l1, l2):
    """Report whether two sequences hold equal elements in the same order.

    Args:
        l1: First sequence (must support ``len``).
        l2: Second sequence (must support ``len``).

    Returns:
        ``"Equal"`` when both sequences have the same length and every pair
        of elements at the same position compares equal, otherwise
        ``"Non equal"``.
    """
    # Check the lengths first: zip() alone would silently ignore the tail of
    # the longer sequence.
    if len(l1) == len(l2) and all(a == b for a, b in zip(l1, l2)):
        return "Equal"
    return "Non equal"
file1 = "C:/Users/Sarvesh/Downloads/a.csv"
file2 = "C:/Users/Sarvesh/Downloads/b.csv"


def _read_header(path):
    """Return the first '|'-delimited row of the file at *path*.

    Only the first record is read; an empty file yields an empty list
    instead of raising.
    """
    # newline='' lets the csv module deal with line endings itself.
    with open(path, 'r', newline='') as handle:
        return next(csv.reader(handle, delimiter='|'), [])


# Column names (first row) of each file
list_of_column_name1 = _read_header(file1)
list_of_column_name2 = _read_header(file2)

# printing the result
print("1List of column names : ", list_of_column_name1)
print("2List of column names : ", list_of_column_name2)
print("First comparison", compareList(list_of_column_name1, list_of_column_name2))

Related

Compare two CSV files and write difference in the same file as an extra column in python

Hey intelligent community,
I need a little bit of help because I think I can't see the wood for the trees.
I have two CSV files that look like this:
Name,Number
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.2
Name,Number
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.5
I would like to compare both files and then write any changes like this:
Name,Number,Changes
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.5;change: 3.5.2
So on every line where the number differs, I want to add the old value as a new column at the end of the line.
The files are formatted the same way, but sometimes one has a new row, which is why I think I have to map the keys.
I got this far, but now I'm lost in my thoughts:
Python 3.10.9
import csv

# Reading the first csv and set mapping (second column -> first column)
with open('test1.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    rows = list(reader)
    file1_dict = {row[1]: row[0] for row in rows}

# Reading the second csv and set mapping
with open('test2.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    rows = list(reader)  # from here on `rows` holds the rows of test2.csv
    file2_dict = {row[1]: row[0] for row in rows}

# comparing the keys and find the diff
for k in file1_dict:
    # NOTE(review): file2_dict[k] raises KeyError when k is not a key of the
    # second file.
    if file1_dict[k] != file2_dict[k]:
        file1_dict[k] = file2_dict[k]
        for row in rows:
            if row[1] == k:
                row.append(file2_dict[k])

# write the csv (not sure how to add the word "change:")
with open('test1.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(rows)
If I try this, I don't get a new column; it just "updates" the CSV file with the same columns.
For example, this code gives me the differing rows, but I'm not able to add them to the existing file and row.
# Walk both files in lockstep and copy to test3.csv every row of test1.csv
# that differs from the row at the same position in test2.csv.
with open('test1.csv') as left, open('test2.csv') as right:
    row_pairs = zip(csv.reader(left), csv.reader(right))
    with open('test3.csv', 'w') as fout:
        out = csv.writer(fout)
        for row1, row2 in row_pairs:
            if row1 != row2:
                out.writerow(row1)
Does someone have any tips or help for my problem? I read many answers on here but can't figure it out.
Thanks alot.
@bigkeefer
Thanks for your answer. I tried to change it to use the delimiter ";", but it gives a "list index out of range" error.
# Map first field -> second field for every data row of the first file.
with open('test3.csv', 'r') as file1:
    first_reader = csv.reader(file1, delimiter=';')
    next(first_reader, None)  # skip the header line
    file1_dict = {fields[0]: fields[1] for fields in first_reader}

# Same mapping for the second file.
with open('test4.csv', 'r') as file2:
    second_reader = csv.reader(file2, delimiter=';')
    next(second_reader, None)  # skip the header line
    file2_dict = {fields[0]: fields[1] for fields in second_reader}

new_file = ["Name;Number;Changes\n"]
with open('output.csv', 'w') as nf:
    for key, value in file1_dict.items():
        current = file2_dict[key]
        if value == current:
            line = f"{key};{value}\n"
        else:
            # keep the new value and record the old one in a third column
            line = f"{key};{current};change: {value}\n"
        new_file.append(line)
    nf.writelines(new_file)
You will need to adapt this to overwrite your first file etcetera, as you mentioned above, but I've left it like this for your testing purposes. Hopefully this will help you in some way.
I've assumed you've actually got the headers above in each file. If not, remove the slicing on the list creations, and change the new_file variable assignment to an empty list ([]).
def _read_pairs(path):
    """Return {name: number} for the ';'-delimited file at *path*, skipping its header row."""
    with open(path, 'r', newline='') as handle:
        reader = csv.reader(handle, delimiter=";")
        next(reader, None)  # drop the header line
        # Rows with fewer than two fields (e.g. blank lines) cannot be mapped
        # and are ignored instead of raising IndexError.
        return {row[0]: row[1] for row in reader if len(row) >= 2}


file1_dict = _read_pairs('f1.csv')
file2_dict = _read_pairs('f2.csv')

new_file = ["Name,Number,Changes\n"]
for key, value in file1_dict.items():
    # A name missing from the second file is treated as unchanged rather
    # than raising KeyError.
    new_value = file2_dict.get(key, value)
    if value != new_value:
        new_file.append(f"{key};{new_value};change: {value}\n")
    else:
        new_file.append(f"{key};{value}\n")

# Names that exist only in the second file are new rows: keep them as they are.
for key, value in file2_dict.items():
    if key not in file1_dict:
        new_file.append(f"{key};{value}\n")

with open('new.csv', 'w') as nf:
    nf.writelines(new_file)

How to delete rows of tsv

1 7 c
5 2 q
4 5 a
5 0 c
for i,line in enumerate(read_tsv):
first = read_tsv[i][0]
second = read_tsv[i][1]
letter = read_tsv[i][2]
if i == 2:
I have a TSV file and I'd like to delete the rows whose third value is not c, so that it looks like the example below. So far I know how to separate the values; I just don't know how to delete a row based on its third tab-separated value.
1 7 c
5 0 c
You can open the file, read it while filtering out the unwanted rows, then reopen it in write mode and write the remaining rows back:
import csv
# Keep only the rows whose third tab-separated value is exactly 'c'.
# newline='' is what the csv module expects for files it reads or writes.
with open('filename.tsv', 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    # Rows with fewer than three fields (e.g. blank lines) have no third
    # value, so they are dropped as well instead of raising IndexError.
    data = [row for row in reader if len(row) > 2 and row[2] == 'c']

# Opening in 'w' mode truncates the file, so it is reopened only after all
# rows to keep have been collected in memory.
with open('filename.tsv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows(data)

Check for unique elements of csv

I would like to check for duplicates in a .csv (structure below). Every value in this .csv has to be unique! You can find "a" three times, but it should be there only once.
###start
a
a;b;
d;e
f;g
h
i;
i
d;b
a
c;i
### end
The progress so far:
import os,glob
import csv
folder_path = "csv_entities/"
found_rows = set()  # rows (as tuples) that were already written

csv_pattern = os.path.join(folder_path, "*.csv")
for filepath in glob.glob(csv_pattern):
    with open(filepath) as fin, open("newfile.csv", "w") as fout:
        writer = csv.writer(fout, delimiter=";")
        for fields in csv.reader(fin, delimiter=";"):
            # a row containing an empty cell loses its last element
            if "" in fields:
                fields = fields[:-1]
            key = tuple(fields)  # tuples are hashable, lists are not
            # skip empty rows and rows that were written before
            if not key or key in found_rows:
                continue
            print(key)
            writer.writerow(key)
            found_rows.add(key)
Which results in this csv:
###start
a
a;b
d;e
f;g
h
i
d;b
c;i
###end
The most important question is right now: How can I get rid of the double values?
e.g in the second row there should be only "b" instead of "a;b", because "a" is already in the row before.
Your mistake is to treat whole rows as the unique elements. You have to treat the individual cells as the elements.
So use your marker set to record cells, not rows.
Example with only one input file (using several input files with only one output file makes no sense):
found_values = set()  # every non-empty cell value written so far
with open("input.csv", newline="") as fin, open("newfile.csv", "w", newline="") as fout:
    reader = csv.reader(fin, delimiter=";")
    writer = csv.writer(fout, delimiter=";")
    for row in reader:
        new_row = []
        for cell in row:
            # Drop empty cells and values already seen. Marking each value
            # as soon as it is kept also removes repeats inside one row.
            if cell and cell not in found_values:
                found_values.add(cell)
                new_row.append(cell)
        if new_row:
            # new row isn't empty: write it
            writer.writerow(new_row)
the resulting csv file is:
a
b
d;e
f;g
h
i
c

Find matches in two csv files

I have two csv files: the first one has one column called ID and 5 rows, and the second one has 12 columns (column 5 is called ID) and 100 rows. I am trying to find the matching IDs and write the entire matching row to a new csv file.
Thank you for your help!
here is my code:
import csv
input_file1 = "/Desktop/New1/file1.csv"
input_file2 = "/Desktop/New1/file2.csv"
output_file = "/Desktop/New1/results.csv"

with open(input_file1) as t1, open(input_file2) as t2:
    fileone = csv.reader(t1)
    filetwo = csv.reader(t2)
    with open(output_file, 'w') as output_res:
        for record in filetwo:
            # `in` iterates the reader itself, so each test consumes rows of
            # file1 up to the first match.
            if record not in fileone:
                continue
            # `record` is the list of cells produced by csv.reader
            output_res.write(record)
You can read the IDs in file1 into a set for more efficient lookup. You should also use csv.writer to output the rows as CSV:
# newline='' is what the csv module expects for files it reads or writes.
with open(input_file1, newline='') as t1, open(input_file2, newline='') as t2:
    # First cell of every non-empty row of file1; a set gives constant-time
    # lookups. The header cell of file1 lands in the set too, so the header
    # row of file2 is copied when its fifth cell holds the same text.
    ids = {row[0] for row in csv.reader(t1) if row}
    filetwo = csv.reader(t2)
    with open(output_file, 'w', newline='') as output_res:
        writer = csv.writer(output_res)
        for row in filetwo:
            # The ID is the fifth column (index 4); rows that are too short
            # cannot match and are skipped instead of raising IndexError.
            if len(row) > 4 and row[4] in ids:
                writer.writerow(row)

How to count number of columns in each row?

Each row has a different number of columns, but column A is always the file name and the remaining columns are fields of that file.
Is there any way I could count the number of columns in each row?
import csv
file=('C:/)
with open('C:/Count.csv','w',encoding='cp949',newline='') as testfile:
csv_writer=csv.writer(testfile)
for line in file:
lst=[len(line)]
csv_writer.writerow(lst)
You can either choose to split on commas or open the file with csv.
I'd recommend the latter. Here's how you can do that:
file1 = ...  # file to read
file2 = ...  # file to write
with open(file1, 'r') as f1, open(file2, 'w', encoding='cp949', newline='') as f2:
    csv_writer = csv.writer(f2)
    for fields in csv.reader(f1):
        # non-null counts only: empty cells are not counted
        filled = sum(1 for value in fields if value)
        csv_writer.writerow([filled])
Open both files simultaneously, iterate over the rows of the file to read, count the non-empty cells of each row (use len(row) instead if empty cells should count as columns too) and then write the count out.

Categories

Resources