Pandas - How to compare 2 CSV files and output changes - python

Situation
I have 2 CSVs that are 10k rows by 140 columns that are largely identical and need to identify the differences. The headers are the exact same and the rows are almost the same (100 of 10K might have changed).
Example
File1.csv
ID,FirstName,LastName,Phone1,Phone2,Phone3
1,Bob,Jones,5555555555,4444444444,3333333333
2,Jim,Hill,2222222222,1111111111,0000000000
File2.csv
ID,FirstName,LastName,Phone1,,Phone2,,Phone3
1,Bob, Jones,5555555555,4444455444,3333333333
2,Jim, Hill,2222222222,1155111111,0005500000
3,Kim, Grant,2173659851,3214569874,3698521471
Outputfile.csv
ID,FirstName,LastName,Phone1,Phone2,Phone3
1,Bob,Jones,5555555555,4444444444,3333333333
2,Jim,Hill,2222222222,1111111111,0005500000
3,Kim, Grant,2173659851,3214569874,3698521471
I think I want the output to be File2.csv with changes from File1.csv highlighted somehow. I'm new to python and pandas and can't seem to figure out where to start. I did my best to search google for something similar to adapt to my needs but the scripts appeared to be to specific to the situation.
If someone knows of an easier/different way, I'm all ears. I don't care how this happens as long as I don't have to check record-by-record.

CSV generally doesn't support different fonts, but here's a solution that uses bold and colors output to the console (note: I only tested on Mac). If you're using Python 3.7+ (dictionaries sorted by insertion order), then the dictionary ordering and columns list shouldn't be necessary.
from collections import OrderedDict
from csv import DictReader
class Color(object):
GREEN = '\033[92m'
RED = '\033[91m'
BOLD = '\033[1m'
END = '\033[0m'
def load_csv(file):
# Index by ID in order, and keep track of the original column order
with open(file, 'r') as fp:
reader = DictReader(fp, delimiter=',')
rows = OrderedDict((r['ID'], r) for r in reader)
return rows, reader.fieldnames
def print_row(row, cols, color, prefix):
print(Color.BOLD + color + prefix + ','.join(row[c] for c in cols) + Color.END)
def print_diff(row1, row2, cols):
row = []
for col in cols:
value1 = row1[col]
if row2[col] != value1:
row.append(Color.BOLD + Color.GREEN + value1 + Color.END)
else:
row.append(value1)
print(','.join(row))
def diff_csv(file1, file2):
rows1, cols = load_csv(file1)
rows2, _ = load_csv(file2)
for row_id, row1 in rows1.items():
# Pop the matching ID row
row2 = rows2.pop(row_id, None)
# If not in file2, then it was added
if not row2:
print_row(row1, cols, Color.GREEN, '+')
# In both files, print the diff
else:
print_diff(row1, row2, cols)
# Anything remaining from file2 was removed in file1
for row in rows2.values():
print_row(row, cols, Color.RED, '-')

This can be done simply by using python's built in CSV library. If you also care about the order of your entries, you can use an OrderedDict to maintain the original file order.
import csv
f = []
f3 = file('results.csv', 'w')
with open('file1.csv', 'rb') as f1, open('file2.csv', 'rb') as f2:
reader1 = csv.reader(f1, delimiter=",")
reader2 = csv.reader(f2, delimiter=",")
for line in enumerate(reader1):
f.append(line) #For the first file, add them all
for line in enumerate(reader2):
if not any(e[0] == line[0] for e in f): #For the second file, only add them if there is not an entry with the same name already
f.append(line)
for e in f:
if e[0] == line[0]:
changedindexes = i != j for i, j in zip(e[0], line[0])
for val in changedindexes:
e[val] = e[val] + 'c'
c3 = csv.writer(f3, , quoting=csv.QUOTE_ALL)
for line in f: #Write the new merged files into another csv
c3.writerow(line)
#Then find the differences between the two orderedDicts
As for bolding, there is no way to do that in CSV, as csv files contain data, not any formatting information.

A second way:
# get indices of differences
difference_locations = np.where(df1 != df2)
#define reference
changed_from = df1.values[difference_locations]
changed_to = df2.values[difference_locations]
df_differences = pd.DataFrame({'from': changed_from, 'to': changed_to}, index=changed.index)

Related

How do i write csv basis on comparing two csv file[column based]

I have two csv files:
csv1
csv2
(*note headers can be differ)
csv1 has 1 single column an csv2 has 5 columns
now column 1 of csv1 has some matching values in column2 of csv2
my concern is how can i write a csv where column1 of csv1 does not have a MATCHING VALUES to column2 of csv2
I have attached three files csv1, csv2 and expected output..
Expected Output:
ProfileID,id,name,class ,rollnumber
1,lkha,prince,sfasd,DAS
2,hgfhfk,kabir,AD,AD
5,jlh,antriskh,ASDA,AD
CSV 1:
id,name
10927,prince
109582,kabir
f546416,rahul
g44674,saini
r7341,antriskh
CSV 2:
ProfileID,id,name,class ,rollnumber
1,lkha,prince,sfasd,DAS
2,hgfhfk,kabir,AD,AD
3,f546416,rahul,AD,FF
44,g44674,saini,DD,FF
5,jlh,antriskh,ASDA,AD
I tried using converting them into dictionary and match them csv1 keys to csv2 values but it is not working as expected
def read_csv1(filename):
prj_structure = {}
f = open(filename, "r")
data = f.read()
f.close()
lst = data.split("\n")
prj = ""
for i in range(0, len(lst)):
val = lst[i].split(",")
if len(val)>0:
prj = val[0]
if prj!="":
if prj not in prj_structure.keys():
prj_structure[prj] = []
prj_structure[prj].append([val[1], val[2], val[3], val[4])
return prj_structure
def read_csv2(filename):
prj_structure = {}
f = open(filename, "r")
data = f.read()
f.close()
lst = data.split("\n")
prj = ""
for i in range(0, len(lst)):
val = lst[i].split(",")
if len(val)>0:
prj = val[0]
if prj!="":
if prj not in prj_structure.keys():
prj_structure[prj] = []
prj_structure[prj].append([val[0])
return prj_structure
csv1_data = read_csv1("csv1.csv")
csv2_data = read_csv2("csv2.csv")
for k, v in csv1_data.items():
for ks, vs in csv2_data.items():
if k==vs[0][0]:
#here it is not working
sublist = []
sublist.append(k)
Use the DictReader from the csv package.
import csv
f1 = open('csv1.csv')
csv_1 = csv.DictReader(f1)
f2 = open('csv2.csv')
csv_2 = csv.DictReader(f2)
first_dict = {}
for row in csv_1:
first_dict[row['name']]=row
f1.close()
f_out = open('output.csv','w')
csv_out = csv.DictWriter(f_out,csv_2.fieldnames)
csv_out.writeheader()
for second_row in csv_2:
if second_row['name'] in first_dict:
first_row = first_dict[second_row['name']]
if first_row['id']!=second_row['id']:
csv_out.writerow(second_row)
f2.close()
f_out.close()
If you have the option, I have always found pandas as a great tool to import and manipulate CSV files.
import pandas as pd
# Read in both the CSV files
df_1 = pd.DataFrame(pd.read_csv('csv1.csv'))
df_2 = pd.DataFrame(pd.read_csv('csv2.csv'))
# Iterate over both DataFrames and if any id's from in df_2 match
# df_1, remove them from df_2
for num1, row1 in df_1.iterrows():
for num2, row2 in df_2.iterrows():
if row1['id'] == row2['id']:
df_2.drop(num2, inplace=True)
df_2.head()
For any kind of csv processing, using the builtin csv module makes most of the error prone processing trivial. Given your example values, the following code should produce the desired results. I use comprehensions to do the filtering.
import csv
import io
# example data, as StringIO that will behave like file objects
raw_csv_1 = io.StringIO('''\
id,name
10927,prince
109582,kabir
f546416,rahul
g44674,saini
r7341,antriskh''')
raw_csv_2 = io.StringIO('''\
ProfileID,id,name,class,rollnumber
1,lkha,prince,sfasd,DAS
2,hgfhfk,kabir,AD,AD
3,f546416,rahul,AD,FF
44,g44674,saini,DD,FF
5,jlh,antriskh,ASDA,AD''')
# in your actual data, you would use actual file objects instead, like
# with open('location/of/your/csv_1') as file_1:
# raw_csv_1 = file_1.read()
# with open('location/of/your/csv_2') as file_2:
# raw_csv_2 = file_2.read()
Then we need to transform then into csv.reader objects:
csv_1 = csv.reader(raw_csv_1)
next(csv_1) # consume once to skip the header
csv_2 = csv.reader(raw_csv_2)
header = next(csv_2) # consume once to skip the header, but store it
Last but not least, collect the names of the first csv in a set to use them as lookup table, filter the second csv with it, and write it back as 'result.csv' into your file system.
skip_keys = {id_ for id_, name in vals_1}
result = [row for row in vals_2 if row[1] not in skip_keys]
# at this point, result contains
# [['1', 'lkha', 'prince', 'sfasd', 'DAS'],
# ['2', 'hgfhfk', 'kabir', 'AD', 'AD'],
# ['5', 'jlh', 'antriskh', 'ASDA', 'AD']]
with open('result.csv', 'w') as result_file:
csv.writer(result_file).writerows(header+result)

Split a row into multiple cells and keep the maximum value of second value for each gene

I am new to Python and I prepared a script that will modify the following csv file
accordingly:
1) Each row that contains multiple Gene entries separated by the /// such as:
C16orf52 /// LOC102725138 1.00551
should be transformed to:
C16orf52 1.00551
LOC102725138 1.00551
2) The same gene may have different ratio values
AASDHPPT 0.860705
AASDHPPT 0.983691
and we want to keep only the pair with the highest ratio value (delete the pair AASDHPPT 0.860705)
Here is the script I wrote but it does not assign the correct ratio values to the genes:
import csv
import pandas as pd
with open('2column.csv','rb') as f:
reader = csv.reader(f)
a = list(reader)
gene = []
ratio = []
for t in range(len(a)):
if '///' in a[t][0]:
s = a[t][0].split('///')
gene.append(s[0])
gene.append(s[1])
ratio.append(a[t][1])
ratio.append(a[t][1])
else:
gene.append(a[t][0])
ratio.append(a[t][1])
gene[t] = gene[t].strip()
newgene = []
newratio = []
for i in range(len(gene)):
g = gene[i]
r = ratio[i]
if g not in newgene:
newgene.append(g)
for j in range(i+1,len(gene)):
if g==gene[j]:
if ratio[j]>r:
r = ratio[j]
newratio.append(r)
for i in range(len(newgene)):
print newgene[i] + '\t' + newratio[i]
if len(newgene) > len(set(newgene)):
print 'missionfailed'
Thank you very much for any help or suggestion.
Try this:
with open('2column.csv') as f:
lines = f.read().splitlines()
new_lines = {}
for line in lines:
cols = line.split(',')
for part in cols[0].split('///'):
part = part.strip()
if not part in new_lines:
new_lines[part] = cols[1]
else:
if float(cols[1]) > float(new_lines[part]):
new_lines[part] = cols[1]
import csv
with open('clean_2column.csv', 'wb') as csvfile:
writer = csv.writer(csvfile, delimiter=' ',
quotechar='|', quoting=csv.QUOTE_MINIMAL)
for k, v in new_lines.items():
writer.writerow([k, v])
First of all, if you're importing Pandas, know that you have I/O Tools to read CSV files.
So first, let's import it that way :
df = pd.read_csv('2column.csv')
Then, you can extract the indexes where you have your '///' pattern:
l = list(df[df['Gene Symbol'].str.contains('///')].index)
Then, you can create your new rows :
for i in l :
for sub in df['Gene Symbol'][i].split('///') :
df=df.append(pd.DataFrame([[sub, df['Ratio(ifna vs. ctrl)'][i]]], columns = df.columns))
Then, drop the old ones :
df=df.drop(df.index[l])
Then, I'll do a little trick to remove your lowest duplicate values. First, I'll sort them by 'Ratio (ifna vs. ctrl)' then I'll drop all the duplicates but the first one :
df = df.sort('Ratio(ifna vs. ctrl)', ascending=False).drop_duplicates('Gene Symbol', keep='first')
If you want to keep your sorting by Gene Symbol and reset indexes to have simpler ones, simply do :
df = df.sort('Gene Symbol').reset_index(drop=True)
If you want to re-export your modified data to your csv, do :
df.to_csv('2column.csv')
EDIT : I edited my answer to correct syntax errors, I've tested this solution with your csv and it worked perfectly :)
This should work.
It uses the dictionary suggestion of Peter.
import csv
with open('2column.csv','r') as f:
reader = csv.reader(f)
original_file = list(reader)
# gets rid of the header
original_file = original_file[1:]
# create an empty dictionary
genes_ratio = {}
# loop over every row in the original file
for row in original_file:
gene_name = row[0]
gene_ratio = row[1]
# check if /// is in the string if so split the string
if '///' in gene_name:
gene_names = gene_name.split('///')
# loop over all the resulting compontents
for gene in gene_names:
# check if the component is in the dictionary
# if not in dictionary set value to gene_ratio
if gene not in genes_ratio:
genes_ratio[gene] = gene_ratio
# if in dictionary compare value in dictionary to gene_ratio
# if dictionary value is smaller overwrite value
elif genes_ratio[gene] < gene_ratio:
genes_ratio[gene] = gene_ratio
else:
if gene_name not in genes_ratio:
genes_ratio[gene_name] = gene_ratio
elif genes_ratio[gene_name] < gene_ratio:
genes_ratio[gene_name] = gene_ratio
#loop over dictionary and print gene names and their ratio values
for key in genes_ratio:
print key, genes_ratio[key]

Comparing data between 4 csv files and writing them to separate output files

Could someone please advice me on how I could improve my code? I have 4 big csv files. The first is a reference file to which 3 other files (file1, file2 and file3) are compared to. In the files, there are three columns. Each row is a unit (e.g. ABC, DEF, GHI are 3 separate units).
col_1 col_2 col_3
A B C
D E F
G H I
I would like to compare file1, file2 and file3 to the reference file. If unit for each row in the reference file is present in all 3 files, I would like to write them into file A. If unit for each row is present in at least 1 of the 3 files, they should be written to file B. If unit for each each row is not present in any of the 3 files, I would like to write them in file C. My current strategy is to append the files as 4 separate lists and to compare them. I realize that this approach is memory intensive. In addition, my script has been running for a long time without final output. As such I was wondering if there is a more efficient approach to this problem?
Below is my code:
import csv
reference_1 = open ('reference.csv', 'rt', newline = '')
reader = csv.reader(reference_1, delimiter = ',')
file1 = open ('file1.csv','rt', newline = '')
reader1 = csv.reader(file1, delimiter = ',')
file2 = open ('file2.csv', 'rt',newline = '')
reader2 = csv.reader(file2, delimiter = ',')
file3 = open ('file3.csv', 'rt',newline = '')
reader3 = csv.reader(file3, delimiter = ',')
Common = open ('Common.csv', 'w',newline = '')
writer1 = csv.writer(Common, delimiter = ',')
Partial = open ('Partial.csv', 'w',newline = '')
writer2 = csv.writer(Partial, delimiter = ',')
Absent = open ('Absent.csv', 'w',newline = '')
writer3 = csv.writer(Absent, delimiter = ',')
reference = []
fileA = []
fileB = []
fileC = []
for row in reader:
reference.append (row)
for row in reader1:
fileA.append(row)
for row in reader2:
fileB.append(row)
for row in reader3:
fileC.append(row)
for row in reference:
if row in fileA and row in fileB and row in fileC:
writer1.writerow (row)
continue
elif row in fileA or row in fileB or row in fileC:
writer2.writerow (row)
continue
else:
writer3.writerow (row)
reference_1.close()
file1.close()
file2.close()
file3.close()
Common.close()
Partial.close()
Absent.close()
Assuming the order of the rows is not important and that there aren't duplicate rows in the reference file, here is an option using set.
def file_to_set(filename):
"""Opens a file and returns a set containing each line of the file."""
with open(filename) as f:
return set(f.read().splitlines(True))
def set_to_file(s, filename):
"""Writes a set to file."""
with open(filename, 'w') as f:
f.writelines(s)
def compare_files(ref_filename, *files):
"""Compares a reference file to two or more files."""
if len(files) < 2:
raise TypeError("compare_files expected at least 2 files, got %s" %
len(files))
ref = file_to_set(ref_filename)
file_data = [file_to_set(f) for f in files]
all = file_data[0].union(*file_data[1:])
common = ref.intersection(*file_data)
partial = ref.intersection(all).difference(common)
absent = ref.difference(all)
set_to_file(common, 'common.csv')
set_to_file(partial, 'partial.csv')
set_to_file(absent, 'absent.csv')
compare_files('reference.csv', 'file1.csv', 'file2.csv', 'file3.csv')
The idea is:
Create sets containing each line of a file.
Make a set (all) that contains every line in every file (except the reference file).
Make a set (common) that contains only the lines that are in every file, including the reference file.
Make a set (partial) that contains the lines in the reference file that also appear in at least one but not all of the other files.
Make a set (absent) that contains the lines only present in the reference file.
Write common, partial, and absent to files.

howto write column 0 and column 2 header from original CSV file?

I was wondering if someone could tell me how to write out the header for columns 0 and 3 from the original CSV file to the new CSV file? I'm also curious if anyone has any expereince with pushing to google docs?
**
#!/usr/bin/python
import csv
import re
import sys
import gdata.docs.service
email = "myemail#gmail.com"
password = "password"
#string_1 = ('OneTouch AT')
#string_2 = ('LinkRunner AT')
#string_3 = ('AirCheck')
searched = ['aircheck', 'linkrunner at', 'onetouch at']
def find_group(row):
"""Return the group index of a row
0 if the row contains searched[0]
1 if the row contains searched[1]
etc
-1 if not found
"""
for col in row:
col = col.lower()
for j, s in enumerate(searched):
if s in col:
return j
return -1
#def does_match(string):
# stringl = string.lower()
# return any(s in stringl for s in searched)
#Opens Input file for read and output file to write.
inFile = open('data.csv', "rb")
reader = csv.reader(inFile)
outFile = open('data2.csv', "wb")
writer = csv.writer(outFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
# Read header
header = reader.next()
#for row in reader:
# found = False
# for col in row:
# if col in [string_1, string_2, string_3] and not found:
# writer.writerow(row)
# found = True
#writer.writerow(header(0,2))
"""Built a list of items to sort. If row 12 contains 'LinkRunner AT' (group 1),
one stores a triple (1, 12, row)
When the triples are sorted later, all rows in group 0 will come first, then
all rows in group 1, etc.
"""
stored = []
writer.writerow(row[header] for header in (0,2))
for i, row in enumerate(reader):
g = find_group(row)
if g >= 0:
stored.append((g, i, row))
stored.sort()
for g, i, row in stored:
writer.writerow(tuple(row[k] for k in (0,2))) # output col 1 & 3
#for row in reader:
# if any(does_match(col) for col in row):
# writer.writerow(row[:2]) # write only 2 first columns
# Closing Input and Output files.
inFile.close()
outFile.close()
**
I think what you're looking for is this:
writer.writerow([header[0], header[2]])
You could also use either of the two more complicated mechanisms you use later in the same script:
writer.writerow(header[i] for i in (0,2))
writer.writerow(tuple(header[k] for k in (0,2)))
… but there's really no good reason to. In fact, you'd be better off changing those lines to do things the simple way. Also, you'd be better off not trying to re-use the variable header as a loop index variable… So:
for g, i, row in stored:
writer.writerow([row[0], row[2]])

Python - merging of csv files with one axis in common

I need to merge two csv files, A.csv and B.csv, with one axis in common, extract:
9.358,3.0
9.388,2.0
and
8.551,2.0
8.638,2.0
I want the final file C.csv to have the following pattern:
8.551,0.0,2.0
8.638,0.0,2.0
9.358,3.0,0.0
9.388,2.0,0.0
How to you suggest to do it? Should I go for a for loop?
Just read from each file, writing out to the output file and adding in the 'missing' column:
import csv
with open('c.csv', 'wb') as outcsv:
# Python 3: use open('c.csv', 'w', newline='') instead
writer = csv.writer(outcsv)
# copy a.csv across, adding a 3rd column
with open('a.csv', 'rb') as incsv:
# Python 3: use open('a.csv', newline='') instead
reader = csv.reader(incsv)
writer.writerows(row + [0.0] for row in reader)
# copy b.csv across, inserting a 2nd column
with open('b.csv', 'rb') as incsv:
# Python 3: use open('b.csv', newline='') instead
reader = csv.reader(incsv)
writer.writerows(row[:1] + [0.0] + row[1:] for row in reader)
The writer.writerows() lines do all the work; a generator expression loops over the rows in each reader, either appending a column or inserting a column in the middle.
This works with whatever size of input CSVs you have, as only some read and write buffers are held in memory. Rows are processed in iterative fashion without ever needing to hold all of the input or output files in memory.
import numpy as np
dat1 = np.genfromtxt('dat1.txt', delimiter=',')
dat2 = np.genfromtxt('dat2.txt', delimiter=',')
dat1 = np.insert(dat1, 2, 0, axis=1)
dat2 = np.insert(dat2, 1, 0, axis=1)
dat = np.vstack((dat1, dat2))
np.savetxt('dat.txt', dat, delimiter=',', fmt='%.3f')
Here's a simple solution using a dictionary, which will work for any number of files:
from __future__ import print_function
def process(*filenames):
lines = {}
index = 0
for filename in filenames:
with open(filename,'rU') as f:
for line in f:
v1, v2 = line.rstrip('\n').split(',')
lines.setdefault(v1,{})[index] = v2
index += 1
for line in sorted(lines):
print(line, end=',')
for i in range(index):
print(lines[line].get(i,0.0), end=',' if i < index-1 else '\n')
process('A.csv','B.csv')
prints
8.551,0.0,2.0
8.638,0.0,2.0
9.358,3.0,0.0
9.388,2.0,0.0

Categories

Resources