Comparing 2 Huge CSV Files in Python

I have 2 csv files.
File1:
EmployeeName,Age,Salary,Address
Vinoth,12,2548.245,"140,North Street,India"
Vinoth,12,2548.245,"140,North Street,India"
Karthick,10,10.245,"140,North Street,India"
File2:
EmployeeName,Age,Salary,Address
Karthick,10,10.245,"140,North Street,India"
Vivek,20,2000,"USA"
Vinoth,12,2548.245,"140,North Street,India"
I want to compare these 2 files and report the differences into another csv file. I've used the below Python code (version 2.7):
#!/usr/bin/env python
import difflib
import csv

with open('./Input/file1', 'r') as t1:
    fileone = t1.readlines()
with open('./Input/file2', 'r') as t2:
    filetwo = t2.readlines()

with open('update.csv', 'w') as outFile:
    for line in filetwo:
        if line not in fileone:
            outFile.write(line)
    for line in fileone:
        if line not in filetwo:
            outFile.write(line)
When I execute, below is the output I got:
Actual Output
Vivek,20,2000,"USA"
But my expected output is below, since the record for "Vinoth" appears twice in file1 but only once in file2.
Expected Output
Vinoth,12,2548.245,"140,North Street,India"
Vivek,20,2000,"USA"
Questions
Please let me know how to get the expected output.
Also, how can I get the filename and line number of the differing record into the output file?

The issue you are running into is that the in keyword only checks for the presence of an item, not how many times it occurs. If you are open to using an external package, you can do this pretty quickly with pandas.
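A quick illustration of the point about in, using hypothetical toy lists rather than the files above:
rows = ['a', 'a', 'b']
other = ['a', 'b']
print([r for r in rows if r not in other])  # [] -- the duplicate 'a' vanishes
print(rows.count('a') - other.count('a'))   # 1 -- counting catches it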
import pandas as pd
df1 = pd.read_csv('Input/file1.csv')
df2 = pd.read_csv('Input/file2.csv')
# create a new column with the count of how many times the row exists
df1['count'] = 0
df2['count'] = 0
df1['count'] = df1.groupby(df1.columns.to_list()[:-1]).cumcount() + 1
df2['count'] = df2.groupby(df2.columns.to_list()[:-1]).cumcount() + 1
# merge the two data frames with an outer join, and add an indicator variable
# to show where each row (including the count) exists.
df_all = df1.merge(df2, on=df1.columns.to_list(), how='outer', indicator='exists')
print(df_all)
# prints:
EmployeeName Age Salary Address count exists
0 Vinoth 12 2548.245 140,North Street,India 1 both
1 Vinoth 12 2548.245 140,North Street,India 2 left_only
2 Karthick 10 10.245 140,North Street,India 1 both
3 Vivek 20 2000.000 USA 1 right_only
# clean up the exists column and export the rows that do not exist in both frames
df_all['exists'] = (df_all.exists.str.replace('left_only', 'file1')
.str.replace('right_only', 'file2'))
df_all.query('exists != "both"').to_csv('update.csv', index=False)
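The cumcount column is what makes the duplicates survive the join: without it, the two identical Vinoth rows in file1 would collapse onto a single merge key, and the outer join could not distinguish one occurrence from two.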
Edit: non-pandas version
You can check for differences in the counts of identical lines by using the row as a key and the count as the value.
from collections import defaultdict

c1 = defaultdict(int)
c2 = defaultdict(int)
with open('./Input/file1', 'r') as t1:
    for line in t1:
        c1[line.strip()] += 1
with open('./Input/file2', 'r') as t2:
    for line in t2:
        c2[line.strip()] += 1

# create a set of all rows
all_keys = set()
all_keys.update(c1)
all_keys.update(c2)

# find the difference in the number of instances of the row
out = []
for k in all_keys:
    diff = c1[k] - c2[k]
    if diff == 0:
        continue
    if diff > 0:
        out.extend([k + ',file1'] * diff)       # add which file it came from
    if diff < 0:
        out.extend([k + ',file2'] * abs(diff))  # add which file it came from

with open('update.csv', 'w') as outFile:
    outFile.write('\n'.join(out))
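For the second question (filename and line number of each differing record), the same counting idea extends by also remembering where each row occurs. A minimal sketch, assuming the same ./Input/file1 and ./Input/file2 paths and treating the later occurrences as the surplus copies:
from collections import Counter, defaultdict

def count_lines(path):
    counts = Counter()
    positions = defaultdict(list)
    with open(path, 'r') as f:
        for lineno, line in enumerate(f, start=1):
            row = line.strip()
            counts[row] += 1
            positions[row].append(lineno)
    return counts, positions

c1, p1 = count_lines('./Input/file1')
c2, p2 = count_lines('./Input/file2')

with open('update.csv', 'w') as out:
    for row in set(c1) | set(c2):
        if c1[row] > c2[row]:    # surplus copies in file1
            for lineno in p1[row][c2[row]:]:
                out.write('{},file1,line {}\n'.format(row, lineno))
        elif c2[row] > c1[row]:  # surplus copies in file2
            for lineno in p2[row][c1[row]:]:
                out.write('{},file2,line {}\n'.format(row, lineno))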

Use pandas compare:
import pandas as pd

f1 = pd.read_csv('file_1.csv')
f2 = pd.read_csv('file_2.csv')
changed = f1.compare(f2)
change = f1[f1.index.isin(changed.index)]
print(change)
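Note that DataFrame.compare (available since pandas 1.1) requires the two frames to be identically labeled, i.e. the same shape with the same row and column indexes, so it suits the "same rows, changed values" case rather than files where rows were added or removed.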

Related

Python for loop enumerate

I am reading multiple csv files and combining them into one csv file. The desired outcome of the combined data looks like the following:
0 4 6 8 10 12
1 2 5 4 2 1
5 3 0 1 5 10
....
But in the following code, I intend the columns to go 0, 4, 6, 8, 10, 12.
for indx, file in enumerate(files_File1):
    if file.endswith('csv'): # reading csv files in the designated folder
        filepath = os.path.join(folder_File1, file)
        current = pd.read_csv(filepath, header=None)
        if indx == 0:
            mydata_File1 = current.copy()
            mydata_File1.columns.values[1] = 4
            print(mydata_File1.columns.values)
        else:
            mydata_File1[2*indx+4] = current.iloc[:,1]
            print(mydata_File1.columns.values)
But instead, the outcome looks like this, where the columns come out as 0, 4, 2, 6, 8, 10, 12.
0 4 2 6 8 10 12
1 2 5 4 2 1
5 3 0 1 5 10
....
I am not quite sure what causes the column named "2".
Any idea?
If there is some reason you need pandas, then this will work. Your code references mydata_File1.columns.values, which is the name of the columns, not the values in the columns. If this doesn't answer your question, then please provide more detail per @juanpa.arrivillaga's comment.
#! python3
import os
import pandas as pd
import glob

folder_File1 = r"C:\Users\Public\Documents\Python\CombineCSVFiles"
csv_only = r"\*.csv"
files_File1 = glob.glob(f'{folder_File1}{csv_only}')
new_csv = f'{folder_File1}\\newcsv.csv'

mydata_File1 = []
for indx, file in enumerate(files_File1):
    if file == new_csv:
        pass
    else:
        current = pd.read_csv(file, header=None) # reading csv files in the designated folder
        print(current)
        if indx == 0:
            mydata_File1 = current.copy()
            print(mydata_File1.values)
        else:
            mydata_File1 = mydata_File1.append(current, ignore_index=True)
            print(mydata_File1.values)
mydata_File1.to_csv(new_csv)
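Worth noting: DataFrame.append rebuilds the frame on every loop pass and was deprecated in pandas 1.4 (removed in 2.0). A sketch of the same combine using pd.concat, under the same folder and output-file assumptions as above:
#! python3
import glob
import pandas as pd

folder_File1 = r"C:\Users\Public\Documents\Python\CombineCSVFiles"
new_csv = f'{folder_File1}\\newcsv.csv'

# read every csv except the output file, then concatenate once
files = [f for f in glob.glob(f'{folder_File1}\\*.csv') if f != new_csv]
frames = [pd.read_csv(f, header=None) for f in files]
pd.concat(frames, ignore_index=True).to_csv(new_csv, index=False)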
If you are really just trying to combine .csv files, there's no need for pandas.
#! python3
import glob

folder_File1 = r"C:\Users\Public\Documents\Python\CombineCSVFiles"
csv_only = r"\*.csv"
files_File1 = glob.glob(f'{folder_File1}{csv_only}')
new_csv = f'{folder_File1}\\newcsv.csv'

lines = []
for file in files_File1:
    with open(file) as filein:
        if filein.name == new_csv:
            pass
        else:
            for line in filein:
                line = line.strip() # or some other preprocessing
                lines.append(line)  # storing everything in memory!

with open(new_csv, 'w') as out_file:
    out_file.writelines(line + u'\n' for line in lines)

Problem in Python while combining 2 text files and summarizing them into a new text file

I have 2 text files like the following examples. I name one of them first (comma-separated) and the other one second (tab-separated).
first:
chr1,105000000,105310000,2,1,3,2
chr1,5310000,5960000,2,1,5,4
chr1,1580000,1180000,4,1,5,3
chr19,107180000,107680000,1,1,5,4
chr1,7680000,8300000,3,1,1,2
chr1,109220000,110070000,4,2,3,3
chr1,11060000,12070000,6,2,7,4
second:
AKAP8L chr19 107180100 107650000 transcript
AKAP8L chr19 15514130 15529799 transcript
AKIRIN2 chr6 88384790 88411927 transcript
AKIRIN2 chr6 88410228 88411243 transcript
AKT3 chr1 105002000 105010000 transcript
AKT3 chr1 243663021 244006886 transcript
AKT3 chr1 243665065 244013430 transcript
In the first file, columns 2 and 3 are start and end. In the second file, columns 3 and 4 are start and end, respectively. I want to make a new text file from both the first and second files.
In the new file I want to count the number of lines in file second that match every line in file first, based on the following criteria (3 columns):
1- the 1st column in file first is equal to the 2nd column in file second.
2- the 3rd column in file second is greater than the 2nd column in file first and also smaller than the 3rd column in file first.
3- the 4th column in file second should also be greater than the 2nd column in file first and smaller than the 3rd column in file first.
In fact the output looks like the expected output: the first 7 columns are directly from file first, the 9th column is the number of lines in file second that match every single line in file first (based on the 3 criteria mentioned above), and the 8th column is the first column of the line from file second that matches the specific line of file first.
expected output:
chr19,107180000,107680000,1,1,5,4,AKAP8L, 1
chr1,105000000,105310000,2,1,3,2, AKT3, 1
I am trying to do that in Python and wrote this code, but it does not return what I am looking for.
first = open('first.csv', 'rb')
second = open('second.txt', 'rb')
first_file = []
for line in first:
    first_file.append(line.split(','))
second_file = []
for line2 in second:
    second_file.append(line.split())
count = 0
final = []
for i in range(len(first_file)):
    for j in range(len(second_file)):
        first_row = first_file[i]
        second_row = second_file[j]
        first_col = first_row.split()
        second_col = second_row.split()
        if first_col[0] == second_col[1] and first_col[1] < second_col[2] < first_col[2] and first_col[1] < second_col[3] < first_col[2]:
            count += 1
            final.append(first_col[i] + second_col[0] + count)
Given that you don't have column names, this isn't really robust, but it works and uses pandas:
import pandas as pd

first = 'first.csv'
second = 'second.txt'
df1 = pd.read_csv(first, header=None)
df2 = pd.read_csv(second, sep=r'\s+', header=None)

merged = df1.merge(df2, left_on=[0], right_on=[1], suffixes=('first', 'second'))
a, b, c, d = merged['2second'], merged['1first'], merged['2first'], merged['3second']
cleaned = merged[(c > a) & (a > b) & (c > d) & (d > b)]
counted = cleaned.groupby(['0first', '1first', '2first', '3first', '4first', 5, 6, '0second'])['4second'].count().reset_index()
counted.to_csv('result.csv', index=False, header=False)
This produces result.csv with the following content:
chr1,105000000,105310000,2,1,3,2,AKT3,1
chr19,107180000,107680000,1,1,5,4,AKAP8L,1
In your same setting, if you do as below it will work.
first = open('first.csv', 'r')
second = open('second.txt', 'r')
first_file = []
for line in first:
    first_file.append(line.strip())
second_file = []
for line2 in second:
    second_file.append(line2)

count = 0
final = []
for i in range(len(first_file)):
    for j in range(len(second_file)):
        first_row = first_file[i]
        second_row = second_file[j]
        first_col = first_row.split(',')
        second_col = second_row.split()
        if (first_col[0] == second_col[1]) and (first_col[1] < second_col[2] < first_col[2]) and (first_col[1] < second_col[3] < first_col[2]):
            count = count + 1
            final.append(first_row + ',' + second_col[0] + ',' + str(count))
print(final)
This will generate the result you want:
['chr1,105000000,105310000,2,1,3,2,AKT3,1', 'chr19,107180000,107680000,1,1,5,4,AKAP8L,2']
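One caveat visible in that output: count is never reset between lines of first, so the second appended line carries the running total ('AKAP8L,2') where the expected output has 1. Moving count = 0 inside the outer for i loop would make it a per-line count.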

check 2 columns script to run fast

I have a script that checks 2 text files against each other and prints out the common field. However, I don't feel it is quick enough and I'm looking for optimizations.
FILE1 (10k rows, 3 columns) and FILE2 (200k rows, 2 columns) are csv files with 1 field common to both.
FILE1:
92073263d,86674404000555506123,Communication
FILE2:
163738212,7a93632111w7-01e7-40e7-9387-1863e7683eca
63729jd83,07633221122c-6598-4489-b539-e42e2dcb3235
8djdy37w8,2b8retyre396-2472-4b2d-8d07-e170fa3d1f64
92073263d,07633221122c-6ew8-4eww-b539-e42dsadsadsa
with open('FILE1') as file1:
    file1_contents = { tuple(line.split(',')) for line in file1 }
print file1_contents

with open('FILE2') as file2:
    for line in file2:
        c1, c2 = line.split()
        if c1 in file1_contents:
            f = open("FILE3", "w")
            f.write(c2)
            f.close()
This line, if c1 in file1_contents, is giving me a hard time, as I want to avoid any nested loop to maintain high speed. Any suggestions?
Thanks COLDSPEED again; here is my new code:
import pandas

data_comreport = pandas.read_csv('FILE1', sep=',', header=0)
data_db = pandas.read_csv('FILE2', sep=',', header=None)
data_db.columns = ['SerialNumber', 'GUID']
data = pandas.merge(data_db, data_comreport, left_on='SerialNumber', right_on='SerialNumber', how='inner')
print data
#result = data.loc[data['FailureReason'] != ['Failure to export']]
#if result != None:
clean_data = data.to_csv('list.txt', index=False, columns=['GUID'], header=None)
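For the pure-Python route, the usual fix is to key a set on the shared column only and open the output file once, so FILE2 is streamed in a single pass with O(1) lookups. A minimal sketch, assuming the same FILE1/FILE2/FILE3 names and that both files are comma-separated as in the samples:
serials = set()
with open('FILE1') as file1:
    for line in file1:
        serials.add(line.split(',')[0])   # first column only

with open('FILE2') as file2, open('FILE3', 'w') as out:
    for line in file2:
        c1, c2 = line.strip().split(',')
        if c1 in serials:                 # O(1) set membership test
            out.write(c2 + '\n')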

Python: reading the same rows from a csv file - logic

I have a problem with appending data for missing rows in the csv file: I am reading rows from a csv file for each customer and appending lists with the data the rows have. Each customer needs to have the same id's that are highlighted in green in the example image. If the next customer doesn't have the rows with all needed id's, I still need to append 0 values to the lists for these missing rows. So the customer highlighted in yellow needs to have same number of values appended to the data lists as the one in green.
I am trying to read each row and compare its id with the list of all possible id's that I created, but I always get stuck on the first id, and I am not sure if the right way to go is to read the previous row again until its id equals the id from the list of possible id's (I do this to add the missing row's data to the lists). Please let me know if you have any suggestions.
Note: taking into consideration only the column with id's, for these two customers I would like the list to look like this: list_with_ids = [410, 409, 408, 407, 406, 405, 403, 402, **410, 409, 408, 407, 406, 405, 403, 402**]. So I am looking for a way, once I am on row 409 in yellow, to first append the first needed id 410, and only then 409 and so forth. And the same at the end: append the two missing ids, 403 and 402.
Code:
def write_data(workbook):
    [...]
    # Lists.
    list_cust = []
    list_quantity = [] # from Some_data columns

    # Get the start row in the csv file.
    for row in range(worksheet.nrows):
        base_id = str(410)
        value = worksheet.cell(row, 1).value
        start = str(value)
        if base_id[0] == start[0]:
            num_of_row_for_id = row

    # Append the first id.
    first_cust = str(worksheet.cell(num_of_row_for_id, 0).value)
    list_cust.append(first_cust)

    # Needed to count id's.
    count = 0
    # List with all needed id's for each customer.
    # instead of ... - all id's in green from the image.
    all_ids = [....]

    # Get data.
    for row in range(worksheet.nrows):
        next_id = str(worksheet.cell(num_of_row_for_id, 1).value)
        cust = str(worksheet.cell(num_of_row_for_id, 0).value)
        # Append id to the list.
        list_cust.append(cust)
        # Needed to separate rows for each customer.
        if list_cust[len(list_cust)-1] == list_cust[len(list_cust)-2]:
            # Get data: I read columns to get data.
            # Let's say I read col 4 to 21.
            for col_num in range(3, 20):
                # Here is the problem: ############################
                if next_id != all_ids[count]:
                    list_quantity.append(0)
                if next_id == all_ids[count]:
                    qty = worksheet.cell(num_of_row_for_id, col_num).value
                    list_quantity.append(qty)
        # Get the next row in reverse order.
        num_of_row_for_id -= 1
        # Increment count for id's index.
        if list_cust[len(list_cust)-1] == list_cust[len(list_cust)-2]:
            # 8 possible id's.
            if count < 7:
                count += 1
            else:
                count = 0
Consider the following data wrangling with list comprehensions and loops, using the following input data containing random data columns:
Input Data
# Cust ID Data1 Data2 Data3 Data4 Data5
# 2011 62,404 0.269101238 KPT 0.438881697 UAX 0.963170513
# 2011 62,405 0.142397746 XYD 0.51668728 PTQ 0.761695425
# 2011 62,406 0.782342616 QCN 0.259141256 FNX 0.870971924
# 2011 62,407 0.221750017 EIU 0.358439487 MAN 0.13633062
# 2011 62,408 0.097509568 CRU 0.410058705 BFK 0.680228327
# 2011 62,409 0.322871333 LAC 0.489425167 GUX 0.449476844
# 919 62,403 0.371461633 PUR 0.626146074 KWX 0.525711736
# 919 62,404 0.384859932 AJZ 0.223408599 JSU 0.914916663
# 919 62,405 0.020630503 SFY 0.260778598 VUU 0.213559498
# 919 62,406 0.952425138 EBI 0.59595738 ZYU 0.283794413
# 919 62,407 0.410368534 BTT 0.252698401 FFY 0.41080646
# 919 62,408 0.553390336 GMA 0.846309022 BIN 0.049852419
# 919 62,409 0.193437955 NBB 0.877311494 XQX 0.080656637
Python code
import csv

i = 0
data = []
# READ CSV AND CAPTURE HEADERS AND DATA
with open('Input.csv', 'r') as f:
    rdr = csv.reader(f)
    for line in rdr:
        if i == 0:
            headers = line
        else:
            line[1] = int(line[1].replace(',', ''))
            data.append(line)
        i += 1

# CREATE NEEDED LISTS
cust_list = list(set([i[0] for i in data]))
id_list = [62402,62403,62404,62405,62406,62407,62408,62409,62410]

# CAPTURE MISSING IDS BY CUSTOMER
for c in cust_list:
    currlist = [d[1] for d in data if d[0] == c]
    missingids = [i for i in id_list if i not in currlist]
    for m in missingids:
        data.append([c, m, '', '', '', '', ''])

# WRITE DATA TO NEW CSV IN SORTED ORDER
with open('Output.csv', 'w') as f:
    wtr = csv.writer(f, lineterminator='\n')
    wtr.writerow(headers)
    for c in cust_list:
        for i in sorted(id_list, reverse=True):
            for d in data:
                if d[0] == c and d[1] == i:
                    wtr.writerow(d)
Output Data
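One design note on the write loop above: it rescans data for every (customer, id) pair. Building a dict index keyed on (cust, id) first makes each lookup O(1); a sketch reusing the names above, and assuming each (cust, id) pair occurs once, as in the sample:
# index rows once, then write by direct lookup
index = {(d[0], d[1]): d for d in data}
with open('Output.csv', 'w') as f:
    wtr = csv.writer(f, lineterminator='\n')
    wtr.writerow(headers)
    for c in cust_list:
        for i in sorted(id_list, reverse=True):
            if (c, i) in index:
                wtr.writerow(index[(c, i)])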
Consider even Python third-party modules such as pandas, the data analysis package, or even an SQL solution using pyodbc, since Windows' built-in Jet/ACE SQL Engine can query CSV files directly.
You will notice, below and in the previous solution, that quite a bit of handling is needed to remove the thousands separators in the ID column, as the modules read the values as strings first. If you remove such commas from the original csv file, you can reduce the lines of code.
Pandas (with left merge on two dataframes)
import pandas as pd

df = pd.read_csv('Input.csv')
cust_list = df['Cust'].unique()
id_list = [62402,62403,62404,62405,62406,62407,62408,62409,62410]

ids = pd.DataFrame({'Cust': [int(c) for i in id_list for c in cust_list],
                    'ID': [int(i) for i in id_list for c in cust_list]})

df['ID'] = df['ID'].str.replace(',', '').astype(int)
df = ids.merge(df, on=['Cust', 'ID'], how='left').\
         sort_values(['Cust', 'ID'], ascending=[True, False])
df.to_csv('Output_pandas.csv', index=False)
PyODBC (works only for Windows machines using left join on two csv files)
import csv
import pyodbc

conn = pyodbc.connect(r'Driver=Microsoft Access Text Driver (*.txt, *.csv);' + \
                      r'DBQ=C:\Path\To\CSV\Files;Extensions=asc,csv,tab,txt;',
                      autocommit=True)
cur = conn.cursor()
cust_list = [i[0] for i in cur.execute("SELECT DISTINCT c.Cust FROM Input.csv c")]
id_list = [62402,62403,62404,62405,62406,62407,62408,62409,62410]
cur.close()

with open('ID_list.csv', 'w') as f:
    wtr = csv.writer(f, lineterminator='\n')
    wtr.writerow(['Cust', 'ID'])
    for item in [[int(c), int(i)] for c in cust_list for i in id_list]:
        wtr.writerow(item)

i = 0
with open('Input.csv', 'r') as f1, open('Input_without_commas.csv', 'w') as f2:
    rdr = csv.reader(f1); wtr = csv.writer(f2, lineterminator='\n')
    for line in rdr:
        if i > 0:
            line[1] = int(line[1].replace(',', ''))
        wtr.writerow(line)
        i += 1

strSQL = "SELECT i.Cust, i.ID, c.Data1, c.Data2, c.Data3, c.Data4, c.Data5 " + \
         " FROM ID_list.csv i" + \
         " LEFT JOIN Input_without_commas.csv c" + \
         " ON i.Cust = c.Cust AND i.ID = c.ID" + \
         " ORDER BY i.Cust, i.ID DESC"
cur = conn.cursor()
with open('Output_sql.csv', 'w') as f:
    wtr = csv.writer(f, lineterminator='\n')
    wtr.writerow(['Cust', 'ID', 'Data1', 'Data2', 'Data3', 'Data4', 'Data5'])
    for i in cur.execute(strSQL):
        wtr.writerow(i)
cur.close()
conn.close()
Output (for both above solutions)

Comparing two csv files and getting difference

I have two csv files I need to compare and then spit out the differences:
CSV FORMAT:
Name Produce Number
Adam Apple 5
Tom Orange 4
Adam Orange 11
I need to compare the two csv files and tell if there is a difference between Adam's apples on sheet 1 and sheet 2, and do that for all names and produce numbers. Both CSV files will be formatted the same.
Any pointers will be greatly appreciated
I have used csvdiff
$pip install csvdiff
$csvdiff --style=compact col1 a.csv b.csv
Link to package on pypi
I found this link useful
If your CSV files aren't so large they'll bring your machine to its knees if you load them into memory, then you could try something like:
import csv

# rows are read as tuples because lists and dicts are not hashable in a set
csv1 = [tuple(row) for row in csv.reader(open('file1.csv'))]
csv2 = [tuple(row) for row in csv.reader(open('file2.csv'))]
set1 = set(csv1)
set2 = set(csv2)
print set1 - set2 # in 1, not in 2
print set2 - set1 # in 2, not in 1
print set1 & set2 # in both
For large files, you could load them into a SQLite3 database and use SQL queries to do the same, or sort by relevant keys and then do a match-merge.
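A minimal sketch of the SQLite route (the file names and three-column layout follow the sample above; the table names are made up):
import csv
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE t1 (name TEXT, produce TEXT, number TEXT)')
conn.execute('CREATE TABLE t2 (name TEXT, produce TEXT, number TEXT)')

for table, fname in (('t1', 'file1.csv'), ('t2', 'file2.csv')):
    with open(fname) as f:
        rows = list(csv.reader(f))[1:]  # skip the header row
        conn.executemany('INSERT INTO %s VALUES (?,?,?)' % table, rows)

# rows in file1 but not in file2
for row in conn.execute('SELECT * FROM t1 EXCEPT SELECT * FROM t2'):
    print(row)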
One of the best utilities for comparing two different files is diff.
See Python implementation here: Comparing two .txt files using difflib in Python
import csv

def load_csv_to_dict(fname, get_key, get_data):
    with open(fname, 'rb') as inf:
        incsv = csv.reader(inf)
        incsv.next() # skip header
        return {get_key(row): get_data(row) for row in incsv}

def main():
    key = lambda r: tuple(r[0:2])
    data = lambda r: int(r[2])
    f1 = load_csv_to_dict('file1.csv', key, data)
    f2 = load_csv_to_dict('file2.csv', key, data)
    f1keys = set(f1.iterkeys())
    f2keys = set(f2.iterkeys())
    print("Keys in file1 but not file2:")
    print(", ".join(str(a)+":"+str(b) for a,b in (f1keys - f2keys)))
    print("Keys in file2 but not file1:")
    print(", ".join(str(a)+":"+str(b) for a,b in (f2keys - f1keys)))
    print("Differing values:")
    for k in (f1keys & f2keys):
        a, b = f1[k], f2[k]
        if a != b:
            print("{}:{} {} <> {}".format(k[0], k[1], a, b))

if __name__ == "__main__":
    main()
If you want to use Python's csv module along with a generator function, you can compare large .csv files with nested looping. The example below compares each row using a cursory comparison:
import csv

def csv_lazy_get(csvfile):
    with open(csvfile) as f:
        r = csv.reader(f)
        for row in r:
            yield row

def csv_cmp_lazy(csvfile1, csvfile2):
    gen_2 = csv_lazy_get(csvfile2)
    for row_1 in csv_lazy_get(csvfile1):
        row_2 = gen_2.next()
        print("row_1: ", row_1)
        print("row_2: ", row_2)
        if row_2 == row_1:
            print("row_1 is equal to row_2.")
        else:
            print("row_1 is not equal to row_2.")
    gen_2.close()
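One caveat with the manual gen_2.next() call: it raises StopIteration if the second file is shorter. Pairing the generators with zip (itertools.izip on Python 2) stops cleanly at the shorter file instead, though trailing rows are then ignored:
def csv_cmp_zip(csvfile1, csvfile2):
    # zip pairs rows from both files and stops at the shorter one
    for row_1, row_2 in zip(csv_lazy_get(csvfile1), csv_lazy_get(csvfile2)):
        if row_1 != row_2:
            print("row_1 is not equal to row_2: {} <> {}".format(row_1, row_2))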
Here's a start that does not use difflib. It is really just a point to build from, because maybe Adam and apples appear twice on the sheet; can you ensure that is not the case? Should the apples be summed, or is that an error?
import csv

fsock = open('sheet.csv', 'rU')
rdr = csv.reader(fsock)
sheet1 = {}
for row in rdr:
    name, produce, amount = row
    sheet1[(name, produce)] = int(amount) # always an integer?
fsock.close()
# repeat the above for the second sheet, then compare
You get the idea?
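To finish the thought in the final comment, a sketch of the comparison once both sheets are loaded into dicts of the shape built above (sheet2 here is hypothetical, built the same way from the second file):
for key in set(sheet1) | set(sheet2):
    a, b = sheet1.get(key), sheet2.get(key)
    if a != b: # differs, or missing from one sheet (None)
        print("{} / {}: {} <> {}".format(key[0], key[1], a, b))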
