check 2 columns script to run fast - python

I have a script that checks 2 text files against each other and prints out the common field. However, I don't feel it is quick enough and I'm looking to optimize it.
Both are CSV files: FILE1 (10k rows, 3 columns) and FILE2 (200k rows, 2 columns), with 1 field common to both files.
FILE1:
92073263d,86674404000555506123,Communication
FILE2:
163738212,7a93632111w7-01e7-40e7-9387-1863e7683eca
63729jd83,07633221122c-6598-4489-b539-e42e2dcb3235
8djdy37w8,2b8retyre396-2472-4b2d-8d07-e170fa3d1f64
92073263d,07633221122c-6ew8-4eww-b539-e42dsadsadsa
with open('FILE1') as file1:
    file1_contents = {tuple(line.split(',')) for line in file1}
print(file1_contents)

with open('FILE2') as file2:
    for line in file2:
        c1, c2 = line.rstrip('\n').split(',')
        if c1 in file1_contents:
            f = open("FILE3", "w")
            f.write(c2)
            f.close()
The line if c1 in file1_contents is giving me a hard time, as I want to avoid any nested loop to keep the speed up. Any suggestions?
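For what it's worth, an in test against a set is an O(1) hash lookup, so there is no hidden nested loop here. The real snags in the code above are that file1_contents holds whole-row tuples while c1 is a single string (so the membership test can never match), and that FILE3 is reopened in 'w' mode on every hit, truncating it each time. A minimal sketch of the set-of-keys fix (assuming the first comma-separated field is the shared serial number):

# Build a set of FILE1's key column once, then stream FILE2 a single time.
with open('FILE1') as f1:
    keys = {line.split(',', 1)[0] for line in f1}

with open('FILE2') as f2, open('FILE3', 'w') as out:
    for line in f2:
        serial, guid = line.rstrip('\n').split(',', 1)
        if serial in keys:  # O(1) membership test against the set
            out.write(guid + '\n')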

Thanks COLDSPEED again.. here is my new code:
import pandas

data_comreport = pandas.read_csv('FILE1', sep=',', header=0)
data_db = pandas.read_csv('FILE2', sep=',', header=None)
data_db.columns = ['SerialNumber', 'GUID']
# inner join on the shared SerialNumber column
data = pandas.merge(data_db, data_comreport, on='SerialNumber', how='inner')
print(data)
#result = data.loc[data['FailureReason'] != 'Failure to export']
#if result is not None:
data.to_csv('list.txt', index=False, columns=['GUID'], header=False)
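If the commented-out FailureReason filter is reinstated, a boolean mask is the usual form. A sketch, assuming FailureReason is one of FILE1's columns:

# Drop the rows that failed to export, then write out the remaining GUIDs
result = data[data['FailureReason'] != 'Failure to export']
result.to_csv('list.txt', index=False, columns=['GUID'], header=False)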

Related

Is there a faster way of reading two files line by line, then adding one line at the end of the other?

So here's my problem:
I have two CSV files, each having around 500,000 lines.
File 1 looks like this:
ID|NAME|OTHER INFO
353253453|LAURENT|STUFF 1
563636345|MARK|OTHERS
786970908|GEORGES|THINGS
File 2 looks like this:
LOCATION;ID_PERSON;PHONE
CA;786970908;555555
NY;353253453;555666
So what I have to do is look for the lines with matching IDs, append the line from file 2 to the end of the corresponding line from file 1 in a new file, and add empty columns where there is no corresponding ID, like this:
ID;NAME;OTHER INFO;LOCATION;ID_PERSON;PHONE
353253453;LAURENT;STUFF 1;NY;353253453;555666
563636345;MARK;OTHERS;;;
786970908;GEORGES;THINGS;CA;786970908;555555
File 1 is the primary one if that makes sense.
The thing is I have found a solution, but it takes way too long, since for each line of file 1 I loop through file 2.
Here's my code:
input1 = open(filename1, 'r', errors='ignore')
input2 = open(filename2, 'r', errors='ignore')
output = open('result.csv', 'w', newline='')
file2 = input2.readlines()[1:]  # keep file 2 in memory so it can be re-scanned
nbr_col_2 = 3                   # number of columns in file 2
for line1 in input1:
    line_splitted = line1.rstrip('\n').split("|")
    id_1 = line_splitted[0]
    find = False
    for line2 in file2:
        file2_splitted = line2.split(";")
        if id_1 == file2_splitted[1]:
            output.write(";".join(line_splitted) + ";" + line2)
            find = True
            file2.remove(line2)
            break
    if not find:
        output.write(";".join(line_splitted))
        for j in range(nbr_col_2):
            output.write(";")
        output.write("\n")
So I was wondering if there is a faster way to do that, or if I just have to be patient, because right now after 20 minutes, only 20000 lines have been written...
First read file2 line by line to build the lookup dict.
Then read file1 line by line, look up the ID key in the dict, build the output line, and write it to the output file.
This should be quite efficient, with O(n) complexity.
Only the dict consumes a little memory; all the file processing is "stream-based".
with open("file2.txt") as f:
lookup = {}
f.readline() # skip header
while True:
line = f.readline().rstrip()
if not line:
break
fields = line.split(";")
lookup[fields[1]] = line
with open("file1.txt") as f, open("output.txt", "w") as out:
f.readline() # skip header
out.write("ID;NAME;OTHER INFO;LOCATION;ID_PERSON;PHONE\n")
while True:
line_in = f.readline().rstrip()
if not line_in:
break
fields = line_in.split("|")
line_out = ";".join(fields)
if found := lookup.get(fields[0]):
line_out += ";" + found
else:
line_out += ";;;"
out.write(line_out + "\n")
As Alex pointed out in his comment, you can merge both files using pandas.
import pandas as pd
# Load files
file_1 = pd.read_csv("file_1.csv", index_col=0, delimiter="|")
file_2 = pd.read_csv("file_2.csv", index_col=1, delimiter=";")
# Rename PERSON_ID as ID
file_2.index.name = "ID"
# Merge files
file_3 = file_1.merge(file_2, how="left", on="ID")
file_3.to_csv("file_3.csv")
Using your examples, file_3.csv looks like this:
ID,NAME,OTHER INFO,LOCATION,PHONE
353253453,LAURENT,STUFF 1,NY,555666.0
563636345,MARK,OTHERS,,
786970908,GEORGES,THINGS,CA,555555.0
Extra
By the way, if you are not familiar with pandas, this is a great introductory course: Learn Pandas Tutorials
You can create an index to avoid iterating over file2 each time.
Do this by building a dictionary from file2 and retrieving each related item by its key.
file1 = open(filename1, 'r', errors='ignore')
file2 = open(filename2, 'r', errors='ignore')
output = open('result.csv', 'w', newline='')

indexed_data = {}
for line2 in file2.readlines()[1:]:
    data2 = line2.rstrip('\n').split(";")
    indexed_data[data2[1]] = {
        'Location': data2[0],
        'Phone': data2[2],
    }

output.write('ID;NAME;OTHER INFO;LOCATION;ID_PERSON;PHONE\n')
for line1 in file1.readlines()[1:]:
    data1 = line1.rstrip('\n').split("|")
    if data1[0] in indexed_data:
        output.write(f'{data1[0]};{data1[1]};{data1[2]};{indexed_data[data1[0]]["Location"]};{data1[0]};{indexed_data[data1[0]]["Phone"]}\n')
    else:
        output.write(f'{data1[0]};{data1[1]};{data1[2]};;;\n')

Compare 2 csv files and remove the common lines from 1st file | python

I want to compare 2 csv files, master.csv and exclude.csv, remove all the matching lines based on column 1, and write the final output back to master.csv.
master.csv
abc,xyz
cde,fgh
ijk,lmn
exclude.csv
###Exclude list###
cde
####
Expected output (it should overwrite master.csv):
abc,xyz
ijk,lmn
Tried till now:
with open('exclude.csv', 'r') as in_file, open('master.csv', 'w') as out_file:
    seen = set()
    for line in in_file:
        if line in seen: continue  # skip duplicate
        seen.add(line)
        out_file.write(line)
I believe there should be some pandas or other module-based approaches, but here is a pure Python approach:
with open("master.csv") as f:
master = f.read()
with open("exclude.csv") as f:
exclude = f.read()
master = master.strip().split("\n")
exclude = exclude.strip().split("\n")
returnList = []
for line in master:
check = True
for exc in exclude:
if exc in line:
check = False
break
if check:
returnList.append(line)
with open("master.csv", "w") as f:
f.write("\n".join(returnList))
Output of master.csv
abc,xyz
ijk,lmn
Simplest way, by using pandas:
import pandas as pd
# Reading the csv file
df_new = pd.read_csv('Names.csv')
# saving xlsx file
GFG = pd.ExcelWriter('Names.xlsx')
df_new.to_excel(GFG, index=False)
GFG.save()
A purely pythonic answer leveraging list comprehensions:
with open('master.csv', 'r') as f:
    keep_lines = f.readlines()
with open('exclude.csv', 'r') as f:
    # the first-column values to exclude, one per line
    drop_keys = {line.strip() for line in f}

# keep only the rows whose first column is not in the exclude set
write_lines = [line for line in keep_lines if line.strip().split(',')[0] not in drop_keys]

with open('master.csv', 'w') as f:
    f.writelines(write_lines)
You can use pandas like this:
import pandas as pd

master_df = pd.read_csv('master.csv')
exclude_df = pd.read_csv('exclude.csv')

conc = pd.concat([master_df, exclude_df])  # concatenate the two dataframes
conc.drop_duplicates(subset=['col1'], inplace=True, keep=False)
print(conc)
drop_duplicates with subset=['col1'] will check for duplicates in col1 only,
and keep allows 3 values: 'first', 'last' and False.
I have chosen keep=False so that no duplicate is kept at all.
Dataset:
master.csv:
col1,col2
abc,xyz
cde,fgh
ijk,lmn
exclude.csv:
col1
cde
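As a quick illustration of the keep options (a toy sketch, independent of the files above):

import pandas as pd

df = pd.DataFrame({'col1': ['abc', 'cde', 'cde', 'ijk']})
print(df.drop_duplicates(subset=['col1'], keep='first'))  # abc, cde, ijk
print(df.drop_duplicates(subset=['col1'], keep=False))    # abc, ijk (every 'cde' dropped)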

Comparing 2 Huge csv Files in Python

I have 2 csv files.
File1:
EmployeeName,Age,Salary,Address
Vinoth,12,2548.245,"140,North Street,India"
Vinoth,12,2548.245,"140,North Street,India"
Karthick,10,10.245,"140,North Street,India"
File2:
EmployeeName,Age,Salary,Address
Karthick,10,10.245,"140,North Street,India"
Vivek,20,2000,"USA"
Vinoth,12,2548.245,"140,North Street,India"
I want to compare these 2 files and report the differences into another csv file. I've used the below Python code (version 2.7):
#!/usr/bin/env python
import difflib
import csv

with open('./Input/file1', 'r') as t1:
    fileone = t1.readlines()
with open('./Input/file2', 'r') as t2:
    filetwo = t2.readlines()

with open('update.csv', 'w') as outFile:
    for line in filetwo:
        if line not in fileone:
            outFile.write(line)
    for line in fileone:
        if line not in filetwo:
            outFile.write(line)
When I execute it, below is the output I get:
Actual Output
Vivek,20,2000,"USA"
But my expected output is below, since the record for "Vinoth" is present 2 times in file1 but only 1 time in file2.
Expected Output
Vinoth,12,2548.245,"140,North Street,India"
Vivek,20,2000,"USA"
Questions
Please let me know how to get the expected output.
Also, how can I get the filename and line number of the differing record into the output file?
The issue you are running into is that the in keyword only checks for the presence of an item, not if the item exists twice. If you are open to using an external package, you can do this pretty quickly with pandas.
import pandas as pd
df1 = pd.read_csv('Input/file1.csv')
df2 = pd.read_csv('Input/file2.csv')
# create a new column with the count of how many times the row exists
df1['count'] = 0
df2['count'] = 0
df1['count'] = df1.groupby(df1.columns.to_list()[:-1]).cumcount() + 1
df2['count'] = df2.groupby(df2.columns.to_list()[:-1]).cumcount() + 1
# merge the two data frames with and outer join, add an indicator variable
# to show where each row (including the count) exists.
df_all = df1.merge(df2, on=df1.columns.to_list(), how='outer', indicator='exists')
print(df_all)
# prints:
  EmployeeName  Age    Salary                 Address  count      exists
0       Vinoth   12  2548.245  140,North Street,India      1        both
1       Vinoth   12  2548.245  140,North Street,India      2   left_only
2     Karthick   10    10.245  140,North Street,India      1        both
3        Vivek   20  2000.000                     USA      1  right_only
# clean up the exists column and export the rows that do not exist in both frames
df_all['exists'] = (df_all.exists.str.replace('left_only', 'file1')
                          .str.replace('right_only', 'file2'))
df_all.query('exists != "both"').to_csv('update.csv', index=False)
Edit: non-pandas version
You can check for difference in identical line counts using the row as a key and the count as the value.
from collections import defaultdict

c1 = defaultdict(int)
c2 = defaultdict(int)

with open('./Input/file1', 'r') as t1:
    for line in t1:
        c1[line.strip()] += 1
with open('./Input/file2', 'r') as t2:
    for line in t2:
        c2[line.strip()] += 1

# create a set of all rows
all_keys = set()
all_keys.update(c1)
all_keys.update(c2)

# find the difference in the number of instances of the row
out = []
for k in all_keys:
    diff = c1[k] - c2[k]
    if diff == 0:
        continue
    if diff > 0:
        out.extend([k + ',file1'] * diff)       # the extra copies came from file1
    if diff < 0:
        out.extend([k + ',file2'] * abs(diff))  # the extra copies came from file2

with open('update.csv', 'w') as outFile:
    outFile.write('\n'.join(out))
Use pandas' compare:
import pandas as pd

f1 = pd.read_csv('file_1.csv')
f2 = pd.read_csv('file_2.csv')

changed = f1.compare(f2)
change = f1[f1.index.isin(changed.index)]
print(change)
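One caveat: DataFrame.compare (available from pandas 1.1) only accepts identically labeled frames (same shape, same row and column labels), so it suits files that differ in cell values rather than in row counts. A toy sketch of what it returns:

import pandas as pd

f1 = pd.DataFrame({'Age': [12, 10]}, index=['Vinoth', 'Karthick'])
f2 = pd.DataFrame({'Age': [12, 11]}, index=['Vinoth', 'Karthick'])

# only the differing cells come back, as 'self'/'other' column pairs
print(f1.compare(f2))
#            Age
#           self other
# Karthick    10    11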

Pandas - How to compare 2 CSV files and output changes

Situation
I have 2 CSVs, each 10k rows by 140 columns, that are largely identical, and I need to identify the differences. The headers are exactly the same and the rows are almost the same (maybe 100 of 10k have changed).
Example
File1.csv
ID,FirstName,LastName,Phone1,Phone2,Phone3
1,Bob,Jones,5555555555,4444444444,3333333333
2,Jim,Hill,2222222222,1111111111,0000000000
File2.csv
ID,FirstName,LastName,Phone1,Phone2,Phone3
1,Bob, Jones,5555555555,4444455444,3333333333
2,Jim, Hill,2222222222,1155111111,0005500000
3,Kim, Grant,2173659851,3214569874,3698521471
Outputfile.csv
ID,FirstName,LastName,Phone1,Phone2,Phone3
1,Bob,Jones,5555555555,4444444444,3333333333
2,Jim,Hill,2222222222,1111111111,0005500000
3,Kim, Grant,2173659851,3214569874,3698521471
I think I want the output to be File2.csv with the changes from File1.csv highlighted somehow. I'm new to Python and pandas and can't seem to figure out where to start. I did my best to search Google for something similar to adapt to my needs, but the scripts appeared to be too specific to the situation.
If someone knows of an easier/different way, I'm all ears. I don't care how this happens as long as I don't have to check record-by-record.
CSV generally doesn't support different fonts, but here's a solution that uses bold and colored output in the console (note: I only tested on Mac). If you're using Python 3.7+ (where dictionaries keep insertion order), the OrderedDict and the explicit columns list shouldn't be necessary.
from collections import OrderedDict
from csv import DictReader

class Color(object):
    GREEN = '\033[92m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    END = '\033[0m'

def load_csv(file):
    # Index by ID in order, and keep track of the original column order
    with open(file, 'r') as fp:
        reader = DictReader(fp, delimiter=',')
        rows = OrderedDict((r['ID'], r) for r in reader)
    return rows, reader.fieldnames

def print_row(row, cols, color, prefix):
    print(Color.BOLD + color + prefix + ','.join(row[c] for c in cols) + Color.END)

def print_diff(row1, row2, cols):
    row = []
    for col in cols:
        value1 = row1[col]
        if row2[col] != value1:
            row.append(Color.BOLD + Color.GREEN + value1 + Color.END)
        else:
            row.append(value1)
    print(','.join(row))

def diff_csv(file1, file2):
    rows1, cols = load_csv(file1)
    rows2, _ = load_csv(file2)
    for row_id, row1 in rows1.items():
        # Pop the matching ID row
        row2 = rows2.pop(row_id, None)
        # If not in file2, then it was added
        if not row2:
            print_row(row1, cols, Color.GREEN, '+')
        # In both files, print the diff
        else:
            print_diff(row1, row2, cols)
    # Anything remaining from file2 was removed in file1
    for row in rows2.values():
        print_row(row, cols, Color.RED, '-')
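Usage is then a single call (a sketch; the file names are assumed):

diff_csv('File1.csv', 'File2.csv')

Rows present only in the first argument print green with a '+', rows present only in the second print red with a '-', and changed cells in shared rows are highlighted in green.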
This can be done simply by using Python's built-in csv library. If you also care about the order of your entries, you can use an OrderedDict to maintain the original file order.
import csv

merged = []
with open('file1.csv', 'r', newline='') as f1, open('file2.csv', 'r', newline='') as f2:
    reader1 = csv.reader(f1, delimiter=",")
    reader2 = csv.reader(f2, delimiter=",")
    for line in reader1:
        merged.append(line)  # for the first file, add all the rows
    for line in reader2:
        # for the second file, only add a row if there is not an entry with the same ID already
        if not any(e[0] == line[0] for e in merged):
            merged.append(line)
        else:
            # otherwise find the differences and mark the changed cells with a trailing 'c'
            for e in merged:
                if e[0] == line[0]:
                    changedindexes = [i for i, (a, b) in enumerate(zip(e, line)) if a != b]
                    for i in changedindexes:
                        e[i] = e[i] + 'c'

with open('results.csv', 'w', newline='') as f3:
    c3 = csv.writer(f3, quoting=csv.QUOTE_ALL)
    for line in merged:  # write the merged rows into another csv
        c3.writerow(line)
As for bolding, there is no way to do that in CSV, as csv files contain data, not any formatting information.
A second way:
import numpy as np
import pandas as pd

# get the indices of the cells that differ (df1 and df2 are the two frames being compared)
difference_locations = np.where(df1 != df2)
# pull the old and new values at those positions
changed_from = df1.values[difference_locations]
changed_to = df2.values[difference_locations]
df_differences = pd.DataFrame({'from': changed_from, 'to': changed_to},
                              index=df1.index[difference_locations[0]])
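A toy usage sketch (hypothetical frames; this approach assumes the two frames have identical shape and labels):

import numpy as np
import pandas as pd

df1 = pd.DataFrame({'Phone1': ['555', '222'], 'Phone2': ['444', '111']})
df2 = pd.DataFrame({'Phone1': ['555', '222'], 'Phone2': ['445', '111']})

difference_locations = np.where(df1 != df2)
df_differences = pd.DataFrame({'from': df1.values[difference_locations],
                               'to': df2.values[difference_locations]},
                              index=df1.index[difference_locations[0]])
print(df_differences)
# one row at index 0: from '444' to '445'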

Comparing data between 4 csv files and writing them to separate output files

Could someone please advise me on how I could improve my code? I have 4 big csv files. The first is a reference file to which the 3 other files (file1, file2 and file3) are compared. In the files, there are three columns, and each row is a unit (e.g. ABC, DEF and GHI are 3 separate units).
col_1 col_2 col_3
A B C
D E F
G H I
I would like to compare file1, file2 and file3 to the reference file. If the unit in a given row of the reference file is present in all 3 files, I would like to write that row to file A. If it is present in at least 1 (but not all) of the 3 files, it should be written to file B. If it is not present in any of the 3 files, I would like to write it to file C. My current strategy is to read the files into 4 separate lists and compare them. I realize that this approach is memory intensive. In addition, my script has been running for a long time without producing final output. As such, I was wondering if there is a more efficient approach to this problem?
Below is my code:
import csv

reference_1 = open('reference.csv', 'rt', newline='')
reader = csv.reader(reference_1, delimiter=',')
file1 = open('file1.csv', 'rt', newline='')
reader1 = csv.reader(file1, delimiter=',')
file2 = open('file2.csv', 'rt', newline='')
reader2 = csv.reader(file2, delimiter=',')
file3 = open('file3.csv', 'rt', newline='')
reader3 = csv.reader(file3, delimiter=',')

Common = open('Common.csv', 'w', newline='')
writer1 = csv.writer(Common, delimiter=',')
Partial = open('Partial.csv', 'w', newline='')
writer2 = csv.writer(Partial, delimiter=',')
Absent = open('Absent.csv', 'w', newline='')
writer3 = csv.writer(Absent, delimiter=',')

reference = []
fileA = []
fileB = []
fileC = []

for row in reader:
    reference.append(row)
for row in reader1:
    fileA.append(row)
for row in reader2:
    fileB.append(row)
for row in reader3:
    fileC.append(row)

for row in reference:
    if row in fileA and row in fileB and row in fileC:
        writer1.writerow(row)
    elif row in fileA or row in fileB or row in fileC:
        writer2.writerow(row)
    else:
        writer3.writerow(row)

reference_1.close()
file1.close()
file2.close()
file3.close()
Common.close()
Partial.close()
Absent.close()
Assuming the order of the rows is not important and that there aren't duplicate rows in the reference file, here is an option using set.
def file_to_set(filename):
    """Opens a file and returns a set containing each line of the file."""
    with open(filename) as f:
        return set(f.read().splitlines(True))

def set_to_file(s, filename):
    """Writes a set to file."""
    with open(filename, 'w') as f:
        f.writelines(s)

def compare_files(ref_filename, *files):
    """Compares a reference file to two or more files."""
    if len(files) < 2:
        raise TypeError("compare_files expected at least 2 files, got %s" %
                        len(files))
    ref = file_to_set(ref_filename)
    file_data = [file_to_set(f) for f in files]
    all = file_data[0].union(*file_data[1:])
    common = ref.intersection(*file_data)
    partial = ref.intersection(all).difference(common)
    absent = ref.difference(all)
    set_to_file(common, 'common.csv')
    set_to_file(partial, 'partial.csv')
    set_to_file(absent, 'absent.csv')

compare_files('reference.csv', 'file1.csv', 'file2.csv', 'file3.csv')
The idea is:
Create sets containing each line of a file.
Make a set (all) that contains every line in every file (except the reference file).
Make a set (common) that contains only the lines that are in every file, including the reference file.
Make a set (partial) that contains the lines in the reference file that also appear in at least one but not all of the other files.
Make a set (absent) that contains the lines only present in the reference file.
Write common, partial, and absent to files.
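As a toy illustration of the set algebra involved (hypothetical three-row files reduced to plain sets of lines):

ref = {'A,B,C', 'D,E,F', 'G,H,I'}
f1 = {'A,B,C', 'D,E,F'}
f2 = {'A,B,C'}
f3 = {'A,B,C', 'D,E,F'}

union_all = f1 | f2 | f3              # every line seen in any of the 3 files
common = ref & f1 & f2 & f3           # {'A,B,C'}: present in all files
partial = (ref & union_all) - common  # {'D,E,F'}: present in some but not all
absent = ref - union_all              # {'G,H,I'}: present only in the reference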
