Comparing two files with Python, iterating line by line
I have two files that I want to compare:
- File 1 contains SHA1 hash values and the file paths of executable files. (Each line is a different file.)
- File 2 contains known files, their SHA1 hash values, and other data. (Again, each line is a different file.)
I want to compare each line of file 1 against each line of file 2. If there is a match, I want the script to stop comparing and move on to the next line of file 1. If there is no match, I want to write that file's hash and path from file 1 to file 3.
Also, file 2 is roughly 30 GB, so the code needs to be optimized to do the searches as efficiently and quickly as possible.
Here is what I have so far, but I am not getting the desired output.
#!/usr/bin/env python
import sys

with open(sys.argv[3], 'w') as f3:
    f3.write('Hash Values of Unknown/Modified Executables:'+'\n')
    f3.close()

with open(sys.argv[1], 'r') as f1:
    line_f1 = f1.readline()[0:40]
    full_line = f1.readline()
    with open(sys.argv[2], 'r') as f2:
        next(f2)
        line_f2 = f2.readline()[1:41]
        for line_f1 in line_f2:
            if (line_f1 == line_f2):
                pass
            elif (line_f1 != line_f2):
                with open(sys.argv[3], 'a') as f3:
                    f3.write(full_line)
Here is the sample data set that I am working with (the data has been modified from its original form so that some lines match and some do not):
File 1
00062169d823ecb98e47918d61be2fe8a84a303b /usr/test1
000864f38d0a8505ee8b5618a29039fe1e644fbe /usr/test2 #NO MATCH
001988b60a8fb6a22d2bfdd442c723b477f840fb /usr/test3 #NO MATCH
001a3627d954ed621910f3c79489a63db36916ba /usr/test4 #NO MATCH
001bf3fa581c7660216b34834d7ac8dc5a75a83f /usr/test5 #NO MATCH
00307660e0c54193b9cf3630d38312acf10b4093 /usr/bin/test6 #NO MATCH
00420df3e26140830e5e298e51d48d3f3c6ffc7e /usr/bin/test7 #NO MATCH
004d06de7875d2c20b0c5b29c3c658bce24b5869 /usr/bin/test8 #NO MATCH
00512e1dd13f0389322de86027b5b5ff74acc706 /usr/bin/test9
005691edd3987a833fcff92be169a71796a4972a /usr/bin/test10 #NO MATCH
0060b09755987013a7bbe5992866f08c39db9e6b /usr/sbin/test11
0063c0d2ab0c4e89dd87a455bb4142e2aa954b62 /usr/sbin/test12
00646760493c4fd9f4644572449c702b2c43116b /usr/sbin/test13
006732aaf4649b21878f4077af807ac34c71dd5d /usr/sbin/test14
006830799e5673d0d8bb988bfc43b6874661f90f /usr/sbin/test15
006cc4f1004af878422bb0775592769f0b7add42 /home/admin/Desktop/test16
009a4244a3e8932c91c1d1eb2057c67a2a15087f /home/admin/Desktop/test17
00a04e033f1191bb1a993777c28b5ba729ceac28 /home/admin/Desktop/test18
00aacb0db059d1d8ff0ccafbeeafc7e32ba4fd10 /home/admin/Desktop/test19
00ab49b182b0480f3e7eacff7f1d9505dc0a3a32 /home/admin/Desktop/test20
00b64e72857c63592031d2c682c4563e22a35b98 /home/admin/Documents/test21
00b73f67c6f4a6ffe4f9842f57271ed91225e530 /home/admin/Documents/test22
00ba66e659b9b519401ac69c8cf9f3901055ed42 /home/admin/Documents/test23 #NO MATCH
00c0f1ed488ed13f61f1d77d8b1bf1c3bee1b7e5 /home/admin/Documents/test24 #NO MATCH
00c784a2b0a2818e16ce96eddb676ab17f594e9b /home/admin/Documents/test25
File 2
"SHA-1","MD5","CRC32","FileName","FileSize","ProductCode","OpSystemCode","SpecialCode"
"00b73f67c6f4a6ffe4f9842f57271ed91225e530","344428FA4BA313712E4CA9B16D089AC4","7516A25F",".text._ZNSt14overflow_errorC1ERKSs",33,219181,"362",""
"00b64e72857c63592031d2c682c4563e22a35b98","F46CA74CA3D89E9D3CF8D8E5CD77842D","2F9CC135","__DATA__mod_init_func",772,218747,"362",""
"00ab49b182b0480f3e7eacff7f1d9505dc0a3a32","8ED4B4ED952526D89899E723F3488DE4","7A5407CA","wow64_microsoft-windows-i..timezones.resources_31bf3856ad364e35_10.0.16299.579_de-de_f24979c73226184d.manifest",2520,190718,"362",""
"00aacb0db059d1d8ff0ccafbeeafc7e32ba4fd10","497C460BBA43530494F37DF7DE3A5FF4","46B80AC7","bpa10x.ko",12944,17066,"362",""
"00a04e033f1191bb1a993777c28b5ba729ceac28","7C36BE0D2BF2520D564D36C6F4241B4F","66E07FC3",".text",1130496,223308,"362",""
"009a4244a3e8932c91c1d1eb2057c67a2a15087f","EAEB051BACDB9D67605659E3DF80C48C","74F27585","package_3482_for_kb4462939~31bf3856ad364e35~amd64~~10.0.1.5.cat",10660,204580,"362",""
"006cc4f1004af878422bb0775592769f0b7add42","E7990319759290BB6E0D17D7C685D203","F6A2F49D","ultoa.o",692,220911,"362",""
"006830799e5673d0d8bb988bfc43b6874661f90f","9A872042A9CD96B4FB13901000B91982","97D3B7E8","microsoft-windows-internet-browserppipro-package~31bf3856ad364e35~x86~sl-si~10.0.19041.906.cat",8897,236128,"362",""
"006732aaf4649b21878f4077af807ac34c71dd5d","3491EE38124BF5382D0828C5209C83B5","6CC040F2","Batman_Seventies.POR",90,213814,"362",""
"0000030F6D93EC90BDEA54B08BF7B512B13F55F9","CC6B8BA59F74F251DBCA14962A156C9D","ECEDDFD8",".rodata",173816,220501,"362",""
"000003191A199BFA961C18A6F71FF2ED04D0F9DA","84B2CE4DC226E61470EC240593CCBFF3","CC6201BD",".rdata",5120,221574,"362",""
"0000034F77D9314B1B94DBDA3031BECE1198D067","FE330C56554EF007D38C89764864E365","71C6F991","arm64_49016ecbe73216140477e3b16492e87f_31bf3856ad364e35_10.0.17134.81_none_ae8f44b72b46370a.manifest",705,188511,"362",""
"000003802D91BC41F5C89BB6115903ABC35372AB","F85BA698CA9E66D39BA8E223602E136E","41195B49",".gnu.version",192,226194,"362",""
"00646760493c4fd9f4644572449c702b2c43116b","858DEA54B3CBE4664F6652C37180A8AE","210F55CB","ScBrPls1.A05D7955_E27E_48E7_843F_456A4A59DC3A",456632,226257,"362",""
"0063c0d2ab0c4e89dd87a455bb4142e2aa954b62","0DD50DF49C7E9C01B97038FAE5A077E1","7B608B44",".text",5460480,182069,"362",""
"0060b09755987013a7bbe5992866f08c39db9e6b","849C766653FB4C4C6E9727175FE4974B","16C39D0D",".rela.rodata",23328,263769,"362",""
"00512e1dd13f0389322de86027b5b5ff74acc706","DBAE160A16763542AA8C6A7DBCDB05C5","D6E3587F","GodHitWallAdditiveRight-SkeletalAnimation.bin",20896,198268,"362",""
"00c784a2b0a2818e16ce96eddb676ab17f594e9b","41D0DD202B31F022CDB92802567058A5","7AD24105","redbull.erp",8663417,257490,"362",""
"00000760AB4465B9CE24F569BB49958E36723DEF","8D7D1546A3F803D7B4D5428C756B8E36","50D73BD5","gtru_t26596_s54847_6_p_vzdavani_drPI.ogg",38039,200078,"362",""
"00000767994D0189AED15752A566C8D48E1CBDA0","093941287F5F5A9246395ECC29EDFD27","7ED4CBF1","ServDeps.dll.mui",5120,288137,"362",""
"0000079B4CB3F9C043F48DE6F28ED0A6B9708AC7","7C88ED5E462B23E653455484F4BD2D0A","8F32BA13",".data",216,263765,"362",""
"000007B928F4C211CC8ED3C9707196A7C5BA3AF8","68563E2BFC732E10E885BD2DCF49F2EF","34940E24","microsoft-windows-businessscanning-feature-package~31bf3856ad364e35~amd64~pt-br~6.1.7601.17514.mum",1541,201424,"362",""
"00062169d823ecb98e47918d61be2fe8a84a303b","8DFD90838375D367A11C9064092A6807","409D0B8D",".text",33792,163489,"362",""
A few remarks:
- When using with open(...) you don't need to close your file; it is closed automatically at the end of the block.
- Iterating through the lines of a file f is as simple as for line in f:
So you can do the following:
import sys

with open(sys.argv[3], 'w') as f3:
    f3.write('Hash Values of Unknown/Modified Executables:'+'\n')
    with open(sys.argv[1], 'r') as f1:
        for line_f1 in f1:
            matched = False
            hash1, path1, *_ = line_f1.split()
            with open(sys.argv[2], 'r') as f2:
                for line_f2 in f2:
                    if hash1 in line_f2:
                        matched = True
                        break
            if not matched:
                f3.write(line_f1)
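Since File 2 is around 30 GB, re-reading it once per line of File 1 will be slow. A minimal sketch of a faster approach, under the assumption that File 1 comfortably fits in memory and that the argument order is the same as above (File 1, File 2, output file): read File 1's hashes into a dict once, then stream File 2 a single time and drop every hash that matches. The .lower() normalization is an assumption to cover the uppercase SHA-1 values that appear in File 2.

#!/usr/bin/env python3
import sys

# Read File 1 once: map each hash to its full line (hash + path).
# Assumes File 1 fits in memory, unlike the ~30 GB File 2.
unknown = {}
with open(sys.argv[1], 'r') as f1:
    for line in f1:
        if line.strip():
            unknown[line.split()[0].lower()] = line

# Stream File 2 a single time; drop every hash we find.
with open(sys.argv[2], 'r') as f2:
    next(f2)  # skip the CSV header row
    for line in f2:
        sha1 = line.split(',', 1)[0].strip('"').lower()
        unknown.pop(sha1, None)
        if not unknown:
            break  # everything matched, no need to read further

# Whatever is left in the dict had no match in File 2.
with open(sys.argv[3], 'w') as f3:
    f3.write('Hash Values of Unknown/Modified Executables:\n')
    f3.writelines(unknown.values())

Whatever survives is written out in File 1's original order (dicts preserve insertion order in Python 3.7+), and the big file is only read once instead of once per File 1 line.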
Related
Adding a comma to end of first row of csv files within a directory using python
I've got some code that lets me open all CSV files in a directory and run through them, removing the top two lines of each file. Ideally, during this process I would also like it to add a single comma at the end of the new first line (what would originally have been line 3). Another possible approach could be to remove the trailing commas on all other rows that appear in each of the CSVs. Any thoughts or approaches would be gratefully received.

import glob

path='P:\pytest'
for filename in glob.iglob(path+'/*.csv'):
    with open(filename, 'r') as f:
        lines = f.read().split("\n")
        f.close()
    if len(lines) >= 1:
        lines = lines[2:]
        o = open(filename, 'w')
        for line in lines:
            o.write(line+'\n')
        o.close()
adding a counter in there can solve this:

import glob

path=r'C:/Users/dsqallihoussaini/Desktop/dev_projects/stack_over_flow'
for filename in glob.iglob(path+'/*.csv'):
    with open(filename, 'r') as f:
        lines = f.read().split("\n")
        print(lines)
        f.close()
    if len(lines) >= 1:
        lines = lines[2:]
        o = open(filename, 'w')
        counter=0
        for line in lines:
            counter=counter+1
            if counter==1:
                o.write(line+',\n')
            else:
                o.write(line+'\n')
        o.close()
One possible problem with your code is that you are reading the whole file into memory, which might be fine. If you are reading larger files, then you want to process the file line by line. The easiest way to do that is to use the fileinput module: https://docs.python.org/3/library/fileinput.html

Something like the following should work:

#!/usr/bin/env python3

import glob
import fileinput

# inplace makes a backup of the file, then any output to stdout is written
# to the current file.
# change the glob..below is just an example.
#
# Iterate through each file in the glob.iglob() results
with fileinput.input(files=glob.iglob('*.csv'), inplace=True) as f:
    for line in f:  # Iterate over each line of the current file.
        if f.filelineno() > 2:  # Skip the first two lines
            # Note: 'line' has the newline in it.
            # Insert the comma if line 3 of the file, otherwise output original line
            print(line[:-1]+',') if f.filelineno() == 3 else print(line, end="")
I've added some encoding as well, as mine was throwing an error, but encoding fixed that up nicely:

import glob

path=r'C:/whateveryourfolderis'
for filename in glob.iglob(path+'/*.csv'):
    with open(filename, 'r',encoding='utf-8') as f:
        lines = f.read().split("\n")
        #print(lines)
        f.close()
    if len(lines) >= 1:
        lines = lines[2:]
        o = open(filename, 'w',encoding='utf-8')
        counter=0
        for line in lines:
            counter=counter+1
            if counter==1:
                o.write(line+',\n')
            else:
                o.write(line+'\n')
        o.close()
Python prints two lines in the same line when merging files
I am new to Python, I'm getting this result, and I am not sure how to fix it efficiently. I have n files (let's say for simplicity just two) with some info in this format:

1.250484649 4.00E-02
2.173737246 4.06E-02
... ...

This continues up to m lines. I'm trying to append all the m lines from the n files into a single file. I prepared this code:

import glob

outfile=open('temp.txt', 'w')
for inputs in glob.glob('*.dat'):
    infile=open(inputs,'r')
    for row in infile:
        outfile.write(row)

It reads all the .dat files (the ones I am interested in) and it does what I want, but it merges the last line of the first file and the first line of the second file into a single line:

1.250484649 4.00E-02
2.173737246 4.06E-02
3.270379524 2.94E-02
3.319202217 6.56E-02
4.228424345 8.91E-03
4.335169497 1.81E-02
4.557886098 6.51E-02
5.111075901 1.50E-02
5.547288248 3.34E-02
5.685118615 3.22E-03
5.923718239 2.86E-02
6.30299944 8.05E-03
6.528018125 1.25E-020.704223685 4.98E-03
1.961058114 3.07E-03
... ...

I'd like to fix this in a smart way. I can fix it if I introduce a blank line between each data line and then at the end remove all the blank lines, but this seems suboptimal. Thank you!
There's no newline on the last line of each .dat file, so you'll need to add it:

import glob

with open('temp.txt', 'w') as outfile:
    for inputs in glob.glob('*.dat'):
        with open(inputs, 'r') as infile:
            for row in infile:
                if not row.endswith("\n"):
                    row = f"{row}\n"
                outfile.write(row)

Also using with (context managers) to automatically close the files afterwards.

To avoid a trailing newline - there's a few ways to do this, but the simplest one that comes to mind is to load all the input data into memory as individual lines, then write it out in one go using "\n".join(lines). This puts "\n" between each line but not at the end of the last line in the file.

import glob

lines = []
for inputs in glob.glob('*.dat'):
    with open(inputs, 'r') as infile:
        lines += [line.rstrip('\n') for line in infile.readlines()]

with open('temp.txt', 'w') as outfile:
    outfile.write('\n'.join(lines))

[line.rstrip('\n') for line in infile.readlines()] - this is a list comprehension. It makes a list of each line in an individual input file, with the '\n' removed from the end of the line. It can then be += appended to the overall list of lines.

While we're here - let's use logging to give status updates:

import glob
import logging

OUT_FILENAME = 'test.txt'

lines = []
for inputs in glob.glob('*.dat'):
    logging.info(f'Opening {inputs} to read...')
    with open(inputs, 'r') as infile:
        lines += [line.rstrip('\n') for line in infile.readlines()]
    logging.info(f'Finished reading {inputs}')

logging.info(f'Opening {OUT_FILENAME} to write...')
with open(OUT_FILENAME, 'w') as outfile:
    outfile.write('\n'.join(lines))
logging.info(f'Finished writing {OUT_FILENAME}')
Generate output file Python
As can be seen in the code, I created two output files: one for the output after splitting, and a second for the actual output after removing duplicate lines. How can I make only one output file? Sorry if I sound too stupid, I'm a beginner.

import sys

txt = sys.argv[1]
lines_seen = set() # holds lines already seen
outfile = open("out.txt", "w")
actualout = open("output.txt", "w")

for line in open(txt, "r"):
    line = line.split("?", 1)[0]
    outfile.write(line+"\n")
outfile.close()

for line in open("out.txt", "r"):
    if line not in lines_seen: # not a duplicate
        actualout.write(line)
        lines_seen.add(line)
actualout.close()
You can add the lines from the input file directly into the set. Since sets cannot have duplicates, you don't even need to check for those. Try this:

import sys

txt = sys.argv[1]
lines_seen = set() # holds lines already seen
actualout = open("output.txt", "w")

for line in open(txt, "r"):
    line = line.split("?", 1)[0]
    lines_seen.add(line + "\n")

for line in lines_seen:
    actualout.write(line)
actualout.close()
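One aside worth noting: a plain set has no defined order, so the deduplicated lines may come out in a different order than the input. If the original order matters, a minimal sketch (assuming the same command-line usage as above) is to deduplicate while writing, in a single pass:

import sys

txt = sys.argv[1]
lines_seen = set()  # holds lines already seen

# Deduplicate in one pass while keeping the original line order,
# using a with-block so both files are closed automatically.
with open("output.txt", "w") as actualout, open(txt, "r") as infile:
    for line in infile:
        line = line.split("?", 1)[0] + "\n"
        if line not in lines_seen:  # first time we see this line
            actualout.write(line)
            lines_seen.add(line)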
In the first step you iterate through every line in the file, split the line on your discriminator, and store the results in a list. After that you iterate through the list and write it into your output file.

import sys

txt = sys.argv[1]
lines_seen = set() # holds lines already seen
actualout = open("output.txt", "w")

data = [line.split("?", 1)[0] for line in open("path/to/file/here", "r")]

for line in data:
    if line not in lines_seen: # not a duplicate
        actualout.write(line)
        lines_seen.add(line)
actualout.close()
Replacing text from one file from another file
The f1.write(line2) works, but it does not replace the text in the file; it just adds it to the file. I want file1 to be identical to file2 if they are different, by overwriting the text from file1 with the text from file2. Here is my code:

with open("file1.txt", "r+") as f1, open("file2.txt", "r") as f2:
    for line1 in f1:
        for line2 in f2:
            if line1 == line2:
                print("same")
            else:
                print("different")
                f1.write(line2)
            break

f1.close()
f2.close()
I would read both files, create a new list with the different elements replaced, and then write the entire list to the file.

with open('file2.txt', 'r') as f:
    content = [line.strip() for line in f]

with open('file1.txt', 'r') as j:
    content_a = [line.strip() for line in j]

for idx, item in enumerate(content_a):
    if content_a[idx] == content[idx]:
        print('same')
        pass
    else:
        print('different')
        content_a[idx] = content[idx]

with open('file1.txt', 'w') as k:
    k.write('\n'.join(content_a))

file1.txt before:

chrx#chrx:~/python/stackoverflow/9.28$ cat file1.txt
this
that
this
that
who #replacing
that
what
blah

code output:

same
same
same
same
different
same
same
same

file1.txt after:

chrx#chrx:~/python/stackoverflow/9.28$ cat file1.txt
this
that
this
that
vash #replaced who
that
what
blah
I want the file1 to be identical to file2

import shutil

with open('file2', 'rb') as f2, open('file1', 'wb') as f1:
    shutil.copyfileobj(f2, f1)

This will be faster, as you don't have to read file1.

Your code is not working because you'd have to position the file's current pointer (with f1.seek()) at the correct position to write the line. In your code, you're reading a line first, and that positions the pointer after the line just read. When writing, the line data will be written to the file at that point, thus duplicating the line.

Since lines can have different sizes, making this work won't be easy: even if you position the pointer correctly, if some line is modified to get bigger it would overwrite part of the next line in the file when you write it. You would end up having to cache at least part of the file contents in memory anyway. Better to truncate the file (erase its contents) and write the other file's data directly; then they will be identical. That's what the code in this answer does.
Python script to find the rows which are not common
I have two files as follows. The first objective is to get the rows which are not common between 1.csv and 2.csv by comparing the first 14 digits of the first column. The second objective is: if the first column in 1.csv matches any of the first columns in 2.csv, compare that row's second column with the second column in 1.csv and print the row which is not present in 1.csv but is present in 2.csv. My script is below, but I am not able to get the desired output.

import csv

t1 = open('1.csv', 'r')
t2 = open('2.csv', 'r')
fileone = t1.readlines()
filetwo = t2.readlines()
t1.close()
t2.close()

outFile = open('update.csv', 'w')
x = 0
for i in fileone:
    if i != filetwo[x]:
        outFile.write(filetwo[x])
    x += 1
outFile.close()
If the format is fixed as given, one solution would be to split each line into two pieces so you can compare only the first 14 digits, as you requested. Your current solution only does a line-by-line comparison. If you split up the lines, you can iterate over the data of either file and use a simple in check to see whether the line is in the other file; a sketch of that idea follows.
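A minimal sketch of that approach, assuming the 1.csv/2.csv file names from the question and that the first 14 characters of each line form the key being compared; a set is used here only to make the in check fast:

# Collect the 14-character prefixes of one file, then use a simple
# membership test against the other file. Assumes 2.csv fits in memory.
with open('2.csv', 'r') as f2:
    keys_in_2 = {line[:14] for line in f2}

with open('1.csv', 'r') as f1, open('update.csv', 'w') as out:
    for line in f1:
        if line[:14] not in keys_in_2:  # no row in 2.csv shares this prefix
            out.write(line)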
First thing: always use with when handling files; it takes one line less and you will never forget to close the files.

with open('1.csv', 'r') as file1, open('2.csv', 'r') as file2:
    file1_lines = file1.readlines()
    file2_lines = file2.readlines()

file1_headers = [line[:14] for line in file1_lines]
file2_headers = [line[:14] for line in file2_lines]

with open('update.csv', 'w') as out_file:
    # Objective 1: lines that have their first 14 digits in one file only
    for line in file1_lines:
        if line[:14] not in file2_headers:
            out_file.write(line)
    for line in file2_lines:
        if line[:14] not in file1_headers:
            out_file.write(line)

    # Objective 2: lines that are in file 2 but not in file 1
    for line in file2_lines:
        if line not in file1_lines:
            out_file.write(line)

Your code doesn't mention 14 anywhere, which should alert you in the first place ;-) cheers!