I have two files that I want to compare:
-File 1 contains SHA1 hash values and the file path of executable files. (Each line is a different file.)
-File 2 is a file containing known files and their SHA1 hash value and other data. (Again, each line is a different file.)
I want to compare each line of file 1 against each line of file 2. If there is a match, I want the script to stop comparing and start with the next line of file 1. If there is no match, I want to write that file hash and path from file 1 to file 3.
Also, the filesize of File 2 is about ~30GB - so the code needs to be optimized to do the searches as efficiently and fast as possible.
Here is what I have so far, but I am not getting the desired output.
#!/usr/bin/env python
# Asker's attempt, reproduced as posted (its indentation was lost in the paste).
# Goal stated in the question: write every File 1 entry whose SHA-1 does not
# appear in File 2 to File 3.
import sys
# Create/truncate the report file (argv[3]) and write its heading line.
with open(sys.argv[3], 'w') as f3:
f3.write('Hash Values of Unknown/Modified Executables:'+'\n')
# NOTE(review): if this call sits inside the `with` body it is redundant,
# because the `with` statement already closes f3 on exit.
f3.close()
with open(sys.argv[1], 'r') as f1:
# First 40 characters (the length of a SHA-1 hex digest) of the FIRST line only.
line_f1 = f1.readline()[0:40]
# NOTE(review): this second readline() advances to the NEXT line, so full_line
# is not the line that the hash in line_f1 was taken from.
full_line = f1.readline()
with open(sys.argv[2], 'r') as f2:
# Skip the first line of File 2 (the CSV header row in the sample data).
next(f2)
# Characters 1..40 skip the opening double quote; only ONE data line is read.
line_f2 = f2.readline()[1:41]
# NOTE(review): line_f2 is a string, so this loop walks its characters and
# rebinds line_f1 to each one; it never iterates the lines of either file.
for line_f1 in line_f2:
if (line_f1 == line_f2):
pass
elif (line_f1 != line_f2):
# With a 40-character line_f2, a single character never equals it, so this
# branch runs on every iteration and appends the same full_line each time.
with open(sys.argv[3], 'a') as f3:
f3.write(full_line)
Here is the sample data set that I am working with (The data has been modified from its original form to allow for some matches and some that do not match.):
File 1
00062169d823ecb98e47918d61be2fe8a84a303b /usr/test1
000864f38d0a8505ee8b5618a29039fe1e644fbe /usr/test2 #NO MATCH
001988b60a8fb6a22d2bfdd442c723b477f840fb /usr/test3 #NO MATCH
001a3627d954ed621910f3c79489a63db36916ba /usr/test4 #NO MATCH
001bf3fa581c7660216b34834d7ac8dc5a75a83f /usr/test5 #NO MATCH
00307660e0c54193b9cf3630d38312acf10b4093 /usr/bin/test6 #NO MATCH
00420df3e26140830e5e298e51d48d3f3c6ffc7e /usr/bin/test7 #NO MATCH
004d06de7875d2c20b0c5b29c3c658bce24b5869 /usr/bin/test8 #NO MATCH
00512e1dd13f0389322de86027b5b5ff74acc706 /usr/bin/test9
005691edd3987a833fcff92be169a71796a4972a /usr/bin/test10 #NO MATCH
0060b09755987013a7bbe5992866f08c39db9e6b /usr/sbin/test11
0063c0d2ab0c4e89dd87a455bb4142e2aa954b62 /usr/sbin/test12
00646760493c4fd9f4644572449c702b2c43116b /usr/sbin/test13
006732aaf4649b21878f4077af807ac34c71dd5d /usr/sbin/test14
006830799e5673d0d8bb988bfc43b6874661f90f /usr/sbin/test15
006cc4f1004af878422bb0775592769f0b7add42 /home/admin/Desktop/test16
009a4244a3e8932c91c1d1eb2057c67a2a15087f /home/admin/Desktop/test17
00a04e033f1191bb1a993777c28b5ba729ceac28 /home/admin/Desktop/test18
00aacb0db059d1d8ff0ccafbeeafc7e32ba4fd10 /home/admin/Desktop/test19
00ab49b182b0480f3e7eacff7f1d9505dc0a3a32 /home/admin/Desktop/test20
00b64e72857c63592031d2c682c4563e22a35b98 /home/admin/Documents/test21
00b73f67c6f4a6ffe4f9842f57271ed91225e530 /home/admin/Documents/test22
00ba66e659b9b519401ac69c8cf9f3901055ed42 /home/admin/Documents/test23 #NO MATCH
00c0f1ed488ed13f61f1d77d8b1bf1c3bee1b7e5 /home/admin/Documents/test24 #NO MATCH
00c784a2b0a2818e16ce96eddb676ab17f594e9b /home/admin/Documents/test25
File 2
"SHA-1","MD5","CRC32","FileName","FileSize","ProductCode","OpSystemCode","SpecialCode"
"00b73f67c6f4a6ffe4f9842f57271ed91225e530","344428FA4BA313712E4CA9B16D089AC4","7516A25F",".text._ZNSt14overflow_errorC1ERKSs",33,219181,"362",""
"00b64e72857c63592031d2c682c4563e22a35b98","F46CA74CA3D89E9D3CF8D8E5CD77842D","2F9CC135","__DATA__mod_init_func",772,218747,"362",""
"00ab49b182b0480f3e7eacff7f1d9505dc0a3a32","8ED4B4ED952526D89899E723F3488DE4","7A5407CA","wow64_microsoft-windows-i..timezones.resources_31bf3856ad364e35_10.0.16299.579_de-de_f24979c73226184d.manifest",2520,190718,"362",""
"00aacb0db059d1d8ff0ccafbeeafc7e32ba4fd10","497C460BBA43530494F37DF7DE3A5FF4","46B80AC7","bpa10x.ko",12944,17066,"362",""
"00a04e033f1191bb1a993777c28b5ba729ceac28","7C36BE0D2BF2520D564D36C6F4241B4F","66E07FC3",".text",1130496,223308,"362",""
"009a4244a3e8932c91c1d1eb2057c67a2a15087f","EAEB051BACDB9D67605659E3DF80C48C","74F27585","package_3482_for_kb4462939~31bf3856ad364e35~amd64~~10.0.1.5.cat",10660,204580,"362",""
"006cc4f1004af878422bb0775592769f0b7add42","E7990319759290BB6E0D17D7C685D203","F6A2F49D","ultoa.o",692,220911,"362",""
"006830799e5673d0d8bb988bfc43b6874661f90f","9A872042A9CD96B4FB13901000B91982","97D3B7E8","microsoft-windows-internet-browserppipro-package~31bf3856ad364e35~x86~sl-si~10.0.19041.906.cat",8897,236128,"362",""
"006732aaf4649b21878f4077af807ac34c71dd5d","3491EE38124BF5382D0828C5209C83B5","6CC040F2","Batman_Seventies.POR",90,213814,"362",""
"0000030F6D93EC90BDEA54B08BF7B512B13F55F9","CC6B8BA59F74F251DBCA14962A156C9D","ECEDDFD8",".rodata",173816,220501,"362",""
"000003191A199BFA961C18A6F71FF2ED04D0F9DA","84B2CE4DC226E61470EC240593CCBFF3","CC6201BD",".rdata",5120,221574,"362",""
"0000034F77D9314B1B94DBDA3031BECE1198D067","FE330C56554EF007D38C89764864E365","71C6F991","arm64_49016ecbe73216140477e3b16492e87f_31bf3856ad364e35_10.0.17134.81_none_ae8f44b72b46370a.manifest",705,188511,"362",""
"000003802D91BC41F5C89BB6115903ABC35372AB","F85BA698CA9E66D39BA8E223602E136E","41195B49",".gnu.version",192,226194,"362",""
"00646760493c4fd9f4644572449c702b2c43116b","858DEA54B3CBE4664F6652C37180A8AE","210F55CB","ScBrPls1.A05D7955_E27E_48E7_843F_456A4A59DC3A",456632,226257,"362",""
"0063c0d2ab0c4e89dd87a455bb4142e2aa954b62","0DD50DF49C7E9C01B97038FAE5A077E1","7B608B44",".text",5460480,182069,"362",""
"0060b09755987013a7bbe5992866f08c39db9e6b","849C766653FB4C4C6E9727175FE4974B","16C39D0D",".rela.rodata",23328,263769,"362",""
"00512e1dd13f0389322de86027b5b5ff74acc706","DBAE160A16763542AA8C6A7DBCDB05C5","D6E3587F","GodHitWallAdditiveRight-SkeletalAnimation.bin",20896,198268,"362",""
"00c784a2b0a2818e16ce96eddb676ab17f594e9b","41D0DD202B31F022CDB92802567058A5","7AD24105","redbull.erp",8663417,257490,"362",""
"00000760AB4465B9CE24F569BB49958E36723DEF","8D7D1546A3F803D7B4D5428C756B8E36","50D73BD5","gtru_t26596_s54847_6_p_vzdavani_drPI.ogg",38039,200078,"362",""
"00000767994D0189AED15752A566C8D48E1CBDA0","093941287F5F5A9246395ECC29EDFD27","7ED4CBF1","ServDeps.dll.mui",5120,288137,"362",""
"0000079B4CB3F9C043F48DE6F28ED0A6B9708AC7","7C88ED5E462B23E653455484F4BD2D0A","8F32BA13",".data",216,263765,"362",""
"000007B928F4C211CC8ED3C9707196A7C5BA3AF8","68563E2BFC732E10E885BD2DCF49F2EF","34940E24","microsoft-windows-businessscanning-feature-package~31bf3856ad364e35~amd64~pt-br~6.1.7601.17514.mum",1541,201424,"362",""
"00062169d823ecb98e47918d61be2fe8a84a303b","8DFD90838375D367A11C9064092A6807","409D0B8D",".text",33792,163489,"362",""
A few remarks:
When using with open(..) you don't need to close your file, it's closed automatically at the end of the block
iterating through the lines of a file f is as simple as for line in f:
So you can do the following:
import sys


def write_unknown_hashes(candidates_path, known_path, report_path):
    """Report the entries of File 1 whose SHA-1 is not listed in File 2.

    File 1 (``candidates_path``) holds one ``<sha1> <path>`` entry per line and
    is small enough to keep in memory.  File 2 (``known_path``) is the large
    CSV of known files whose first field is the quoted SHA-1.  File 2 is
    streamed exactly once, instead of being re-read for every line of File 1,
    and the scan stops as soon as every candidate hash has been found.

    Hashes are compared case-insensitively because the known-file list mixes
    upper- and lower-case hex digests.

    Args:
        candidates_path: path of File 1.
        known_path: path of File 2.
        report_path: path of the report (File 3); it is overwritten.
    """
    # Keep the original lines in order so the report preserves File 1's order.
    entries = []
    with open(candidates_path, 'r') as f1:
        for line_f1 in f1:
            fields = line_f1.split()
            if not fields:
                continue  # blank line: nothing to look up
            entries.append((fields[0].lower(), line_f1))

    unmatched = {digest for digest, _ in entries}

    with open(known_path, 'r') as f2:
        for line_f2 in f2:
            if not unmatched:
                break  # every candidate is known; no need to read further
            # First CSV field, without its surrounding quotes.  The header
            # row yields "sha-1", which never equals a digest, so it needs
            # no special handling.
            known = line_f2.split(',', 1)[0].strip().strip('"').lower()
            unmatched.discard(known)

    with open(report_path, 'w') as f3:
        f3.write('Hash Values of Unknown/Modified Executables:' + '\n')
        f3.writelines(line for digest, line in entries if digest in unmatched)


if __name__ == '__main__':
    write_unknown_hashes(sys.argv[1], sys.argv[2], sys.argv[3])
I need to break up a 1.3m text file into smaller text files, starting a new file at the first row of each section. The data inputs will likely vary over time, so I'd like to automate the process. The data looks something like this (I'm open to any suggestions):
FirstLine test1
1 1 1
TIMESTEP Avg VARIANCE(mm^2) STD
2006-01-06T00:00:00Z 77.556335 114.23446 10.688052
2006-02-06T00:00:00Z 30.174097 20.363855 4.512633
2006-03-06T00:00:00Z 65.48971 146.99098 12.123984
2006-04-06T00:00:00Z 68.65635 335.42905 18.314722
2006-05-06T00:00:00Z 65.31086 121.24954 11.011337
2006-06-06T00:00:00Z 123.571075 172.97223 13.151891
FirstLine test2
1 1 1
TIMESTEP Avg VARIANCE(mm^2) STD
2006-01-06T00:00:00Z 66.34833 258.47723 16.077227
2006-02-06T00:00:00Z 16.08292 16.153652 4.0191607
2006-03-06T00:00:00Z 34.585014 185.23705 13.610182
Each output file should begin with a FirstLine row and contain every row up to, but not including, the next FirstLine row.
I've tried identifying the row number with this script:
def search_string_in_file(content, FirstLine):
    """Find every line of ``test.csv`` that contains ``FirstLine``.

    Defining this function produces no output by itself; it has to be called
    and its return value printed or otherwise used.

    Args:
        content: unused; kept so existing calls keep working.
        FirstLine: substring to look for in each line.

    Returns:
        A list of ``(line_number, line)`` tuples, one per matching line, with
        1-based line numbers and trailing whitespace stripped from the line.
    """
    # The former ``pd.DataFrame.from_string(...)`` call was removed: pandas has
    # no such constructor and its result was never used.  The debug print
    # inside the scan was dropped as well; inspect the returned list instead.
    # Open the file in read only mode
    with open('test.csv', 'r') as read_obj:
        # Read the lines one by one, keeping those that contain the string
        return [
            (line_number, line.rstrip())
            for line_number, line in enumerate(read_obj, start=1)
            if FirstLine in line
        ]
The above seems to run successfully, but there are no results and no errors.
Found a way to do this that actually cut some steps out.
# Collects each non-greedy run of text that ends in a newline followed by "#";
# re.S lets "." cross line breaks, and the newlines matched by the outer \n*
# are consumed but not part of the captured group.
# NOTE(review): re.M only affects ^ and $, which this pattern does not use;
# `data` is presumably the whole file read into one string; confirm.
found = re.findall(r'\n*(.*?\n\#)\n*', data, re.M | re.S)
What I'm trying to do is to open two CSV files and print only the lines in which the content of a column in file 1 and file 2 match. I already know that I should end up with 14 results, but instead the first line of the CSV file I'm working with gets printed 14 times. Where did I go wrong?
# Asker's attempt, reproduced as posted (its indentation was lost in the paste).
file1 = open("../dir/file1.csv", "r")
# NOTE(review): file1splitted is overwritten on every iteration, so it only
# ever holds the fields of one row at a time.
for line in file1:
# Naive split: a quoted field that contains a comma is broken apart.
file1splitted = line.strip().split(",")
file2 = open("../dir/file2.csv", "r")
for line in file2:
file2splitted = line.strip().split(",")
# NOTE(review): this second loop over file1 never uses its `line` variable; the
# comparison below reads file1splitted / file2splitted as set by the loops
# above, so the same file1 row can be printed repeatedly.
for line in file1:
# Column 0 of file1 is compared against column 2 of file2.
if file1splitted[0] == file2splitted [2]:
print (file1splitted[0],file1splitted[1], file2splitted[6], file2splitted[10], file2splitted[12])
file1.close()
file2.close()
You should be using the csv module for reading these files because splitting on commas is not reliable; it's fine for a single CSV column to contain values that themselves include commas.
I've added a couple of things to try to make this cleaner and to help you move forward in your learning:
I've used the with context manager that automatically closes a file once you're done reading it. No need for .close()
I've packaged the csv reading code into a function. Now we only need to write that part once and we can call the function with any file.
I've used the csv module to read the file. This will return a nested list of rows, each inner list representing a single row.
I've used a list comprehension which is a neater way of writing a for loop that creates a list. In this case, it's a list of all the items in the first column of file_1.
I've converted the list in Point 4 into a set. When we iterate through file_2, we can very quickly check whether a row value has been seen in file_1 (set lookup is O(1) rather than having to iterate through file_1 every single time).
The indices I print are from my own test files, you will need to adapt them to your own use-case.
import csv


def read_csv(file_name):
    """Return all rows of the CSV file ``file_name`` as a list of lists of strings."""
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires, so quoted fields containing line breaks are
    # read back unchanged.
    with open(file_name, newline='') as infile:  # Context manager to auto-close files at end
        reader = csv.reader(infile)
        #next(reader) remove the hash if you want to drop the headers
        return list(reader)


if __name__ == '__main__':
    file_1 = read_csv('file_1.csv')
    file_2 = read_csv('file_2.csv')

    # Make a set of file_1 column 0 with a list comprehension
    # (blank lines come back from csv.reader as empty rows, so skip them)
    file_1_vals = set([item[0] for item in file_1 if item])

    # Now iterate through file_2
    for row in file_2:
        # Rows too short to have a column 2 cannot match
        if len(row) > 2 and row[2] in file_1_vals:
            print(row[1])
import csv


def matching_rows(path1, path2):
    """Yield the selected fields of every file1/file2 row pair that matches.

    A pair matches when column 0 of the file1 row equals column 2 of the
    file2 row.  Results come out in file1 order and, within one file1 row, in
    file2 order.

    file2 is read once into a lookup table.  Nesting ``for line in file2``
    inside the file1 loop does not work: the file iterator is exhausted after
    the first pass, so only the first file1 row would ever be compared.

    Yields:
        Tuples ``(file1[0], file1[1], file2[6], file2[10], file2[12])``.
    """
    rows_by_key = {}
    with open(path2, newline='') as file2:
        for row2 in csv.reader(file2):
            if len(row2) > 12:  # needs columns 2, 6, 10 and 12
                rows_by_key.setdefault(row2[2].strip(), []).append(row2)

    with open(path1, newline='') as file1:
        for row1 in csv.reader(file1):
            if len(row1) < 2:
                continue  # blank or truncated line
            for row2 in rows_by_key.get(row1[0].strip(), ()):
                yield (row1[0], row1[1], row2[6], row2[10], row2[12])


if __name__ == '__main__':
    for fields in matching_rows("../dir/file1.csv", "../dir/file2.csv"):
        print(*fields)
if you provide your csv files then I can help you more.
I have a dataset of about 10 CSV files. I want to combine those files row-wise into a single CSV file.
What I tried:
# Asker's attempt, reproduced as posted (Python 2 syntax; indentation lost in the paste).
import csv
# Opened in append mode, so running the script again adds the rows a second time.
fout = open("claaassA.csv","a")
# first file:
writer = csv.writer(fout)
for line in open("a01.ihr.60.ann.csv"):
# `print line` is the Python 2 print statement.
print line
# NOTE(review): `line` is a plain string; writerow() treats its argument as a
# sequence of fields, so every character is written as its own column.
writer.writerow(line)
# now the rest:
# range(2, 10) yields 2..9, i.e. files a02 ... a09.
for num in range(2, 10):
print num
f = open("a0"+str(num)+".ihr.60.ann.csv")
#f.next() # skip the header
for line in f:
print line
writer.writerow(line)
#f.close() # not really needed
fout.close()
The question definitely needs more details (ideally examples of the inputs and expected output).
Given the little information provided, I will assume that you know that all files are valid CSV and that they all have the same number of lines (rows). I'll also assume that memory is not a concern (i.e. they are "small" files that fit together in memory). Furthermore, I assume that line endings are new line (\n).
If all these assumptions are valid, then you can do something like this:
from itertools import zip_longest

input_files = ['file1.csv', 'file2.csv', 'file3.csv']
output_file = 'output.csv'


def merge_columns(paths, destination):
    """Join the files in ``paths`` side by side, line by line, into ``destination``.

    Line ``i`` of the result is line ``i`` of every input joined with commas,
    like ``paste -d,``.  All inputs are held in memory at once.  When an input
    has fewer lines than the longest one, its missing lines count as empty
    strings, so unequal line counts neither raise nor silently leave rows
    un-extended.  An empty first file is handled like any other input.

    Args:
        paths: input file paths, in the column order wanted in the output.
        destination: path of the merged file; it is overwritten.
    """
    columns = []
    for infile in paths:
        with open(infile, 'r') as fh:
            # Drop the line terminator so it can be re-added once per output row.
            columns.append([line.rstrip('\n') for line in fh])

    with open(destination, 'w') as fh:
        for parts in zip_longest(*columns, fillvalue=''):
            fh.write(','.join(parts) + '\n')


if __name__ == '__main__':
    merge_columns(input_files, output_file)
There are probably more efficient ways, but this is a quick and dirty way to achieve what I think you are asking for.
The previous answer implicitly assumes we need to do this in python. If bash is an option then you could use the paste command. For example:
paste -d, file1.csv file2.csv file3.csv > output.csv
I don't fully understand why you use the csv library. Actually, it's enough to fill the output file with the lines from the given files (if they have the same column names and order).
input_path_list = [
    "a01.ihr.60.ann.csv",
    "a02.ihr.60.ann.csv",
    "a03.ihr.60.ann.csv",
    "a04.ihr.60.ann.csv",
    "a05.ihr.60.ann.csv",
    "a06.ihr.60.ann.csv",
    "a07.ihr.60.ann.csv",
    "a08.ihr.60.ann.csv",
    "a09.ihr.60.ann.csv",
]
output_path = "claaassA.csv"


def _with_newline(line):
    """Return ``line`` terminated by a newline, adding one only if it is missing."""
    return line if line.endswith("\n") else line + "\n"


def concatenate_csv(input_paths, destination):
    """Stack the rows of several CSV files with identical columns into one file.

    The header (first line) of the first non-empty input is written once; the
    first line of every later input is skipped.  Empty inputs are ignored.

    Args:
        input_paths: paths of the files to concatenate, in output order.
        destination: path of the combined file; it is overwritten.
    """
    header_written = False
    with open(destination, "w") as fout:
        for input_path in input_paths:
            with open(input_path) as fin:
                # next(fin, None) works on Python 3 (file objects have no
                # .next() method there) and does not raise on an empty file.
                header = next(fin, None)
                if header is None:
                    continue
                # it adds the header at the beginning and skips other headers
                if not header_written:
                    fout.write(_with_newline(header))
                    header_written = True
                # it adds all rows; a final line without a newline must not
                # run into the first row of the next file
                for line in fin:
                    fout.write(_with_newline(line))


if __name__ == "__main__":
    concatenate_csv(input_path_list, output_path)
Sorry if the title is a little confusing. I have two files, file1 and file2 both with many columns. I need to find common elements in a certain column, if they match, the whole line from file1 should be added to the matched line in file2:
e.g.:
file1.txt:
[a,b,c],
[x,e,y],
...
file2.txt:
[d,e,f],
[s,p,z],
...
Note, here just the element "e" matches, the result should (in a new file, but with all of the information in file2.txt) be:
newfile.txt:
[d,e,f],[x,e,y],
[s,p,z]
...
My idea:
# Asker's sketch, reproduced as posted (indentation lost; not runnable as written).
# NOTE(review): mode 'w' truncates file2.txt when it is opened, although the
# question asks for the result to go to a new file; confirm the intended target.
output = open('file2.txt', 'w')
for f in variants:
add = ""
if f[0] in sources:
# `???` is the asker's placeholder for the part being asked about; it is not valid Python.
add = ???
# Each record is written tab-separated, followed by whatever was put in `add`.
output.write("\t".join(f) + add + "\n")
output.close()
"variants" contains the list from file1.txt, I don't really understand how to add the rest of the information from file1.txt to the matching line in file2.txt, any help please!
from collections import defaultdict

# Zero-based indexing.
CERTAIN_COLUMN = 1


def parse_data(line):
    """Return the list of values in a ``[a,b,c],`` style line of text.

    The trailing comma is optional, so ``[a,b,c]`` (e.g. the last line of a
    file) parses to the same values as ``[a,b,c],``.
    """
    record = line.strip().rstrip(',').strip()
    if record.startswith('['):
        record = record[1:]
    if record.endswith(']'):
        record = record[:-1]
    return record.split(',')


def merge_matching(file1_path, file2_path, new_path, column=CERTAIN_COLUMN):
    """Write every file2 record, followed by the file1 records sharing its key.

    Two records match when their values in ``column`` are equal.  Every line
    of file2 is kept, matched or not, as the question requires; e.g.
    ``[d,e,f],`` with a matching ``[x,e,y],`` becomes ``[d,e,f],[x,e,y],``.
    Files are opened in text mode so the string operations work on Python 3.

    Args:
        file1_path: file whose records are appended to matching file2 lines.
        file2_path: file whose lines form the backbone of the output.
        new_path: output path; it is overwritten.
        column: zero-based index of the column to match on.
    """
    # The first loop populates the dict: key value -> matching file1 records.
    mapping = defaultdict(list)
    with open(file1_path) as f1:
        for line in f1:
            # Remove new-lines and the trailing comma so that joining records
            # later does not produce doubled commas.
            record = line.strip().rstrip(',')
            if not record:
                continue
            columns = parse_data(record)
            if len(columns) > column:
                mapping[columns[column]].append(record)

    with open(file2_path) as f2, open(new_path, 'w') as new_file:
        for line in f2:
            record = line.strip().rstrip(',')
            if not record:
                continue
            columns = parse_data(record)
            key = columns[column] if len(columns) > column else None
            # .get() avoids inserting an empty list for every unmatched key.
            matched = mapping.get(key, [])
            new_file.write(','.join([record] + matched) + ',\n')


if __name__ == '__main__':
    merge_matching('file1.txt', 'file2.txt', 'newfile.txt')
The first loop populates the dict with search_criteria -> matched rows mapping, i.e e -> ['[x,e,y]'].
The second loop prints all entries that meet the search criteria for each line of file2.txt