file hashing by identifying files by first 4 bytes - python

I'm trying to write a Python script that searches through my current directory, identifies JPGs by their header, and then hashes those files. I'm kind of all over the place. Any suggestions would be appreciated.
from os import listdir, getcwd
from os.path import isfile, join, normpath, basename
import hashlib
# Magic number: the first four bytes of a JFIF-style JPEG file.
jpgHead = b'\xff\xd8\xff\xe0'


def get_files():
    """Return absolute paths of all regular files in the current directory."""
    current_path = normpath(getcwd())
    return [join(current_path, f) for f in listdir(current_path)
            if isfile(join(current_path, f))]


def checkJPG(path):
    """Return True if the file at *path* starts with the JPEG header bytes.

    Fixed: the original referenced itself (``checkJPG=checkJPG.read(4)``)
    and could never run; it now opens the given file and checks its header.
    """
    with open(path, 'rb') as f:
        return f.read(4) == jpgHead


def _md5_of_file(path):
    """Return the MD5 hex digest of the file's contents, read in 4 KiB chunks."""
    hash_md5 = hashlib.md5()
    with open(path, 'rb') as f:
        # Chunked reads keep memory use constant for large files.
        for chunk in iter(lambda: f.read(4096), b''):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def get_hashes():
    """Return 'Filename: ...\tHash: ...' lines for every JPEG in the cwd.

    Fixed: the original never fed any file data into the md5 object, so every
    reported hash was the digest of the empty string.
    """
    list_of_hashes = []
    for each_file in get_files():
        if checkJPG(each_file):
            list_of_hashes.append('Filename: {}\tHash: {}\n'.format(
                basename(each_file), _md5_of_file(each_file)))
    return list_of_hashes


def write_jpgHashes():
    """Write the collected hash lines to list_of_hashes.txt in the cwd."""
    hashes = get_hashes()
    with open('list_of_hashes.txt', 'w') as f:
        for md5_hash in hashes:
            f.write(md5_hash)


if __name__ == '__main__':
    write_jpgHashes()

I modified some of your functions a bit — give it a try:
from os import listdir, getcwd
from os.path import isfile, join, normpath, basename
import hashlib
# Magic number: the first four bytes of a JFIF-style JPEG file.
jpgHead = b'\xff\xd8\xff\xe0'


def get_files(path=None):
    """Return absolute paths of every regular file directly under *path*.

    Defaults to the current working directory. Fixed: the previous default
    ``path=getcwd()`` was evaluated once at import time, so later directory
    changes were silently ignored; ``None`` defers the lookup to call time.
    """
    current_path = normpath(path if path is not None else getcwd())
    return [join(current_path, f) for f in listdir(current_path)
            if isfile(join(current_path, f))]


def checkJPG(path):
    """Return True if the file at *path* starts with the JPEG header bytes."""
    with open(path, 'rb') as f:
        header = f.read(4)
    return header == jpgHead


def get_hashes():
    """Return 'Filename: ...\tHash: ...' lines for every JPEG in the cwd."""
    list_of_hashes = []
    for each_file in get_files():
        if checkJPG(each_file):
            list_of_hashes.append(
                'Filename: {}\tHash: {}\n'.format(each_file, md5hf(each_file)))
    return list_of_hashes


def md5hf(path):
    """Return the MD5 hex digest of the file at *path*.

    For small files ``hashlib.md5(open(path, "rb").read()).hexdigest()``
    would do; chunked reads keep memory use constant for large files.
    """
    hash_md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def write_jpgHashes():
    """Write the collected hash lines to list_of_hashes.txt in the cwd."""
    hashes = get_hashes()
    with open('list_of_hashes.txt', 'w') as f:
        for md5_hash in hashes:
            f.write(md5_hash)


if __name__ == '__main__':
    write_jpgHashes()
Notes :
Fixed some syntax and indentation errors
Turned checkJPG into a boolean predicate that takes a path
Added the md5 hash of files to list_of_hashes in get_hashes
Added the md5hf function , to get the md5 checksum

Related

How to set Image path in Python

I have the below python code that selects all the images in the same directory of the python file and hashes them in order to receive a hash txt file.
I would prefer to set only the needed image and not to hash all the images.
Can you please advise how and where I can set the name of the image or the path? (e.g., to hash only the following image: C:\Users\admin\Desktop\TD\image.png)
THE CODE is the below :
import os
import hashlib
import logging
# Digest lines are appended to this file; format is the bare message only.
logging.basicConfig(filename='InitializationHash.txt', level=logging.INFO,
                    format='%(message)s')

# Extensions (lower-case) that count as images.
image_ext = ['.png', '.jpg']


def hash_image(filepath):
    """Log the SHA-256 hex digest of the file at *filepath* and return it.

    Returning the digest (new, backward compatible) lets callers use the
    value directly instead of re-reading the log file.
    """
    with open(filepath, 'rb') as f:
        file_bytes = f.read()
    hash_text = hashlib.sha256(file_bytes).hexdigest()
    logging.info(hash_text)
    return hash_text


def get_images(path):
    """Recursively hash every image file under *path*."""
    for f in os.listdir(path):
        full_path = os.path.join(path, f)
        if os.path.isdir(full_path):
            get_images(full_path)
        else:
            # Fixed: compare case-insensitively so '.PNG' / '.JPG' match too.
            ext = os.path.splitext(full_path)[1].lower()
            if ext in image_ext:
                hash_image(full_path)


if __name__ == '__main__':
    get_images(".")
If you are trying to pass in the name of a file and hash only that one file, you can create a new function like this:
def get_one_image(file_Name):
    """Resolve *file_Name* to an absolute path and hash that single image."""
    hash_image(os.path.abspath(file_Name))


if __name__ == '__main__':
    get_one_image("FileName.png")

how to copy content of file to another file with deleting some lines in a TXT file based on some strings with python

I have this Python script that opens a file dialog and selects a text file, then copies its content to another file. What I need to do before copying to the next file is to delete several lines, based on some strings that are predefined in the script.
The problem is that the file is copied as-is, without deleting anything.
Can anyone help me solve this issue?
OPenDirectory.py
#!/usr/bin/python
import Tkinter
from os import listdir
from os.path import isfile, join
import tkFileDialog
def readWrite():
unwanted = set(['thumbnails', 'tikare', 'cache'])
mypath = "C:\Users\LT GM\Desktop/"
Tkinter.Tk().withdraw()
in_path = tkFileDialog.askopenfile(initialdir = mypath, filetypes=[('text files', ' TXT ')])
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
for file in files:
if file.split('.')[1] == 'txt':
outputFileName = 'Sorted-' + file
with open(mypath + outputFileName, 'w') as w:
with open(mypath + '/' + file) as f:
for l in f:
if l != unwanted:
print l
w.write(l)
in_path.close()
if __name__== "__main__":
readWrite()
ChangedScipt
#!/usr/bin/python
import Tkinter
from os import listdir
from os.path import isfile, join
import tkFileDialog
def readWrite():
unwanted = set(['thumbnails', 'tikare', 'cache'])
mypath = "C:\Users\LT GM\Desktop/"
Tkinter.Tk().withdraw()
in_path = tkFileDialog.askopenfile(initialdir = mypath, filetypes=[('text files', ' TXT ')])
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
for file in files:
if file.split('.')[1] == 'txt':
if file.strip() in unwanted:
continue
outputFileName = 'Sorted-' + file
with open(mypath + outputFileName, 'w') as w:
with open(mypath + '/' + file) as f:
for l in f:
if l != unwanted:
print l
w.write(l)
in_path.close()
if __name__== "__main__":
readWrite()
# Lines whose stripped content exactly matches a blacklisted word are skipped.
blacklist = set(['thumbnails', 'quraan', 'cache'])

with open('path/to/input') as infile, open('path/to/output', 'w') as outfile:
    # To drop lines that merely *contain* a blacklisted word, filter with:
    #   any(word in line for word in blacklist)
    outfile.writelines(line for line in infile
                       if line.strip() not in blacklist)

Python Recursive Hashlib

I'm having a problem computing the checksums of all the files under the /bin/* directory.
I'm implementing a HIDS in Python, so I need to compute the checksum of each file and save it, say, in a list — but my code here only computes the checksum of a single path.
import sys
import hashlib  # fixed: was misspelled 'haslib', which raises ImportError

path = sys.argv[1]  # PATH OF THE FILES, ex: /etc/shadow, /bin/*, etc.
with open(path, 'rb') as fh:
    md5 = hashlib.md5()
    # Read in 8 KiB chunks so large files never have to fit in memory.
    while True:
        data = fh.read(8192)
        if not data:
            break
        md5.update(data)
print(md5.hexdigest())
Any suggestions ??
import sys
from os import listdir
from os.path import isfile, join
import hashlib

path = sys.argv[1]  # PATH OF THE FILES, ex: /etc/shadow, /bin/*, etc.
files = [f for f in listdir(path) if isfile(join(path, f))]

my_files = {}
for fil in files:
    # Fixed: open the file relative to *path*; the original opened the bare
    # name 'fil', which only worked when the cwd happened to be *path*.
    with open(join(path, fil), 'rb') as fh:
        md5 = hashlib.md5()
        while True:
            data = fh.read(8192)
            if not data:
                break
            md5.update(data)
    my_files[fil] = md5.hexdigest()

# .items() instead of the Python-2-only .iteritems(); print as a function.
for k, v in my_files.items():
    print('file_name is {} | hash is {}'.format(k, v))

get Path from xml-files (Python)

I have 300 XML files; in each file there is a path (see code), and I want to make a list (.CSV) of these paths with Python.
<da:AdminData>
<da:Datax />
<da:DataID>223</da:DataID>
<da:Date>2013-08-19</da:Date>
<da:Time>13:27:25</da:Time>
<da:Modification>2013-08-19</da:Modification>
<da:ModificationTime>13:27:25</da:ModificationTime>
**<da:Path>D:\08\06\xxx-aaa_20130806_111339.dat</da:Path>**
<da:ID>xxx-5225-fff</da:ID>
I wrote the following code, but it does not work for subdirectories:
import os, glob, re, time, shutil
xmlpath = r'D:'
outfilename = "result.csv"

# Non-recursive: only *.xml files directly under xmlpath are examined.
# (Renamed from 'list', which shadowed the builtin.)
xml_files = glob.glob(os.path.join(xmlpath, '*.xml'))

# Compile the pattern once, outside the loop.
path_pattern = re.compile("<da:Path>(.*)</da:Path>")

output = ""
for file in xml_files:
    # 'with' closes each handle; the original leaked one per file.
    with open(file) as fh:
        text = fh.read()
    a = path_pattern.search(text)
    if a:
        output += '\n' + a.group(1)

# Fixed: the original wrote to 'outfile', an undefined name (NameError);
# the variable holding the output file name is 'outfilename'.
with open(outfilename, "w") as logfile:
    logfile.write(output)
To glob recursively, it is best to use a combination of os.walk and fnmatch.fnmatch. Example:
import os
import fnmatch
def recursive_glob(rootdir, pattern):
    """Return all file paths under *rootdir* whose basename matches *pattern*.

    Walks the tree with os.walk and matches with fnmatch, so it recurses
    into subdirectories (unlike glob with a single '*' component).
    """
    matching_files = []
    for d, _, fnames in os.walk(rootdir):
        matching_files.extend(
            os.path.join(d, fname) for fname in fnames
            if fnmatch.fnmatch(fname, pattern)
        )
    return matching_files


# Fixed: r"D:\" is a syntax error -- a raw string literal cannot end in a
# single backslash; escape it in an ordinary string instead.
xmlfiles = recursive_glob("D:\\", "*.xml")

Get rows from all .txt files in directory using python

I have some txt files in a directory and I need to get the last 15 lines from all of them. How could I do it using python?
I chose this code:
from os import listdir
from os.path import isfile, join

dir_path = './'
files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]

out = []
for file in files:
    # 'with' closes each input file; the original leaked the handles.
    with open(join(dir_path, file), "r") as fh:
        # Fixed: extend (not append) so 'out' stays a flat list of strings.
        # Appending made a list of lists, and writelines() then failed with
        # "TypeError: writelines() argument must be a sequence of strings"
        # (the error is the nesting, not the Russian letters).
        out.extend(fh.readlines()[-15:])

with open(r'./fin.txt', 'w') as f:
    f.writelines(out)
but I get the error "TypeError: writelines() argument must be a sequence of strings". I think it's because of Russian letters in the lines.
import os
from collections import deque

# deque(iterable, maxlen) keeps only the final 'maxlen' items it is fed,
# so feeding it the file handle retains exactly the last 15 lines.
for filename in os.listdir('/some/path'):
    # NOTE(review): you may want to verify this is a regular file first
    # (join it to a root path, or anything else) and that it is readable text.
    with open(filename) as fin:
        last_15 = deque(fin, maxlen=15)
deque will automatically discard the oldest entry and cap the maximum size at 15, so it's an efficient way of keeping just the last n items.
Try this:
from os import listdir
from os.path import isfile

# Fixed: the original 'for' line was missing the trailing ':' (SyntaxError).
for filepath in listdir("/path/to/folder"):
    if isfile(filepath):  # if needed
        last_five_lines = open(filepath).readlines()[-15:]

# or, one line:
x = [open(f).readlines()[-15:] for f in listdir("/path/to/folder") if isfile(f)]
Updated:
# Collect the final 15 lines of every file into one flat list of strings,
# then write them all out with a single call.
lastlines = []
for file in files:
    with open(join(dir_path, file), "r") as fh:
        lastlines.extend(fh.readlines()[-15:])

with open('./fin.txt', 'w') as f:
    f.writelines(lastlines)
from os import listdir
from os.path import isfile, join

dir_path = '/usr/lib/something'
# Keep regular files only -- directories are filtered out up front.
files = [name for name in listdir(dir_path) if isfile(join(dir_path, name))]

for file in files:
    filedata = open(join(dir_path, file), "r").readlines()[-15:]
    # do something with the filedata
Hope this helps:
import os

current_dir = os.getcwd()
dir_objects = os.listdir(current_dir)
dict_of_last_15 = {}

for file in dir_objects:
    # Skip subdirectories: open() on a directory raises IsADirectoryError.
    if not os.path.isfile(file):
        continue
    # Fixed: use 'with' so the handle is always closed -- the original
    # called file_to_check.close(), an undefined name (NameError).
    with open(file, 'rb') as file_obj:
        content = file_obj.readlines()
    last_15_lines = content[-15:]
    dict_of_last_15[file] = last_15_lines
    print("#############: %s" % file)
    print(dict_of_last_15[file])

Categories

Resources