I'm having a problem computing the checksums of all the files under /bin/*.
I'm implementing a HIDS in Python, so I need to compute the checksum of each file and save it, say, in a list... but my code below only returns the first checksum from the /bin/* directory.
import sys
import hashlib

path = sys.argv[1]  # PATH OF THE FILES, ex: /etc/shadow, /bin/*, etc.
with open(path, 'rb') as fh:
    md5 = hashlib.md5()
    while True:
        data = fh.read(8192)
        if not data:
            break
        md5.update(data)
print md5.hexdigest()
Any suggestions?
import sys
from os import listdir
from os.path import isfile, join
import hashlib

path = sys.argv[1]  # PATH OF THE FILES, ex: /etc/shadow, /bin/*, etc.
files = [f for f in listdir(path) if isfile(join(path, f))]

my_files = {}
for fil in files:
    with open(join(path, fil), 'rb') as fh:  # join the directory back onto the name
        md5 = hashlib.md5()
        while True:
            data = fh.read(8192)
            if not data:
                break
            md5.update(data)
        my_files[fil] = md5.hexdigest()

for k, v in my_files.iteritems():
    print 'file_name is {} | hash is {}'.format(k, v)
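If you would rather pass a shell-style pattern such as /bin/* straight to the script, a minimal sketch using glob could look like this (md5_for_file is just an illustrative helper name, not something from the code above, and the pattern has to be quoted on the command line so the shell doesn't expand it first):

import sys
import glob
import hashlib
from os.path import isfile

def md5_for_file(path, block_size=8192):
    # same chunked read as above, wrapped in a reusable function
    md5 = hashlib.md5()
    with open(path, 'rb') as fh:
        while True:
            data = fh.read(block_size)
            if not data:
                break
            md5.update(data)
    return md5.hexdigest()

checksums = {}
for path in glob.glob(sys.argv[1]):  # e.g. python hids.py "/bin/*"
    if isfile(path):
        checksums[path] = md5_for_file(path)

for name, digest in checksums.items():
    print 'file_name is {} | hash is {}'.format(name, digest)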
I'm trying to write a Python script to search through my current directory, identify JPGs by their header, and then hash those files. I'm kind of all over the place. Any suggestions would be appreciated.
from os import listdir, getcwd
from os.path import isfile, join, normpath, basename
import hashlib

jpgHead = b'\xff\xd8\xff\xe0'

def get_files():
    current_path = normpath(getcwd())
    return [join(current_path, f) for f in listdir(current_path) if
            isfile(join(current_path, f))]

def checkJPG():
    checkJPG=checkJPG.read(4)
    if checkJPG==jpgHead
        get_hashes()

def get_hashes():
    files = checkJPG()
    list_of_hashes = []
    for each_file in files:
        hash_md5 = hashlib.md5()
        with open(each_file, "rb") as f:
            list_of_hashes.append('Filename: {}\tHash: {}\n'.format(basename(each_file), hash_md5.hexdigest()))
    return list_of_hashes

def write_jpgHashes():
    hashes = get_hashes()
    with open('list_of_hashes.txt', 'w') as f:
        for md5_hash in hashes:
            f.write(md5_hash)

if __name__ == '__main__':
    write_jpgHashes()
I modified some of your functions a bit, give it a try:
from os import listdir, getcwd
from os.path import isfile, join, normpath, basename
import hashlib

jpgHead = b'\xff\xd8\xff\xe0'

def get_files(path=getcwd()):
    current_path = normpath(path)
    return [join(current_path, f) for f in listdir(current_path) if isfile(join(current_path, f))]

def checkJPG(path):
    with open(path, 'rb') as f:
        header = f.read(4)
    return header == jpgHead

def get_hashes():
    list_of_hashes = []
    for each_file in get_files():
        if checkJPG(each_file):
            list_of_hashes.append('Filename: {}\tHash: {}\n'.format(each_file, md5hf(each_file)))
    return list_of_hashes

def md5hf(path):
    #return hashlib.md5(open(path, "rb").read()).hexdigest() ## you can use this line for small files ##
    hash_md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def write_jpgHashes():
    hashes = get_hashes()
    with open('list_of_hashes.txt', 'w') as f:
        for md5_hash in hashes:
            f.write(md5_hash)

if __name__ == '__main__':
    write_jpgHashes()
Notes:
Fixed some syntax and indentation errors
Turned checkJPG into a boolean function
Added the md5 hash of each matching file to list_of_hashes in get_hashes
Added the md5hf function to compute the md5 checksum
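One more detail to be aware of (this is an assumption about your input files, not something from the question): not every JPEG starts with the JFIF header b'\xff\xd8\xff\xe0'; Exif images straight from a camera usually start with b'\xff\xd8\xff\xe1'. If you want to match both, you could compare only the first three bytes:

def checkJPG(path):
    # \xff\xd8\xff covers both JFIF (\xe0) and Exif (\xe1) variants,
    # since only the fourth byte differs between them
    with open(path, 'rb') as f:
        return f.read(3) == b'\xff\xd8\xff'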
I'm a newbie in Python.
I need to create a large quantity of files with random names in Dest_Dir (my destination directory), and then zip them into one file.
Does anyone have an idea how to do that?
I managed to create files like that with a for loop into a specific folder, but it doesn't fit the case where I want to create a large number of files (let's say 100),
and the names I created aren't random.
import os
import sys
import platform

SRC_Dir = os.path.dirname(__file__)
Dest_Dir = os.path.join(SRC_Dir, 'dest')
items = ["one", "two", "three"]
for item in items:
    #(os.path.join(Dest_Dir, filename), 'wb') as temp_file:
    with open(os.path.join(Dest_Dir, item), 'wb') as f:
        f.write("This is my first line of code")
        f.write("\nThis is my second line of code with {} the first item in my list".format(item))
        f.write("\nAnd this is my last line of code")
You could make use of the built-in tempfile module:
import os
import tempfile

for _ in range(100):
    file_descriptor, file_path = tempfile.mkstemp(".txt", "prefix-", Dest_Dir)
    file_handle = open(file_path, "wb")
    # do stuff
    os.close(file_descriptor)
    file_handle.close()
Since a comment was made about the zip part, I figured I would add that as well:
import os
import tempfile
import zipfile

new_files = []
for _ in range(10):
    file_descriptor, file_path = tempfile.mkstemp(".txt", "prefix-", "/tmp")
    file_handle = open(file_path, "wb")
    file_handle.write("HELLO")
    os.close(file_descriptor)
    file_handle.close()
    new_files.append(file_path)

with zipfile.ZipFile("/tmp/zipped.zip", "w") as zipped:
    for file_path in new_files:
        zipped.write(file_path, os.path.basename(file_path))
The zipped.write arguments here assume only the filename (and not the full path) is desired for the archived name.
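If you prefer to control the random names yourself rather than rely on tempfile's prefix/suffix pattern, a rough alternative using uuid might look like this (Dest_Dir and the file contents here are placeholders, adjust them to your setup):

import os
import uuid
import zipfile

Dest_Dir = "dest"  # assumed destination directory
if not os.path.isdir(Dest_Dir):
    os.makedirs(Dest_Dir)

new_files = []
for _ in range(100):
    # uuid4().hex gives a 32-character random hex name
    file_path = os.path.join(Dest_Dir, uuid.uuid4().hex + ".txt")
    with open(file_path, "w") as f:
        f.write("some content\n")
    new_files.append(file_path)

with zipfile.ZipFile(os.path.join(Dest_Dir, "zipped.zip"), "w") as zipped:
    for file_path in new_files:
        zipped.write(file_path, os.path.basename(file_path))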
Suppose I have a text file aiq_hits.txt.
Each line in this file corresponds to a filename:
ant1.aiq
ant2.aiq
ant3.aiq
ant4.aiq
I want to match each line of my text file (ant1.aiq, ant2.aiq and so on) against the filenames present at a specific place (R:\Sample) and extract the matching files into another place (R:\Sample\wsa).
I have an idea that I need to use functions like os.walk(), fnmatch.fnmatch() and shutil.copy(), but I am not able to implement them.
My code:
import os
import shutil
import fnmatch
with open("aiq_hits.txt","r") as in_file:
for line in in_file:
I am stuck here
import os
import shutil

sourceDir = "R:\\Sample"
targetDir = "R:\\Sample\\wsa"
existingFiles = set(f for f in os.listdir(sourceDir) if os.path.isfile(os.path.join(sourceDir, f)))

infilepath = "aiq_hits.txt"
with open(infilepath) as infile:
    for line in infile:
        fname = line.strip()
        if fname not in existingFiles: continue
        shutil.move(os.path.join(sourceDir, fname), os.path.join(targetDir, fname))
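One caveat (an assumption about your setup, not something stated in the question): shutil.move will fail if R:\Sample\wsa does not exist yet, so you may want to create the target directory first:

import os

targetDir = "R:\\Sample\\wsa"
if not os.path.isdir(targetDir):
    os.makedirs(targetDir)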
I hope this will suffice:
import os

def match_files(url, file_read, dest):
    f = open(file_read, 'rb')
    file_list = os.listdir(url)
    print(file_list)
    saved_path = os.getcwd()
    print("Current working directory is " + saved_path)
    os.chdir(url)
    match = []
    for file_name in f:
        file_name = file_name.strip()
        if file_name in file_list:
            match.append(file_name)
            os.rename(os.path.join(url, file_name), os.path.join(dest, file_name))
    os.chdir(saved_path)
    print(match)
Here, url is the source directory or folder from which you want to match files, file_read is the name of the file (with path) that contains the list of file names, and dest is the destination folder.
This code moves the matching files from url to dest, i.e. these files won't remain in url after running the code.
Alternatively you could use the glob module, which allows you to enter an expression for the file name/extension and returns a list that you can loop over.
I'd use this module if the source directory can contain files with the same extension that you want to exclude from being looped over.
Also I'm assuming that the file name list is not large, so storing it in a list won't be an issue.
e.g. (I haven't tested the below):
from glob import glob
import os
import shutil

src = 'R:\\Sample'
dst = "R:\\Sample\\wsa"
in_file_list = "aiq_hits.txt"

list_Of_files = glob(os.path.join(src, 'ant*.aiq'))

data = []
with open(in_file_list) as reader:
    data += [line.strip() for line in reader]  # strip newlines so the names compare equal

for row in list_Of_files:
    file_path, file_name = os.path.split(row)
    if file_name in data:
        shutil.copy2(row, os.path.join(dst, file_name))
        # or if you want to move the file
        # shutil.move(row, os.path.join(dst, file_name))
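If the .aiq files can also live in subdirectories of R:\Sample, a rough sketch along the lines you mentioned (os.walk, fnmatch and shutil.copy) might look like this; it is untested and assumes the same aiq_hits.txt layout as above:

import os
import fnmatch
import shutil

sourceDir = "R:\\Sample"
targetDir = "R:\\Sample\\wsa"
infilepath = "aiq_hits.txt"

# read the wanted file names once, stripped of newlines
with open(infilepath) as infile:
    wanted = set(line.strip() for line in infile)

for dirpath, dirnames, filenames in os.walk(sourceDir):
    # skip the target directory itself so files aren't matched twice
    if os.path.abspath(dirpath) == os.path.abspath(targetDir):
        continue
    for fname in filenames:
        if fnmatch.fnmatch(fname, "*.aiq") and fname in wanted:
            shutil.copy(os.path.join(dirpath, fname), os.path.join(targetDir, fname))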
I have 300 XML files; in each file there is a path (see below), and I want to make a list (.CSV) of these paths with Python.
<da:AdminData>
<da:Datax />
<da:DataID>223</da:DataID>
<da:Date>2013-08-19</da:Date>
<da:Time>13:27:25</da:Time>
<da:Modification>2013-08-19</da:Modification>
<da:ModificationTime>13:27:25</da:ModificationTime>
**<da:Path>D:\08\06\xxx-aaa_20130806_111339.dat</da:Path>**
<da:ID>xxx-5225-fff</da:ID>
I wrote the following code, but it does not work for subdirectories:
import os, glob, re, time, shutil

xmlpath = r'D:'
outfilename = "result.csv"

list = glob.glob(os.path.join(xmlpath, '*.xml'))
output = ""
for file in list:
    fh = open(file)
    text = fh.read()
    pattern = "<da:Path>(.*)</da:Path>"
    pattern = re.compile(pattern)
    a = pattern.search(text)
    if a:
        output += '\n' + a.group(1)

logfile = open(outfilename, "w")
logfile.write(output)
logfile.close()
To glob recursively, it is best to use a combination of os.walk and fnmatch.fnmatch. Example:
import os
import fnmatch

def recursive_glob(rootdir, pattern):
    matching_files = []
    for d, _, fnames in os.walk(rootdir):
        matching_files.extend(
            os.path.join(d, fname) for fname in fnames
            if fnmatch.fnmatch(fname, pattern)
        )
    return matching_files

xmlfiles = recursive_glob("D:\\", "*.xml")
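With that helper in place, a minimal sketch of the rest of the script (the regex extraction from the question plus writing one path per row to result.csv) could look like this; the csv module is just one way to do the writing:

import re
import csv

pattern = re.compile(r"<da:Path>(.*)</da:Path>")

rows = []
for xmlfile in recursive_glob("D:\\", "*.xml"):
    with open(xmlfile) as fh:
        a = pattern.search(fh.read())
        if a:
            rows.append([a.group(1)])

# on Python 2 open the csv file with mode 'wb' instead of 'w'
with open("result.csv", "w") as out:
    csv.writer(out).writerows(rows)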
I have some txt files in a directory and I need to get the last 15 lines from all of them. How could I do it using Python?
I chose this code:
from os import listdir
from os.path import isfile, join

dir_path = './'
files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
out = []
for file in files:
    filedata = open(join(dir_path, file), "r").readlines()[-15:]
    out.append(filedata)

f = open(r'./fin.txt', 'w')
f.writelines(out)
f.close()
but I get the error "TypeError: writelines() argument must be a sequence of strings". I think it's because of Russian letters in the lines.
import os
from collections import deque

for filename in os.listdir('/some/path'):
    # might want to put a check it's actually a file here...
    # (join it to a root path, or anything else....)
    # and sanity check it's text of a usable kind
    with open(filename) as fin:
        last_15 = deque(fin, 15)
deque will automatically discard the oldest entry and cap the maximum size at 15, so it's an efficient way of keeping just the last n items.
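For example, a quick illustration of the maxlen behaviour (not part of the code above):

from collections import deque

d = deque(["line1\n", "line2\n", "line3\n", "line4\n"], 3)
# the oldest item is dropped to respect the maximum length of 3
print list(d)  # ['line2\n', 'line3\n', 'line4\n']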
Try this:
from os import listdir
from os.path import isfile

for filepath in listdir("/path/to/folder"):
    if isfile(filepath):  # if needed
        last_five_lines = open(filepath).readlines()[-15:]

# or, in one line:
x = [open(f).readlines()[-15:] for f in listdir("/path/to/folder") if isfile(f)]
Updated: writelines() failed because out is a list of lists (one list per file), not a list of strings; collect the lines into one flat list instead:
lastlines = []
for file in files:
    lastlines += open(join(dir_path, file), "r").readlines()[-15:]

with open('./fin.txt', 'w') as f:
    f.writelines(lastlines)
from os import listdir
from os.path import isfile, join

dir_path = '/usr/lib/something'
files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
for file in files:
    filedata = open(join(dir_path, file), "r").readlines()[-15:]
    # do something with the filedata
Hope this helps:
import os

current_dir = os.getcwd()
dir_objects = os.listdir(current_dir)
dict_of_last_15 = {}
for file in dir_objects:
    file_obj = open(file, 'rb')
    content = file_obj.readlines()
    last_15_lines = content[-15:]
    dict_of_last_15[file] = last_15_lines
    print "#############: %s" % file
    print dict_of_last_15[file]
    file_obj.close()