How to skip unhashable (corrupt) files while md5 fingerprinting?

How to skip unhashable (corrupt) files while md5 fingerprinting? - python

The code below makes an md5/metadata fingerprint, but crashes on files with unknown corruption (e.g., files, that can be copied, mostly even opened, but that can not be hashed or zipped up [to disguise their corruption]).
Question: How one makes this code to skip or ignore any and all problem files and just do the rest? Imagine 1 million files on 8 TB. Otherwise I leave it running and having no real-time monitoring of progress, 2 days later I find out that nothing got hashed because a couple problem files made the code hung.
Part of the code (see full code below):
def createBasicInfoListFromDisk():
global diskCompareListDetails, onlyFileNameOnDisk, driveLetter,walk_dir
walk_dir = os.path.abspath(walk_dir)
for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True ):
for filename in files:
file_path = os.path.join(root, filename)
temp = file_path.split(":")
driveLetter = temp[0]
filePathWithoutDriveLetter = temp[1]
fileSize = os.path.getsize(file_path)
mod_on = get_last_write_time(file_path)
print('\t- file %s (full path: %s)' % (filename, file_path))
print('FileName : {filename} is of size {size} and was modified on{mdt}'.format(filename=file_path,size=fileSize,mdt=mod_on ))
diskCompareListDetails.append("\"" + filePathWithoutDriveLetter+"\",\""+str(fileSize) + "\",\"" + mod_on +'"')
onlyFileNameOnDisk.append("\""+filePathWithoutDriveLetter+"\"")
return
Error:
FileName : T:\problemtest\problemfile.doc is of size 27136 and was modified on2010-10-10 13:58:32
Traceback (most recent call last):
File "t:\scripts\test.py", line 196, in <module>
createBasicInfoListFromDisk()
File "t:\scripts\test.py", line 76, in createBasicInfoListFromDisk
mod_on = get_last_write_time(file_path)
File "t:\scripts\test.py", line 61, in get_last_write_time
convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
OSError: [Errno 22] Invalid argument
Full code:
import os
import sys
import time
import datetime
import difflib
import decimal
import hashlib
from pip._vendor.distlib.compat import raw_input
csvListDetails = list()
csvCompareListDetails = list()
diskCompareListDetails = list()
onlyFileNameOnDisk = list()
addedFiles = list()
removedFiles = list()
driveLetter =""
finalFilesToChange=list()
finalFilesToDelete=list()
changedFiles=list()
csvfilewithPath="md5.csv"
import shutil
walk_dir=""
def findAndReadCSVFile(fileName):
global csvListDetails
global csvCompareListDetails
haveIgnoredLine = 0
foundFile=0
try :
inputFileHandler = open(fileName,"rt",encoding='utf-8')
update_time = get_last_write_time(fileName)
print("\n Found md5.csv, last updated on: %s" % update_time)
foundFile=1
except (OSError, IOError, FileNotFoundError):
print("\n md5.csv not found. Will create a new one.")
return foundFile
for line in inputFileHandler:
if (haveIgnoredLine==0):
haveIgnoredLine=1
continue
rowItem = line.replace("\n","").split('","')
csvCompareListDetails.append('"' + rowItem[3]+',"'+rowItem[2]+'","' +rowItem[1]+'"')
lineDetails = list()
for detailNum in range (0,len(rowItem)):
lineDetails.append('"' + (rowItem[detailNum].replace('"','')) + '"')
csvListDetails.append(lineDetails)
inputFileHandler.close()
return foundFile
def get_last_write_time(filename):
st = os.stat(filename)
convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
return convert_time_to_human_readable
def createBasicInfoListFromDisk():
global diskCompareListDetails, onlyFileNameOnDisk, driveLetter,walk_dir
walk_dir = os.path.abspath(walk_dir)
for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True ):
for filename in files:
file_path = os.path.join(root, filename)
temp = file_path.split(":")
driveLetter = temp[0]
filePathWithoutDriveLetter = temp[1]
fileSize = os.path.getsize(file_path)
mod_on = get_last_write_time(file_path)
print('\t- file %s (full path: %s)' % (filename, file_path))
print('FileName : {filename} is of size {size} and was modified on{mdt}'.format(filename=file_path,size=fileSize,mdt=mod_on ))
diskCompareListDetails.append("\"" + filePathWithoutDriveLetter+"\",\""+str(fileSize) + "\",\"" + mod_on +'"')
onlyFileNameOnDisk.append("\""+filePathWithoutDriveLetter+"\"")
return
def compareLogAndDiskLists():
global addedFiles, removedFiles
diff = difflib.unified_diff(csvCompareListDetails, diskCompareListDetails, fromfile='file1', tofile='file2', lineterm='', n=0)
lines = list(diff)[2:]
addedFiles = [line[1:] for line in lines if line[0] == '+']
removedFiles = [line[1:] for line in lines if line[0] == '-']
return
def displayInfoForUserInput():
global finalFilesToChange, finalFilesToDelete
changedOrNewFileCount = 0
noLongerExistingFilesCount = 0
totalSizeOfChange = 0
for line in addedFiles:
if line not in removedFiles:
changedOrNewFileCount = changedOrNewFileCount +1
elements = line.replace("\n","").split('","')
sizeOfFile= int(elements[1].replace('"',''))
totalSizeOfChange = totalSizeOfChange + sizeOfFile
finalFilesToChange.append(elements[0] +'"')
for line in removedFiles:
elements = line.split('","')
if elements[0]+'"' not in onlyFileNameOnDisk:
noLongerExistingFilesCount = noLongerExistingFilesCount + 1
finalFilesToDelete.append(elements[0]+'"')
GBModSz= decimal.Decimal(totalSizeOfChange) / decimal.Decimal('1073741824')
print("\n New or modified files on drive: {} (need to hash)".format(changedOrNewFileCount))
print (" Obsolete lines in md5.csv (files modified or not on drive): {} (lines to delete)".format(noLongerExistingFilesCount))
print (" {} files ({:.2f} GB) needs to be hashed.".format(changedOrNewFileCount,GBModSz))
userInput = raw_input("\n Proceed with hash? (Y/N, Yes/No) ")
if (userInput.strip().upper() == "Y" or userInput.strip().upper() == "YES"):
print("Continuing Processing...")
else:
print("You opted not to continue, Exiting")
sys.exit()
return
def processFiles(foundFile):
if (foundFile==1):
oldFileName = walk_dir+"/md5.csv"
shutil.copy( oldFileName, getTargetFileName(oldFileName))
BLOCKSIZE = 1048576*4
global changedFiles
for fileToHash in finalFilesToChange:
hasher = hashlib.new('md5')
fileToUse=driveLetter+":"+fileToHash.replace('"','')
with open(fileToUse, 'rb') as afile:
buf = afile.read(BLOCKSIZE)
while len(buf) > 0:
hasher.update(buf)
buf = afile.read(BLOCKSIZE)
fileDetails = list()
fileDetails.append(hasher.hexdigest())
fileDetails.append(get_last_write_time(fileToUse))
fileDetails.append(os.path.getsize(fileToUse))
fileDetails.append(fileToHash)
changedFiles.append(fileDetails)
return
def getTargetFileName(oldFileName):
targetFileName= walk_dir+"/generated_on_" + get_last_write_time(oldFileName).replace(" ","_").replace("-","").replace(":","")
targetFileName = targetFileName + "__archived_on_" + datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
targetFileName = targetFileName + "__md5.csv"
return targetFileName
def writeCSVFile(fileName):
try :
outputFileHandler=open(fileName,"wt",encoding='utf-8')
outputFileHandler.write("\"md5Hash\",\"LastWriteTime\",\"Length\",\"FullName\"\n")
for details in csvListDetails:
if details[3] in finalFilesToDelete:
continue
if details[3] in finalFilesToChange:
continue
outputFileHandler.write("{},{},{},{}\n".format(details[0],details[1],details[2],details[3]))
for details in changedFiles:
outputFileHandler.write("\"{}\",\"{}\",\"{}\",{}\n".format(details[0],details[1],details[2],details[3]))
outputFileHandler.close()
except (OSError, IOError, FileNotFoundError) as e:
print("ERROR :")
print("File {} is either not writable or some other error: {}".format(fileName,e))
return
if __name__ == '__main__':
walk_dir = raw_input("\n Enter drive or directory to scan: ")
csvfilewithPath=walk_dir+"/md5.csv"
print("\n Drive to scan: " + walk_dir)
foundFile = 0
foundFile=findAndReadCSVFile(csvfilewithPath)
createBasicInfoListFromDisk()
compareLogAndDiskLists()
displayInfoForUserInput()
processFiles(foundFile)
writeCSVFile(csvfilewithPath)
Trying this fix, no luck:
def get_last_write_time(filename):
try:
st = os.stat(filename)
convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
return convert_time_to_human_readable
except OSError:
pass
return "ERROR"
def createBasicInfoListFromDisk():

I agree with IMCoins and I'm very qurius on why except isn't catching the error.
So first thing I would do is to go to the source where the OSError is being raised and try to catch it explicity.
def get_last_write_time(filename):
try:
st = os.stat(filename)
convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S",
time.localtime(st.st_mtime)
return convert_time_to_human_readable
except OSError:
pass
return "ERROR" #or whatever string you want add

Updated answer, for updated post.
As stated earlier, except statement with exception type specified catches everything. So, in order to do what want... I'm afraid possible answer are either :
To make a method that identifies corrupted files, and handles it properly.
Make try, except statement that encapsulate every part of your code where there could be an error.
Let me warn you about the second solution though, as sometimes, there are system errors that you do not want to avoid. I believe you should print the exception that you catch, in order to identify further problems you may encounter.
Just so you know, as you may not : your error is not in a try, except statement. Your error is in (if I copied and pasted properly in my editor) line 196, createBasicinfoListFromDisk(), then line 76, mod_on = get_last_write_time(file_path)
As you also mentioned you are using python 3.x, I suggest you are looking into the suppress function (https://docs.python.org/3/library/contextlib.html#contextlib.suppress).
I hope it helped you.

Related

Python script to remove all the txt files that match a condition from subdirectories without affecting the directory structure

I am trying to write a python script that removes txt files that have less than "wordLimit" words, where "wordLimit" is a given number. The files should be removed from all the directories and subdirectories at a certain path. The directory structure should stay the same.
My version always prints "Error while deleting file"
import os, glob
wordLimit = 1000
directory = os.getcwd()
def shouldArticleBeRemoved(filepath, minWords):
number = 0
filename = os.path.basename(filepath)
with open(filename, 'r+') as f:
for line in f:
words = line.split()
number += len(words)
if number < minWords:
return True
else: return False
def iterateCheckAndRemove():
notRemoved = 0
removed = 0
fileList = glob.glob('C:/Users/HP/Desktop/Articles/*/*/*.txt', recursive=True)
for filePath in fileList:
try:
if shouldArticleBeRemoved(filePath, wordLimit):
os.remove(filePath)
removed += 1
else: notRemoved +=1
except OSError:
notRemoved +=1
print("Error while deleting file")
print(removed)
iterateCheckAndRemove()

Try this. It's much simpler:-
import os
def main():
wordLimit = 1000
for r, _, f in os.walk(r'C:\Users\HP\Desktop\Articles'):
for _f in f:
apath = os.path.join(r, _f)
_, ext = os.path.splitext(apath)
if ext == '.txt':
try:
rflag = False
with open(apath) as text:
if len(text.read().split()) < wordLimit:
rflag = True
if rflag:
os.remove(apath)
print(f'{apath} was deleted')
except Exception as e:
print(f'Error while processing {apath} -> {e}')
if __name__ == '__main__':
main()

why the python file handling not recognizing file name?

I want to replace a string with another string in a file. I have below program to perform the task.
import os
import sys
import traceback
from glob import iglob
def usage():
print('Usage: python FindAndReplace.py [Old String] [New String] '
'[File Filters(default:".txt,.xml")] [Directory To Check(.)]')
def search_replace_string(fileName, old_str, new_str):
if not(os.path.isfile(fileName) and os.access(fileName, os.W_OK)):
print("Warning: Skipping..File does not exist or and is not writeable:" + filename)
return False
fileupdated = False
# Read the old file
with open(fileName, 'r') as f:
newlines = []
for lines in f.readlines():
if old_str in lines:
fileupdated = True
line = lines.replace(old_str, new_str)
newlines.append(line)
# Write changes to same file
if fileupdated:
print("string Found and Updating File: " + fileName)
try:
with open(fileName, 'w') as f:
for line in newlines:
f.write(line)
except:
print("Error: Cannot open/access existing file for writing: " + fileName)
return fileupdated
def main():
try:
DEFAULT_PATH = iglob(str('<path_to_file.xml'))
if len(sys.argv) < 3:
usage()
# old/new string required parameters, exit if not supplied
sys.exit(-1)
else:
oldString = sys.argv[1]
newString = sys.argv[2]
if len(sys.argv) < 4:
patterns = ['.xml', '.txt']
else:
stringFilter = sys.argv[3]
patterns = stringFilter.split(',')
if len(sys.argv) < 5:
path = DEFAULT_PATH
else:
path = sys.argv[4]
print('[Old String] :' + oldString)
print('[New String] :' + newString)
print('[File Filters] :' + ', '.join(patterns))
print('[Directory To Check] :' + path)
if not os.path.exists(path):
raise Exception("Selected path does not exist: " + path)
# Walk through directory structure looking for files matching patterns
matchingFileList = [os.path.join(dp, f) for dp, dn, filenames in os.walk(path) for f in filenames if os.path.splitext(f)[1] in patterns]
print('Files found matching patterns: ' + str(len(matchingFileList)))
filecount = 0
filesReplaced = 0
for currFile in matchingFileList:
filecount += 1
filesReplaced = search_replace_string(currFile, old_str, new_str)
if filesReplaced:
filesReplaced += 1
print("Total Files Searched :" + str(filecount))
print("Total Files Replaced/Updated :" + str(filesReplaced))
except Exception as err:
print(traceback.format_exception_only(type(err), err)[0].rstrip())
sys.exit(-1)
if __name__ == '__main__':
main()
When I am executing it from a command line I am getting below error:
(null): can't open file 'uro.py': [Errno 2] No such file or directory
Below is the command line argument I am giving
python uro.py <file_path> <old_str> <new_str>
NOTE: I am using xml file.
I wanted to develop a logic that will take file_name, old and new string as a command line argument. In the error one can see that the program is considering the python file a input file. While it should take the path of the file that I give with CMD argument.
What's the mistake here? Please suggest. Thank you

I got it fixed, there was some path issue. Thank you.

"[Errno 2] No such file or directory" Issue [duplicate]

This question already has answers here:
Trying to use open(filename, 'w' ) gives IOError: [Errno 2] No such file or directory if directory doesn't exist
(3 answers)
Closed 6 months ago.
So my prof. gave me this code as the solution of my homework but when I run it it gives me an error. Can you please help me out? I guess I didn't specify the location of the file but not sure if that's the case.The objective of this question is to generate and read files that contain a list of random numbers.
import random
import os
import time
def fillFile(fileSize, fileName):
# Delete file if exists
if os.path.exists(fileName):
os.remove(fileName)
# Open file
FILE = open(fileName, "w")
# Write to file
for i in range(fileSize):
r = random.randint(0,fileSize+1000)
FILE.write(str(r) + "\n")
FILE.close()
def readFile(fileName):
# Open file
if os.path.exists(fileName):
FILE = open(fileName,"r")
else:
print(fileName + " does not exist!")
exit()
# Read File
alist = []
for line in FILE:
alist.append(int(line))
FILE.close()
return alist
def mainForFiles():
# Dosyaları oluştur
fileSizes = [1000, 5000, 10000, 25000, 50000, 100000, 200000]
dirName = ".\\filesForAssignment1\\"
# Delete fileStats.txt file if exists
statFileName = "fileStats.txt"
if os.path.exists(statFileName):
os.remove(statFileName)
# open stat file
statFile = open(statFileName, "w")
statFile.write("fillFile")
print("WRITING TO FILES")
for i in fileSizes:
start = time.time()
fillFile(i, dirName+"file"+str(i))
finish = time.time()
statFile.write(" " + str(finish-start))
print("File Size = " + str(i) + " Write Time = " + str(finish-start))
statFile.write("\n")
print("READING FILES")
statFile.write("readFile")
for i in fileSizes:
fileName = dirName+"file"+str(i)
# Dosyayı oku
finish = time.time()
alist = readFile(fileName)
start = time.time()
statFile.write(" " + str(finish-start))
print ("File Size = " + str(i)+ " Dosya Okuma Zamanı = " + str(finish-start))
statFile.write("\n")
statFile.close()
mainForFiles()
File "C:/Users/emrea/PycharmProjects/helloworld/hello2.py", line 84, in
<module>
mainForFiles()
File "C:/Users/emrea/PycharmProjects/helloworld/hello2.py", line 57, in mainForFiles
fillFile(i, dirName+"file"+str(i))
File "C:/Users/emrea/PycharmProjects/helloworld/hello2.py", line 12, in fillFile
FILE = open(fileName, "w")
FileNotFoundError: [Errno 2] No such file or directory: '.\\filesForAssignment1\\file1000'

FileNotFoundError: [Errno 2] No such file or directory: '.\\filesForAssignment1\\file1000'
The w mode causes the file to be created if it doesn't exist (and truncated if it does so the os.remove is not actually useful there), however it does expect intermediate directories to exist.
This means you should ensure the path to the file ('.\\filesForAssignment1) does exist before trying to create the file.
os.makedirs(os.path.dirname(fileName), exists_ok=True)
should do the trick, or
pathlib.Path(fileName).parent.mkdir(parents=True, exists_ok=True)
for a somewhat more modern take on it.
There's a bunch of other minor issues in the script:
the main function should generally be "gated" so modules can be imported without running them
explicitly closing files has fallen out of favor as it's unreliable
when opening files in "text" mode (the default) you should always provide an encoding
pathlib is fun, also that way you should not have to deal with path separators and all that crap
unless it's required to handle that case, I'd just let open(fname, 'r') error out if the file doesn't exist
Here's a version I think should be slightly improved:
import pathlib
import random
import os
import time
def fillFile(fileSize, fileName):
with fileName.open('w', encoding='utf-8') as f:
for i in range(fileSize):
r = random.randint(0,fileSize+1000)
f.write(f"{r}\n")
def readFile(fileName):
with fileName.open(encoding='utf-8') as f:
return [int(line) for line in f]
OUT_DIR = pathlib.Path.cwd().joinpath("filesForAssignment1")
FILE_SIZES = [1000, 5000, 10000, 25000, 50000, 100000, 200000]
def mainForFiles():
# Dosyaları oluştur
OUT_DIR.mkdir(parents=True, exist_ok=True) # make sure the directory exists
statFilePath = pathlib.Path("fileStats.txt")
with statFilePath.open('w', encoding='utf-8') as statFile:
statFile.write("fillFile")
print("WRITING TO FILES")
for i in FILE_SIZES:
start = time.time()
fillFile(i, OUT_DIR.joinpath(f'file{i}'))
finish = time.time()
statFile.write(f" {finish-start}")
print(f"File Size = {i} Write Time = {finish-start})")
statFile.write("\n")
print("READING FILES")
statFile.write("readFile")
for i in FILE_SIZES:
f = OUT_DIR.joinpath(f'file{i}')
# Dosyayı oku
start = time.time()
alist = readFile(f)
finish = time.time()
statFile.write(f" {finish-start}")
print (f"File Size = {i} Dosya Okuma Zamanı = {finish-start}")
statFile.write("\n")
if __name__ == '__main__':
mainForFiles()

exit() is not doing what you want, it continues with the code.
def readFile(fileName):
# Open file
if os.path.exists(fileName):
FILE = open(fileName,"r")
else:
print(fileName + " does not exist!")
return
# Read File
alist = []
for line in FILE:
alist.append(int(line))
FILE.close()
return alist

epub3 : how to add the mimetype at first in archive

I'm working on a script to create epub from html files, but when I check my epub I have the following error : Mimetype entry missing or not the first in archive
The Mimetype is present, but it's not the first file in the epub. Any idea how to put it in first place in any case using Python ?

Sorry, I don't have the time right now to give a detailed explanation, but here's a (relatively) simple epub processing program I wrote a while ago that shows how to do that.
epubpad.py
#! /usr/bin/env python
''' Pad the the ends of paragraph lines in an epub file with a single space char
Written by PM 2Ring 2013.05.12
'''
import sys, re, zipfile
def bold(s): return "\x1b[1m%s\x1b[0m" % s
def report(attr, val):
print "%s '%s'" % (bold(attr + ':'), val)
def fixepub(oldname, newname):
oldz = zipfile.ZipFile(oldname, 'r')
nlist = oldz.namelist()
#print '\n'.join(nlist) + '\n'
if nlist[0] != 'mimetype':
print bold('Warning!!!'), "First file is '%s', not 'mimetype" % nlist[0]
#get the name of the contents file from the container
container = 'META-INF/container.xml'
# container should be in nlist
s = oldz.read(container)
p = re.compile(r'full-path="(.*?)"')
a = p.search(s)
contents = a.group(1)
#report("Contents file", contents)
i = contents.find('/')
if i>=0:
dirname = contents[:i+1]
else:
#No directory separator in contents name!
dirname = ''
report("dirname", dirname)
s = oldz.read(contents)
#print s
p = re.compile(r'<dc:creator.*>(.*)</dc:creator>')
a = p.search(s)
creator = a.group(1)
report("Creator", creator)
p = re.compile(r'<dc:title>(.*)</dc:title>')
a = p.search(s)
title = a.group(1)
report("Title", title)
#Find the names of all xhtml & html text files
p = re.compile(r'\.[x]?htm[l]?')
htmnames = [i for i in nlist if p.search(i) and i.find('wrap')==-1]
#Pattern for end of lines that don't need padding
eolp = re.compile(r'[>}]$')
newz = zipfile.ZipFile(newname, 'w', zipfile.ZIP_DEFLATED)
for fname in nlist:
print fname,
s = oldz.read(fname)
if fname == 'mimetype':
f = open(fname, 'w')
f.write(s)
f.close()
newz.write(fname, fname, zipfile.ZIP_STORED)
print ' * stored'
continue
if fname in htmnames:
print ' * text',
#Pad lines that are (hopefully) inside paragraphs...
newlines = []
for line in s.splitlines():
if len(line)==0 or eolp.search(line):
newlines.append(line)
else:
newlines.append(line + ' ')
s = '\n'.join(newlines)
newz.writestr(fname, s)
print
newz.close()
oldz.close()
def main():
oldname = len(sys.argv) > 1 and sys.argv[1]
if not oldname:
print 'No filename given!'
raise SystemExit
newname = len(sys.argv) > 2 and sys.argv[2]
if not newname:
if oldname.rfind('.') == -1:
newname = oldname + '_P'
else:
newname = oldname.replace('.epub', '_P.epub')
newname = newname.replace(' ', '_')
print "Processing '%s' to '%s' ..." % (oldname, newname)
fixepub(oldname, newname)
if __name__ == '__main__':
main()
FWIW, I wrote this program to process files for my simple e-reader that annoyingly joins paragraphs together if they don't end with white space.

The solution I've found:
delete the previous mimetype file
when creating the new archive create an new mimetype file before adding anything else : zipFile.writestr("mimetype", "application/epub+zip")
Why does it work : the mimetype is the same for all epub : "application/epub+zip", no need to use the original file.

Problem with file cleaning process?

I am working on very large file system. My task is to clean the system with some given parameters. Below program fragment can give a idea.
import DirectoryWalker
extentions_to_delete = list([".rar",".doc",".URL",".js",".EXE",".mht",".css",".txt", ".cache", ".xml"])
extentions_to_copy = list([".jpg",".BMP",".GIF",".jpeg",".gif",".bmp",".png",".JPG"])
dw = DirectoryWalker.DirectoryWalker("/media/08247451247443AA/home/crap/")
def copy_advice(key, files):
for ext in extentions_to_copy:
if(ext == key):
print str(len(files)) + " Files of type " + key + " should be coppied to the target folder."
for file in files:
copy_to = "/media/08247451247443AA/home/crap-pics/"
moved = dw.move_file_to(file, copy_to, True)
if not moved:
print file + " : not moved"
walks = dw.get_all_file_types()
for key in DirectoryWalker.Walk.store.keys():
files = DirectoryWalker.Walk.store[key]
copy_advice(key, files)
In the DirectoryWalker following code is written. Walk is a simple class which have a store object.
def get_all_file_types(self):
extentions = []
for dirpath,dirnames,filenames in os.walk(self.dir_name):
for file in filenames:
extentions.append(Walk(dirpath +"/"+ file))
return extentions
def move_file_to(self, file_path, copy_to, rename_if_exists= False):
file_name = os.path.split(file_path)[1]
target_file_name = copy_to + file_name;
coppied = False
if not os.path.isfile(target_file_name):
coppied = True
try:
os.rename(file_path, target_file_name)
except OSError:
coppied = False
print "Oops! Unable to rename : " + file_path + " to target : " + target_file_name
if rename_if_exists:
coppied = True
file_name = "new_"+ file_name
try:
os.rename(file_path, target_file_name)
except OSError:
coppied = False
print "Oops! Unable to rename : " + file_path + " to target : " + target_file_name
return coppied
The Walk class
class Walk:
store = dict([])
def __init__(self, filename):
self.file_ext = os.path.splitext(filename)[-1]
self.file_name = filename
if not (Walk.store.has_key(self.file_ext)):
Walk.store[self.file_ext] = list()
Walk.store[self.file_ext].append(self.file_name)
But when program executed, it only moves almost 10400 files. But manual calculation suggest, there should be 13400 files in the file system. Please let me know, what I am doing wrong?
Update Solutions
After a careful investigations, I come out with result that there are many ambiguous file names in the target file system and those files were missing.

To answer your question, why not start with a simpler piece of code to test?
import os
all_files = []
for root, dirs, files in os.walk('/media/08247451247443AA/home/crap/'):
all_files.extend(files)
print len(all_files)
As a side note, could you replace the Walk class with a defaultdict?

After a careful investigations, I come out with result that there are many ambiguous file names in the target file system and those files were missing.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to skip unhashable (corrupt) files while md5 fingerprinting? - python

Related

Python script to remove all the txt files that match a condition from subdirectories without affecting the directory structure

why the python file handling not recognizing file name?

"[Errno 2] No such file or directory" Issue [duplicate]

epub3 : how to add the mimetype at first in archive

Problem with file cleaning process?

Categories

Resources