"[Errno 2] No such file or directory" Issue [duplicate] - python

This question already has answers here:
Trying to use open(filename, 'w' ) gives IOError: [Errno 2] No such file or directory if directory doesn't exist
(3 answers)
Closed 6 months ago.
So my professor gave me this code as the solution to my homework, but when I run it, it gives me an error. Can you please help me out? I guess I didn't specify the location of the file, but I'm not sure if that's the case. The objective of this assignment is to generate and read files that contain a list of random numbers.
import random
import os
import time

def fillFile(fileSize, fileName):
    # Delete file if exists
    if os.path.exists(fileName):
        os.remove(fileName)
    # Open file
    FILE = open(fileName, "w")
    # Write to file
    for i in range(fileSize):
        r = random.randint(0, fileSize + 1000)
        FILE.write(str(r) + "\n")
    FILE.close()

def readFile(fileName):
    # Open file
    if os.path.exists(fileName):
        FILE = open(fileName, "r")
    else:
        print(fileName + " does not exist!")
        exit()
    # Read File
    alist = []
    for line in FILE:
        alist.append(int(line))
    FILE.close()
    return alist

def mainForFiles():
    # Create the files
    fileSizes = [1000, 5000, 10000, 25000, 50000, 100000, 200000]
    dirName = ".\\filesForAssignment1\\"
    # Delete fileStats.txt file if exists
    statFileName = "fileStats.txt"
    if os.path.exists(statFileName):
        os.remove(statFileName)
    # open stat file
    statFile = open(statFileName, "w")
    statFile.write("fillFile")
    print("WRITING TO FILES")
    for i in fileSizes:
        start = time.time()
        fillFile(i, dirName + "file" + str(i))
        finish = time.time()
        statFile.write(" " + str(finish - start))
        print("File Size = " + str(i) + " Write Time = " + str(finish - start))
    statFile.write("\n")
    print("READING FILES")
    statFile.write("readFile")
    for i in fileSizes:
        fileName = dirName + "file" + str(i)
        # Read the file
        finish = time.time()
        alist = readFile(fileName)
        start = time.time()
        statFile.write(" " + str(finish - start))
        print("File Size = " + str(i) + " File Read Time = " + str(finish - start))
    statFile.write("\n")
    statFile.close()

mainForFiles()
File "C:/Users/emrea/PycharmProjects/helloworld/hello2.py", line 84, in
<module>
mainForFiles()
File "C:/Users/emrea/PycharmProjects/helloworld/hello2.py", line 57, in mainForFiles
fillFile(i, dirName+"file"+str(i))
File "C:/Users/emrea/PycharmProjects/helloworld/hello2.py", line 12, in fillFile
FILE = open(fileName, "w")
FileNotFoundError: [Errno 2] No such file or directory: '.\\filesForAssignment1\\file1000'

FileNotFoundError: [Errno 2] No such file or directory: '.\\filesForAssignment1\\file1000'
The w mode causes the file to be created if it doesn't exist (and truncated if it does, so the os.remove is not actually useful there); however, it does expect the intermediate directories to exist.
This means you should ensure the directory part of the path ('.\\filesForAssignment1') exists before trying to create the file.
os.makedirs(os.path.dirname(fileName), exist_ok=True)
should do the trick, or
pathlib.Path(fileName).parent.mkdir(parents=True, exist_ok=True)
for a somewhat more modern take on it.
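For example, a minimal sketch of the fix applied to the path from the traceback (the written line is just a placeholder):

import os

fileName = ".\\filesForAssignment1\\file1000"

# Create the missing intermediate directory first (no error if it already exists);
# open() in "w" mode can then create the file itself.
os.makedirs(os.path.dirname(fileName), exist_ok=True)
with open(fileName, "w", encoding="utf-8") as f:
    f.write("42\n")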
There's a bunch of other minor issues in the script:
- the main function should generally be "gated" behind an if __name__ == '__main__' check, so the module can be imported without running it
- explicitly closing files has fallen out of favor, as it's unreliable (an exception can skip the close call); a with block is preferred
- when opening files in "text" mode (the default) you should always provide an encoding
- pathlib is fun, and it also means you don't have to deal with path separators and the like
- unless it's required to handle that case, I'd just let open(fname, 'r') error out if the file doesn't exist
Here's a version I think should be slightly improved:
import pathlib
import random
import time

def fillFile(fileSize, fileName):
    with fileName.open('w', encoding='utf-8') as f:
        for i in range(fileSize):
            r = random.randint(0, fileSize + 1000)
            f.write(f"{r}\n")

def readFile(fileName):
    with fileName.open(encoding='utf-8') as f:
        return [int(line) for line in f]

OUT_DIR = pathlib.Path.cwd().joinpath("filesForAssignment1")
FILE_SIZES = [1000, 5000, 10000, 25000, 50000, 100000, 200000]

def mainForFiles():
    # Create the files
    OUT_DIR.mkdir(parents=True, exist_ok=True)  # make sure the directory exists
    statFilePath = pathlib.Path("fileStats.txt")
    with statFilePath.open('w', encoding='utf-8') as statFile:
        statFile.write("fillFile")
        print("WRITING TO FILES")
        for i in FILE_SIZES:
            start = time.time()
            fillFile(i, OUT_DIR.joinpath(f'file{i}'))
            finish = time.time()
            statFile.write(f" {finish-start}")
            print(f"File Size = {i} Write Time = {finish-start}")
        statFile.write("\n")
        print("READING FILES")
        statFile.write("readFile")
        for i in FILE_SIZES:
            f = OUT_DIR.joinpath(f'file{i}')
            # Read the file
            start = time.time()
            alist = readFile(f)
            finish = time.time()
            statFile.write(f" {finish-start}")
            print(f"File Size = {i} File Read Time = {finish-start}")
        statFile.write("\n")

if __name__ == '__main__':
    mainForFiles()
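An optional extra tweak (my suggestion, not something raised above): time.perf_counter() is designed for measuring elapsed time and has a higher resolution than time.time(), e.g.:

start = time.perf_counter()
fillFile(i, OUT_DIR.joinpath(f'file{i}'))
finish = time.perf_counter()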

exit() is not doing what you want here; returning from the function instead lets the caller deal with the missing file:
def readFile(fileName):
    # Open file
    if os.path.exists(fileName):
        FILE = open(fileName, "r")
    else:
        print(fileName + " does not exist!")
        return
    # Read File
    alist = []
    for line in FILE:
        alist.append(int(line))
    FILE.close()
    return alist
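Note that with this version the caller gets None when the file is missing, so it should check for that; a minimal sketch inside the reading loop:

alist = readFile(fileName)
if alist is None:
    # file was missing; skip this entry instead of crashing
    continue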

Related

IOError: [Errno 2] No such file or directory: (Python: I am having this error even though the file does exist)

As the error output message shows, I have the .json files in a streamed tweets folder. The data exists; I'm not sure what is wrong with my path.
import json
import os, os.path

# counter keeps track of the unique tweet ids and is used later when
# prepending the ElasticSearch compatibility
counter = 1

# find all files in folderPath, does not include subdirectories or directories
folderPath = r"c:/Users/Katherine/Downloads/Final Project-.zip/Final Project/122_finalproject_part2/streamed_tweets"
files = next(os.walk(folderPath))[2]

# filecount keeps track of how many files are in the streamedtweets folder
filecount = len(files) - 1
print(filecount)

ctr = 0
for ctr in range(filecount):
    inFileName = open('streamedtweets/tweet_data_' + str(filecount) + '.json', 'r')
    outFileName = open('elastic_data/elastictwitter_data_' + str(filecount) + '.json', 'w')
ERROR:
  File "add_elastic.py", line 16, in <module>
    inFileName = open('streamedtweets/tweet_data_' + str(filecount) + '.json', 'r')
IOError: [Errno 2] No such file or directory: 'streamedtweets/tweet_data_32.json'
You expected tweet_data_ to be suffixed with 0, 1, 2, 3... but in actuality you're always appending 32 (the value of filecount), which doesn't seem to exist. A better way would be to loop directly over the files you have instead of guessing their names.
Try doing this instead:
folderPath = r"c:/Users/Katherine/Downloads/Final Project-.zip/Final
Project/122_finalproject_part2/streamed_tweets"
files = next(os.walk(folderPath))[2]
for i, file in enumerate(files):
if file.startswith('tweet_data_') and file.endswith('.json'):
inFileName = open(file, 'r')
outFileName = open('elastic_data/elastictwitter_data_' + str(i) + '.json', 'w')
And even better way would be to use the with statement to manage your files opening and closing:
for i, file in enumerate(files):
if file.startswith('tweet_data_') and file.endswith('.json'):
with open(file, 'r') as inFileName, open('elastic_data/elastictwitter_data_' + str(i) + '.json', 'w') as outFileName:
# do something with inFileName / outFileName
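As a variation (just a sketch, pathlib is not required for the fix), Path.glob can do the prefix/suffix filtering for you:

from pathlib import Path

folder = Path(r"c:/Users/Katherine/Downloads/Final Project-.zip/Final Project/122_finalproject_part2/streamed_tweets")
for i, path in enumerate(sorted(folder.glob('tweet_data_*.json'))):
    with path.open('r') as inFile, open('elastic_data/elastictwitter_data_' + str(i) + '.json', 'w') as outFile:
        # do something with inFile / outFile
        pass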

How to skip unhashable (corrupt) files while md5 fingerprinting?

The code below makes an md5/metadata fingerprint, but crashes on files with unknown corruption (e.g., files that can be copied, and mostly even opened, but that cannot be hashed or zipped up [to disguise their corruption]).
Question: How can one make this code skip or ignore any and all problem files and just do the rest? Imagine 1 million files on 8 TB. Otherwise I leave it running with no real-time monitoring of progress, and 2 days later I find out that nothing got hashed because a couple of problem files made the code hang.
Part of the code (see full code below):
def createBasicInfoListFromDisk():
    global diskCompareListDetails, onlyFileNameOnDisk, driveLetter, walk_dir
    walk_dir = os.path.abspath(walk_dir)
    for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True):
        for filename in files:
            file_path = os.path.join(root, filename)
            temp = file_path.split(":")
            driveLetter = temp[0]
            filePathWithoutDriveLetter = temp[1]
            fileSize = os.path.getsize(file_path)
            mod_on = get_last_write_time(file_path)
            print('\t- file %s (full path: %s)' % (filename, file_path))
            print('FileName : {filename} is of size {size} and was modified on{mdt}'.format(filename=file_path, size=fileSize, mdt=mod_on))
            diskCompareListDetails.append("\"" + filePathWithoutDriveLetter + "\",\"" + str(fileSize) + "\",\"" + mod_on + '"')
            onlyFileNameOnDisk.append("\"" + filePathWithoutDriveLetter + "\"")
    return
Error:
FileName : T:\problemtest\problemfile.doc is of size 27136 and was modified on2010-10-10 13:58:32
Traceback (most recent call last):
  File "t:\scripts\test.py", line 196, in <module>
    createBasicInfoListFromDisk()
  File "t:\scripts\test.py", line 76, in createBasicInfoListFromDisk
    mod_on = get_last_write_time(file_path)
  File "t:\scripts\test.py", line 61, in get_last_write_time
    convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
OSError: [Errno 22] Invalid argument
Full code:
import os
import sys
import time
import datetime
import difflib
import decimal
import hashlib
import shutil
from pip._vendor.distlib.compat import raw_input

csvListDetails = list()
csvCompareListDetails = list()
diskCompareListDetails = list()
onlyFileNameOnDisk = list()
addedFiles = list()
removedFiles = list()
driveLetter = ""
finalFilesToChange = list()
finalFilesToDelete = list()
changedFiles = list()
csvfilewithPath = "md5.csv"
walk_dir = ""

def findAndReadCSVFile(fileName):
    global csvListDetails
    global csvCompareListDetails
    haveIgnoredLine = 0
    foundFile = 0
    try:
        inputFileHandler = open(fileName, "rt", encoding='utf-8')
        update_time = get_last_write_time(fileName)
        print("\n Found md5.csv, last updated on: %s" % update_time)
        foundFile = 1
    except (OSError, IOError, FileNotFoundError):
        print("\n md5.csv not found. Will create a new one.")
        return foundFile
    for line in inputFileHandler:
        if (haveIgnoredLine == 0):
            haveIgnoredLine = 1
            continue
        rowItem = line.replace("\n", "").split('","')
        csvCompareListDetails.append('"' + rowItem[3] + ',"' + rowItem[2] + '","' + rowItem[1] + '"')
        lineDetails = list()
        for detailNum in range(0, len(rowItem)):
            lineDetails.append('"' + (rowItem[detailNum].replace('"', '')) + '"')
        csvListDetails.append(lineDetails)
    inputFileHandler.close()
    return foundFile

def get_last_write_time(filename):
    st = os.stat(filename)
    convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
    return convert_time_to_human_readable

def createBasicInfoListFromDisk():
    global diskCompareListDetails, onlyFileNameOnDisk, driveLetter, walk_dir
    walk_dir = os.path.abspath(walk_dir)
    for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True):
        for filename in files:
            file_path = os.path.join(root, filename)
            temp = file_path.split(":")
            driveLetter = temp[0]
            filePathWithoutDriveLetter = temp[1]
            fileSize = os.path.getsize(file_path)
            mod_on = get_last_write_time(file_path)
            print('\t- file %s (full path: %s)' % (filename, file_path))
            print('FileName : {filename} is of size {size} and was modified on{mdt}'.format(filename=file_path, size=fileSize, mdt=mod_on))
            diskCompareListDetails.append("\"" + filePathWithoutDriveLetter + "\",\"" + str(fileSize) + "\",\"" + mod_on + '"')
            onlyFileNameOnDisk.append("\"" + filePathWithoutDriveLetter + "\"")
    return

def compareLogAndDiskLists():
    global addedFiles, removedFiles
    diff = difflib.unified_diff(csvCompareListDetails, diskCompareListDetails, fromfile='file1', tofile='file2', lineterm='', n=0)
    lines = list(diff)[2:]
    addedFiles = [line[1:] for line in lines if line[0] == '+']
    removedFiles = [line[1:] for line in lines if line[0] == '-']
    return

def displayInfoForUserInput():
    global finalFilesToChange, finalFilesToDelete
    changedOrNewFileCount = 0
    noLongerExistingFilesCount = 0
    totalSizeOfChange = 0
    for line in addedFiles:
        if line not in removedFiles:
            changedOrNewFileCount = changedOrNewFileCount + 1
            elements = line.replace("\n", "").split('","')
            sizeOfFile = int(elements[1].replace('"', ''))
            totalSizeOfChange = totalSizeOfChange + sizeOfFile
            finalFilesToChange.append(elements[0] + '"')
    for line in removedFiles:
        elements = line.split('","')
        if elements[0] + '"' not in onlyFileNameOnDisk:
            noLongerExistingFilesCount = noLongerExistingFilesCount + 1
            finalFilesToDelete.append(elements[0] + '"')
    GBModSz = decimal.Decimal(totalSizeOfChange) / decimal.Decimal('1073741824')
    print("\n New or modified files on drive: {} (need to hash)".format(changedOrNewFileCount))
    print(" Obsolete lines in md5.csv (files modified or not on drive): {} (lines to delete)".format(noLongerExistingFilesCount))
    print(" {} files ({:.2f} GB) needs to be hashed.".format(changedOrNewFileCount, GBModSz))
    userInput = raw_input("\n Proceed with hash? (Y/N, Yes/No) ")
    if (userInput.strip().upper() == "Y" or userInput.strip().upper() == "YES"):
        print("Continuing Processing...")
    else:
        print("You opted not to continue, Exiting")
        sys.exit()
    return

def processFiles(foundFile):
    if (foundFile == 1):
        oldFileName = walk_dir + "/md5.csv"
        shutil.copy(oldFileName, getTargetFileName(oldFileName))
    BLOCKSIZE = 1048576 * 4
    global changedFiles
    for fileToHash in finalFilesToChange:
        hasher = hashlib.new('md5')
        fileToUse = driveLetter + ":" + fileToHash.replace('"', '')
        with open(fileToUse, 'rb') as afile:
            buf = afile.read(BLOCKSIZE)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(BLOCKSIZE)
        fileDetails = list()
        fileDetails.append(hasher.hexdigest())
        fileDetails.append(get_last_write_time(fileToUse))
        fileDetails.append(os.path.getsize(fileToUse))
        fileDetails.append(fileToHash)
        changedFiles.append(fileDetails)
    return

def getTargetFileName(oldFileName):
    targetFileName = walk_dir + "/generated_on_" + get_last_write_time(oldFileName).replace(" ", "_").replace("-", "").replace(":", "")
    targetFileName = targetFileName + "__archived_on_" + datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    targetFileName = targetFileName + "__md5.csv"
    return targetFileName

def writeCSVFile(fileName):
    try:
        outputFileHandler = open(fileName, "wt", encoding='utf-8')
        outputFileHandler.write("\"md5Hash\",\"LastWriteTime\",\"Length\",\"FullName\"\n")
        for details in csvListDetails:
            if details[3] in finalFilesToDelete:
                continue
            if details[3] in finalFilesToChange:
                continue
            outputFileHandler.write("{},{},{},{}\n".format(details[0], details[1], details[2], details[3]))
        for details in changedFiles:
            outputFileHandler.write("\"{}\",\"{}\",\"{}\",{}\n".format(details[0], details[1], details[2], details[3]))
        outputFileHandler.close()
    except (OSError, IOError, FileNotFoundError) as e:
        print("ERROR :")
        print("File {} is either not writable or some other error: {}".format(fileName, e))
    return

if __name__ == '__main__':
    walk_dir = raw_input("\n Enter drive or directory to scan: ")
    csvfilewithPath = walk_dir + "/md5.csv"
    print("\n Drive to scan: " + walk_dir)
    foundFile = 0
    foundFile = findAndReadCSVFile(csvfilewithPath)
    createBasicInfoListFromDisk()
    compareLogAndDiskLists()
    displayInfoForUserInput()
    processFiles(foundFile)
    writeCSVFile(csvfilewithPath)
Trying this fix, no luck:
def get_last_write_time(filename):
    try:
        st = os.stat(filename)
        convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
        return convert_time_to_human_readable
    except OSError:
        pass
    return "ERROR"

def createBasicInfoListFromDisk():
I agree with IMCoins, and I'm very curious why the except isn't catching the error.
So the first thing I would do is go to the source where the OSError is being raised and try to catch it explicitly.
def get_last_write_time(filename):
    try:
        st = os.stat(filename)
        convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
        return convert_time_to_human_readable
    except OSError:
        pass
    return "ERROR"  # or whatever string you want to add
Updated answer, for the updated post.
As stated earlier, an except statement with the exception type specified catches every exception of that type. So, in order to do what you want... I'm afraid the possible answers are either:
- make a method that identifies corrupted files and handles them properly, or
- wrap every part of your code where there could be an error in a try/except statement (see the sketch below).
Let me warn you about the second solution though: sometimes there are system errors that you do not want to silence. I believe you should print the exception that you catch, in order to identify further problems you may encounter.
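For illustration, a minimal sketch of that second approach applied to the per-file body of the question's createBasicInfoListFromDisk (names taken from the posted code):

for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True):
    for filename in files:
        file_path = os.path.join(root, filename)
        try:
            fileSize = os.path.getsize(file_path)
            mod_on = get_last_write_time(file_path)
        except OSError as e:
            # print the exception so problem files can be investigated later,
            # then move on instead of crashing the whole scan
            print("Skipping {}: {}".format(file_path, e))
            continue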
Just so you know, in case you missed it: your error is not raised inside a try/except statement. Your error comes from (if I copied and pasted properly into my editor) line 196, createBasicInfoListFromDisk(), then line 76, mod_on = get_last_write_time(file_path).
As you also mentioned you are using Python 3.x, I suggest looking into the suppress function (https://docs.python.org/3/library/contextlib.html#contextlib.suppress).
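A minimal sketch of what that could look like here (suppress silently skips the rest of the block on a matching exception, so the fallback return still runs):

import contextlib

def get_last_write_time(filename):
    with contextlib.suppress(OSError):
        st = os.stat(filename)
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
    return "ERROR"  # reached only when os.stat/strftime raised OSError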
I hope this helps.

Python 3: urlextract package, PermissionError

I am using Windows 10 x64, with Python 3.6.1 x86.
I have this script from a few months ago which was working fine, but right now it gives me a weird error. The script is a simple one that extracts URLs from tweets saved in .csv files.
This is the script:
import datetime
from urlextract import URLExtract

twitter_files_list = ['File1.csv', 'File2.csv', 'File3.csv']
input_path = my_path

# Find domain of URL
def find_domain(url):
    return url.split("//")[-1].split("/")[0]

# Clean domain from useless chars
def clean_domain(domain):
    domain = domain.replace("[", "")
    domain = domain.replace("]", "")
    domain = domain.replace("\'", "")
    return domain

# Extract URLs from Tweets
def url_extract(filename):
    print('\n' + filename + ':')
    url_counter = 0
    url_file = open('extracted_urls/urls_' + filename, 'a')
    # Open file
    f = open(input_path + filename, "r", encoding="utf8")
    lines = f.readlines()
    # Search for contents of column "text"
    text = []
    for x in lines:
        text.append(x.split('\t')[4])
    # Close file
    f.close()
    extractor = URLExtract()
    for i in range(len(text)):
        try:
            if extractor.find_urls(text[i]):  # Check if URL exists
                url = extractor.find_urls(text[i])
                domain = find_domain(str(url))
                if not " " in domain:
                    url_file.write(str(clean_domain(domain)) + "\n")
                    url_counter += 1
        except 'Not Found':
            continue
    url_file.close()

# Main
if __name__ == '__main__':
    print('\nURL Characterization:\n')
    # Start timer
    start = datetime.datetime.now()
    # Find the unique usernames for every file
    for twitter_file in twitter_files_list:
        print('Searching ' + str(twitter_file) + '...')
        url_extract(twitter_file)
    # End timer
    end = datetime.datetime.now()
    # Print results
    print("\nProcess finished")
    print("Total time: " + str(end - start))
This gives me the following error:
Traceback (most recent call last):
  File "C:/Users/Aventinus/url_analysis/url_extractor.py", line 77, in <module>
    url_extract(twitter_file)
  File "C:/Users/Aventinus/url_analysis/url_extractor.py", line 50, in url_extract
    extractor = URLExtract()
  File "C:\Program Files (x86)\Python36-32\lib\site-packages\urlextract.py", line 65, in __init__
    if not self._download_tlds_list():
  File "C:\Program Files (x86)\Python36-32\lib\site-packages\urlextract.py", line 114, in _download_tlds_list
    with open(self._tld_list_path, 'w') as ftld:
PermissionError: [Errno 13] Permission denied: 'C:\\Program Files (x86)\\Python36-32\\lib\\site-packages\\.tlds'
I have no idea how to interpret this.
You can try running the script as administrator. The traceback shows urlextract trying to write its cached TLD list (.tlds) into the site-packages directory under C:\Program Files (x86), and writing there requires elevated permissions.

Read all files from folder and edit

I am trying to read all the fasta files from a test folder and append the file name to every header in each individual file. The code works for the first file but doesn't proceed to the second file and returns an error. Could you help me find the bug in my code or edit it? Thanks
import sys, glob, os, string

header = ''
check = 0
path = "./test/"
dirs = os.listdir(path)
for file in dirs:
    fp = open(file, "r")
    fpx = open('%s_output.txt' % file, 'w')
    for line in fp:
        if line.startswith('>'):
            line = line.rstrip()
            check = check + 1
            if check >= 1:
                header = line
                fpx.write(header + '_' + file + '\n')
        else:
            line = line.rstrip()
            fpx.write(line + '\n')
It would be good to provide the error message you are getting! I think this must fail with "File not found", because you try to open the file by its bare name instead of by its path. Try fp = open(os.path.join(path, file), "r"):
import sys, glob, os, string

header = ''
check = 0
path = "./test/"
dirs = os.listdir(path)
for file in dirs:
    fp = open(os.path.join(path, file), "r")
    fpx = open('%s_output.txt' % file, 'w')
    for line in fp:
        if line.startswith('>'):
            line = line.rstrip()
            check = check + 1
            if check >= 1:
                header = line
                fpx.write(header + '_' + file + '\n')
        else:
            line = line.rstrip()
            fpx.write(line + '\n')
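As a side note (not needed for the fix), a with block would guarantee that both files are closed even if an error occurs mid-file; a minimal sketch of the same loop:

for file in os.listdir(path):
    with open(os.path.join(path, file), "r") as fp, open('%s_output.txt' % file, 'w') as fpx:
        for line in fp:
            line = line.rstrip()
            if line.startswith('>'):
                # append the source file name to each fasta header
                fpx.write(line + '_' + file + '\n')
            else:
                fpx.write(line + '\n')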

Python gzip CRC check failed

I have a folder with huge text files. Each one is gzipped and weighs several gigabytes.
I wrote a piece of code to split the content of each gzip file: each gzip file is opened with gzip, then chunks of a specified number of lines are read and written to a new gzip file.
Here is the code, in file file_compression.py:
import sys, os, file_manipulation as fm
import gzip

def splitGzipFile(fileName, dest=None, chunkPerSplit=100, linePerChunk=4, file_field_separator="_", zfill=3
                  , verbose=False, file_permission=None, execute=True):
    """
    Splits a gz file into chunk files.
    :param fileName:
    :param chunkPerSplit:
    :param linePerChunk:
    :return:
    """
    absPath = os.path.abspath(fileName)
    baseName = os.path.basename(absPath)
    dirName = os.path.dirname(absPath)
    destFolder = dirName if dest is None else dest

    ## Compute file fields
    rawBaseName, extensions = baseName.split(os.extsep, 1)
    if not str(extensions).startswith("."):
        extensions = "." + extensions
    file_fields = str(rawBaseName).split(file_field_separator)
    first_fields = file_fields[:-1] if file_fields.__len__() > 1 else file_fields
    first_file_part = file_field_separator.join(first_fields)
    last_file_field = file_fields[-1] if file_fields.__len__() > 1 else ""
    current_chunk = getCurrentChunkNumber(last_file_field)
    if current_chunk is None or current_chunk < 0:
        first_file_part = rawBaseName

    ## Initialize chunk variables
    linePerSplit = chunkPerSplit * linePerChunk
    # chunkCounter = 0
    chunkCounter = 0 if current_chunk is None else current_chunk - 1
    for chunk in getFileChunks(fileName, linePerSplit):
        print "writing " + str(str(chunk).__len__()) + " ..."
        chunkCounter += 1
        oFile = fm.buildPath(destFolder) + first_file_part + file_field_separator + str(chunkCounter).zfill(zfill) + extensions
        if execute:
            writeGzipFile(oFile, chunk, file_permission)
            if verbose:
                print "Splitting: created file ", oFile

def getCurrentChunkNumber(chunk_field):
    """
    Tries to guess an integer from a string.
    :param chunk_field:
    :return: an integer, None if failure.
    """
    try:
        return int(chunk_field)
    except ValueError:
        return None

def getFileChunks(fileName, linePerSplit):
    with gzip.open(fileName, 'rb') as f:
        print "gzip open"
        lineCounter = 0
        currentChunk = ""
        for line in f:
            currentChunk += line
            lineCounter += 1
            if lineCounter >= linePerSplit:
                yield currentChunk
                currentChunk = ""
                lineCounter = 0
        if not currentChunk == '':
            yield currentChunk

def writeGzipFile(file_name, content, file_permission=None):
    import gzip
    with gzip.open(file_name, 'wb') as f:
        if not content == '':
            f.write(content)
    if file_permission is not None and type(file_permission) == int:
        os.chmod(file_name, file_permission)
This task is multiprocess: a process is created for each file to be split. Each file is opened and split only once before being erased; I made sure of that by recording them in a list:
import os
import time
from tools.file_utils import file_compression as fc, file_manipulation as fm
import multiprocessing
from multiprocessing import Process, Queue, Manager

manager = Manager()
split_seen = manager.list()

def processFile(filePath, destFolder, verbose=True):
    global split_seen
    if filePath in split_seen:
        print "Duplicate file processed: " + str(filePath)
        time.sleep(3)
    print "adding", filePath, split_seen.__len__()
    split_seen.append(filePath)
    fc.splitGzipFile(filePath, dest=destFolder, chunkPerSplit=4000000
                     , linePerChunk=4
                     , verbose=True
                     , file_permission=0770
                     , zfill=3
                     )
    os.remove(filePath)

files = [...]  # list is full of gzip files.
processList = []
sampleDir = "sample/dir/"

for file in files:
    filePath = sampleDir + str(file)
    p = Process(target=processFile, args=(filePath, sampleDir, True))
    p.start()
    processList.append(p)

## Join the processes
for p in processList:
    p.join()
So far the code has always run fine. But today I had an issue with gzip files' CRC corruption:
Process Process-3:72:
Traceback (most recent call last):
  ...
  File "/.../tools/file_utils/file_compression.py", line 43, in splitGzipFile
    for chunk in getFileChunks(fileName, linePerSplit):
  File "/.../tools/file_utils/file_compression.py", line 70, in getFileChunks
    for line in f:
  File "/.../python2.7/lib/python2.7/gzip.py", line 450, in readline
    c = self.read(readsize)
  File "/.../python2.7/lib/python2.7/gzip.py", line 256, in read
    self._read(readsize)
  File "/.../python2.7/lib/python2.7/gzip.py", line 320, in _read
    self._read_eof()
  File "/.../python2.7/lib/python2.7/gzip.py", line 342, in _read_eof
    hex(self.crc)))
IOError: CRC check failed 0xddbb6045 != 0x34fd5580L
What could be the origin of this issue? I have to state again that so far it has always worked; the folders and files always have the same structure. The difference in this instance, perhaps, is that my script is processing more gzip files than usual, maybe twice as many.
Could it be a matter of the same files being accessed at the same time? I seriously doubt that; I made sure it is not the case by registering each accessed file in my split_seen list.
I would take any hint, as I have no more clues about where to look.
EDIT 1
Maybe some open files were accessed by someone else, or another program? I cannot ask for and rely on testimonials. So as a start, if I were to use a multiprocessing.Lock, would it prevent any other thread, process, program, user, etc. from modifying the file? Or is it limited to Python only? I cannot find any documentation on that.
I got the exact same error on code that had been running for months. It turns out the source was corrupted for that particular file. I went back to an old file and it worked fine, and I used a newer file and it also worked fine.
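If you suspect a corrupted source, a quick way to check a file before splitting it is to read the archive end to end, which forces the CRC check at EOF; a minimal sketch (Python 2, to match the question's code):

import gzip

def isValidGzip(fileName):
    try:
        with gzip.open(fileName, 'rb') as f:
            # read in 1 MiB blocks; a corrupt member raises IOError at/before EOF
            while f.read(1048576):
                pass
        return True
    except IOError as e:
        print "Corrupt gzip file %s: %s" % (fileName, e)
        return False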
I had the same issue. I just deleted the old file and re-ran the code:
rm -rf /tmp/imagenet/
HTH
