I have this code to consolidate thousands of CSV files from a folder (a Box Drive folder); however, from time to time I get a permission error on some of the files. When I start the code over, it is fine, but the error pops up more or less randomly, on a random file in the directory.
What I need to do is just wait a few seconds and try to open that file again (not skip it).
So far I have this, but it is not working as expected:
# Asker's first attempt, shown as posted with its indentation restored.
# NOTE: there is no retry loop here (the `while True` is commented out), so
# after a PermissionError the script sleeps and then moves on to the next
# file instead of re-reading the one that failed.
with open(OutputTo, "wb") as fout:
    with open(sampleFile, "rb") as f:  # first file to get header
        fout.write(f.read())
    for fs in toLoad:  # now the rest
        with open(path + fs, "rb") as f:
            #while True: # infinite loop
            try:
                next(f)  # skip the header
                fout.write(f.read())
            except PermissionError:
                #second try usually works.
                failed = failed + 1  # counter
                if failed > 10:
                    print('\n Script failed more than 10 time so I stopped it.')
                    break
                else:
                    print('\n Perm error, trying again in 5 sec.')
                    time.sleep(5)
By surrounding the file opening inside a while True loop with a try catch block, and waiting n seconds when PermissionError shows up. Be careful though, if you don't want to skip the file, you have to be sure the PermissionError is just a temporary thing, otherwise you end up in an endless loop constantly retrying (I would implement a break to stop after n tries)
import glob, time
# Open every CSV in the current directory, retrying a file that is
# temporarily locked instead of skipping it.
MAX_ATTEMPTS = 10  # stop retrying a file that stays locked
RETRY_DELAY = 5    # seconds to wait between attempts

filenames = glob.glob('*.csv')
for file in filenames:
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # `with` closes the handle on success and on failure; when open()
            # itself fails there is no handle, so nothing must be closed.
            with open(file, "rb") as hndl:
                pass  # Do stuff
            break
        except PermissionError:
            if attempt == MAX_ATTEMPTS:
                raise  # the lock is not temporary: surface the error
            time.sleep(RETRY_DELAY)
I made a couple of changes according to @Michael Butscher's comment (credit to him).
By the way, the code never failed more than twice in a row.
# Concatenate every file in `toLoad` into `OutputTo`, keeping only the header
# of `sampleFile`.  A file that raises PermissionError is retried after five
# seconds; after more than 10 consecutive failures the whole merge stops.
with open(OutputTo, "wb") as fout:
    with open(sampleFile, "rb") as f:  # first file to get header
        fout.write(f.read())
    total = len(toLoad)
    gave_up = False
    for done, fs in enumerate(toLoad):  # now the rest
        while True:  # retry this file until it is read or we give up
            try:
                # open() sits inside the try so that a PermissionError raised
                # while opening is retried, not only one raised while reading.
                with open(path + fs, "rb") as f:
                    next(f, None)  # skip the header (an empty file has none)
                    data = f.read()
            except PermissionError:
                failed = failed + 1  # counter
                if failed > 10:
                    print('\n Script failed more than 10 time so I stopped it.')
                    gave_up = True
                    break
                print('\n Perm error, trying again in 5 sec. ({})'.format(path + fs))
                time.sleep(5)
            else:
                fout.write(data)
                # status, estimated from the average time per file so far
                elapsed = time.time() - startTime
                expectedTime = round(((elapsed / (done + 1)) * (total - done)) / 60, 2)
                percentage = round((done / (total + 1)) * 100, 2)
                success = success + 1  # counter
                sys.stdout.write('\r')  # overwrite in place
                sys.stdout.write('completed {} perc. expected time to end: {} minutes, success: {}, failed: {}'.format(percentage, expectedTime, success, failed))
                sys.stdout.flush()
                failed = 0  # reset counter if success
                break
        if gave_up:
            break  # leave the file loop too, so the script really stops
Related
I want to convert this corpus, hu.txt.xz (15 GB, which becomes around 60 GB after unpacking), into smaller text files, each with less than 1 GB or 100000 lines.
The expected output:
| siplit_1.txt
| siplit_2.txt
| siplit_3.txt
.....
| siplit_n.txt
I have this script on a local machine, but it doesn't work: it just keeps loading without processing anything, I think because the data is too big:
import fun
import sys
import os
import shutil
# //-----------------------
# Retrieve and return output file max lines from input
def how_many_lines_per_file():
    """Prompt for the maximum number of lines per output file.

    Returns the value as a positive int.  Prints an error and exits with
    status 1 when the input is not an integer or is smaller than 1.
    """
    try:
        max_lines = int(input("Max lines per output file: "))
        if max_lines < 1:
            # zero or a negative limit cannot describe a file size
            raise ValueError("line limit must be positive")
    except ValueError:
        print("Error: Please use a valid number.")
        sys.exit(1)
    return max_lines
# //-----------------------
# Retrieve input filename and return file pointer
def file_dir():
    """Prompt for the input filename and return it opened for reading.

    Prints an error and exits with status 1 when the file cannot be opened.
    """
    filename = input("Input filename: ")
    try:
        return open(filename, 'r')
    except FileNotFoundError:
        print("Error: File not found.")
        sys.exit(1)
    except OSError as error:
        # e.g. the name is a directory or the file is not readable
        print(f"Error: Could not open file: {error}")
        sys.exit(1)
# //-----------------------
# Create output file
def create_output_file_dir(num, filename):
    """Open split number `num` for `filename` in append mode and return it.

    The file is ./data/output_<filename>/split_<num>.txt; the directory must
    already exist.
    """
    return open(f"./data/output_{filename}/split_{num}.txt", "a")
# //-----------------------
# Create output directory
def create_output_directory(filename):
    """Create an empty ./data/output_<filename> directory.

    An existing directory of that name is removed first.  Prints an error
    and exits with status 1 when the directory cannot be (re)created.
    """
    output_path = f"./data/output_{filename}"
    try:
        if os.path.exists(output_path):  # Remove directory if exists
            shutil.rmtree(output_path)
        # makedirs also creates ./data when it is missing; mkdir would fail
        os.makedirs(output_path)
    except OSError:
        print("Error: Failed to create output directory.")
        sys.exit(1)
def ch_dir():
    """Change into ./data, printing the working directory before and after."""
    # Print the current working directory
    print("Current working directory: {0}".format(os.getcwd()))
    # Change the current working directory
    os.chdir('./data')
    # Print the current working directory
    print("Current working directory: {0}".format(os.getcwd()))
# //-----------------------
def split_file():
    """Split the chosen input file into numbered output files.

    This is the asker's version with its indentation restored.
    NOTE: readlines() loads the whole input into memory at once.
    NOTE: the "empty file" check compares two values that never change
    inside the loop, so it is either always true or always false.
    """
    try:
        line_count = 0
        split_count = 1
        max_lines = how_many_lines_per_file()
        # ch_dir()
        input_file = fun.file_dir()
        input_lines = input_file.readlines()
        create_output_directory(input_file.name)
        output_file = create_output_file_dir(split_count, input_file.name)
        for line in input_lines:
            output_file.write(line)
            line_count += 1
            # Create new output file if current output file's line count is greater than max line count
            if line_count > max_lines:
                split_count += 1
                line_count = 0
                output_file.close()
                # Prevent creation of an empty file after splitting is finished
                if not len(input_lines) == max_lines:
                    output_file = create_output_file_dir(split_count, input_file.name)
    # Handle errors
    except Exception as e:
        print(f"An unknown error occurred: {e}")
    # Success message
    else:
        print(f"Successfully split {input_file.name} into {split_count} output files!")


# //-----------------------
if __name__ == "__main__":
    split_file()
Is there any Python script or deep learning tool to split the files so that I can use them for the next task?
By calling readlines() on the input file handle, you are reading (or trying to) the whole file into memory at the same time. You can do this instead to process the file one line at a time, never having more than a single line in memory:
input_file = fun.file_dir()
...
for line in input_file:
...
Another issue to be aware of is that this line:
if not len(input_lines) == max_lines:
output_file = create_output_file_dir(split_count, input_file.name)
is likely not doing what you think it is. Neither input_lines or max_lines will ever change inside the loop, so this will either always create a new file or never will. Unless you happen to process a file with exactly max_lines lines in it, this will always be true. This is not a big deal, but I think as your code is now you're going to end up with an extra empty file. You need to change the logic anyway, so you'll have to rethink how to make this work.
UPDATE:
Here's how I would modify the logic to do the right thing regarding opening each of the output files:
input_file = fun.file_dir()
# output_file = create_output_file_dir(split_count, input_file.name)
output_file = None
...
for line in input_file:
# Open a new output file if we don't have one open
if not output_file:
output_file = create_output_file_dir(split_count, input_file.name)
output_file.write(line)
line_count += 1
# Close the current output file if the line count has reached its max
if line_count > max_lines:
split_count += 1
line_count = 0
output_file.close()
output_file = None
The key idea here is that you can't know if you need a new output file until you have tried to read the next line after closing the current output file. This logic only opens an output file when it has a line to write out and there is no open output file.
You're trying to allocate a big file into memory which is not possible.
Instead of reading all the content at once just read line by line and process it.
I've fixed the bug seen by #CryptoFool
import fun
import sys
import os
import shutil
# //-----------------------
# Retrieve and return output file max lines from input
def how_many_lines_per_file():
    """Ask the user for the per-file line limit and return it as an int.

    The limit must be an integer of at least 1; otherwise an error is
    printed and the program exits with status 1.
    """
    try:
        limit = int(input("Max lines per output file: "))
        if limit < 1:
            # a limit of zero or less cannot be honoured by the splitter
            raise ValueError("line limit must be positive")
    except ValueError:
        print("Error: Please use a valid number.")
        sys.exit(1)
    return limit
# //-----------------------
# Retrieve input filename and return file pointer
def file_dir():
    """Ask the user for the input filename and return the opened file.

    The file is opened in text read mode.  When it cannot be opened an
    error is printed and the program exits with status 1.
    """
    filename = input("Input filename: ")
    try:
        return open(filename, 'r')
    except FileNotFoundError:
        print("Error: File not found.")
        sys.exit(1)
    except OSError as error:
        # covers a directory name, missing permissions, and similar failures
        print(f"Error: Could not open file: {error}")
        sys.exit(1)
# //-----------------------
# Create output file
def create_output_file_dir(num, filename):
    """Return split file number `num` for `filename`, opened for appending.

    The path is ./data/output_<filename>/split_<num>.txt; its directory is
    expected to exist already.
    """
    return open(f"./data/output_{filename}/split_{num}.txt", "a")
# //-----------------------
# Create output directory
def create_output_directory(filename):
    """Make a fresh, empty ./data/output_<filename> directory.

    Any existing directory at that path is deleted first.  On failure an
    error is printed and the program exits with status 1.
    """
    output_path = f"./data/output_{filename}"
    try:
        if os.path.exists(output_path):  # Remove directory if exists
            shutil.rmtree(output_path)
        # makedirs creates the ./data parent too, which mkdir would not
        os.makedirs(output_path)
    except OSError:
        print("Error: Failed to create output directory.")
        sys.exit(1)
def ch_dir():
    """Switch the working directory to ./data, printing it before and after."""
    # Print the current working directory
    print("Current working directory: {0}".format(os.getcwd()))
    # Change the current working directory
    os.chdir('./data')
    # Print the current working directory
    print("Current working directory: {0}".format(os.getcwd()))
# //-----------------------
def split_file():
    """Split the chosen input file into files of at most `max_lines` lines.

    The input is streamed one line at a time, so it is never held in memory
    as a whole.  An output file is opened only once there is a line to put
    in it, so no empty trailing file is created.  Any error is reported on
    stdout instead of being raised.
    """
    try:
        max_lines = how_many_lines_per_file()
        # ch_dir()
        input_file = fun.file_dir()
        create_output_directory(input_file.name)
        split_count = 0  # number of output files actually created
        line_count = 0   # lines written to the current output file
        output_file = None
        try:
            for line in input_file:
                # Open a new output file if we don't have one open
                if output_file is None:
                    split_count += 1
                    output_file = create_output_file_dir(split_count, input_file.name)
                output_file.write(line)
                line_count += 1
                # Close the file as soon as it holds max_lines lines; `>`
                # would let every file grow to max_lines + 1 lines.
                if line_count >= max_lines:
                    output_file.close()
                    output_file = None
                    line_count = 0
        finally:
            # the last, partially filled file and the input are closed too
            if output_file is not None:
                output_file.close()
            input_file.close()
    # Handle errors
    except Exception as e:
        print(f"An unknown error occurred: {e}")
    # Success message
    else:
        print(f"Successfully split {input_file.name} into {split_count} output files!")


# //-----------------------
if __name__ == "__main__":
    split_file()
When reading or writing CSV files, sometimes the file can't be accessed:
The process cannot access the file because another process has locked a portion of the file
I want my code to retry the reading/writing until it works.
Here is a draft how i would make a while loop until the file could be read.
But how can I test "READING_DID_WORK"? Is there a way to test whether the task was successful? Or should I just test whether FILE is a list?
# Asker's draft, shown as posted with its indentation restored.
# NOTE: READING_DID_WORK is a placeholder from the question, not a real name.
timeout = time.time() + 120  #seconds
bool = True
while bool == True:
    time.sleep(0.5)  # sleep for 500 milliseconds
    if time.time() > timeout:
        syncresult="timeout"
        break
    with io.open(SlogFilePath,"r", encoding = "utf-16(LE)") as File:
        FILE = File.read().splitlines()
    if READING_DID_WORK:
        bool = False
    else:
        bool = True
OUT = FILE
You don't need the extra boolean (bool is a very bad variable name anyway) and you don't need READING_DID_WORK, just rely on the OSError that will be raised.
A simple wrapper function:
import time
...
def read_file_with_retry(file_name, encoding="utf-16(LE)"):
    """Return the lines of `file_name`, retrying until the read succeeds.

    Every OSError is followed by a 0.5 second pause and another attempt.
    There is no retry limit, so a file that can never be read makes this
    loop forever.
    """
    while True:
        try:
            with open(file_name, encoding=encoding) as f:
                file_content = f.readlines()
        except OSError:
            time.sleep(0.5)
        else:
            return file_content
To avoid a case of infinite loop, it is suggested to implement a max-retry mechanism:
import time
...
def read_file_with_retry(file_name, encoding="utf-16(LE)", max_retries=5, delay=0.5):
    """Return the lines of `file_name`, retrying a limited number of times.

    file_name:   path of the file to read.
    encoding:    text encoding passed to open().
    max_retries: how many times to retry after the first failed attempt.
    delay:       seconds to wait before each retry.

    Raises the last OSError once the retries are used up.
    """
    retry = 0
    while True:
        try:
            with open(file_name, encoding=encoding) as f:
                return f.readlines()
        except OSError:
            retry += 1
            if retry > max_retries:
                raise  # out of retries: fail now instead of sleeping first
            time.sleep(delay)
How to - let a function run only on the first startup?
I have tried creating a value-adding mechanism (adding 1 to a variable after startup) but I failed.
# Asker's attempt, shown as posted with its indentation restored.
# NOTE: the counter is spelled `oon` here but `onn` inside the function, and
# `onn = 1 + onn` reads a local name before it is assigned.
# NOTE: `contents == str` compares the file text with the `str` type itself.
result = _winreg.QueryValueEx(key, "MachineGuid")
ID = str(result)
licence_path = 'C:\\Program Files\\Common Files\\System\\read.txt'
oon = 0


def first_time_open_only():
    file = open(licence_path, 'w')
    file.write(ID[2:38])
    file.close()
    onn = 1 + onn


first_time_open_only()
with open(licence_path) as f:
    contents = f.read()
if contents == str:
    pass
else:
    root.destroy()
There is a way to solve this problem. To know on each run whether a function has been run before, save a flag to a file (for example with pickle) or to a database. The code below is a simple example in which the function runs only once. This kind of problem is solved by persisting state so that the program knows what happened on previous runs.
In this code, if it is the first run of program, the Flag.pkl would not exists, so the flag will be equal to zero and the function will run, but in second execution the flag will have 1 as its value and the function would not execute.
import pickle
import os.path
def runOnce():
    """Announce the first execution and store the flag 1 in ./Flag.pkl."""
    print("first time of execution")
    flag = 1
    with open('./Flag.pkl', 'wb') as f:
        pickle.dump(flag, f)
# Read the flag saved by an earlier run; a missing file means "never run".
# NOTE(review): pickle.load executes whatever the file contains, so
# ./Flag.pkl must not be writable by untrusted users.
if os.path.isfile('./Flag.pkl'):
    with open('./Flag.pkl','rb') as f:
        flag = pickle.load(f)
else:
    flag = 0
if flag ==0:
    runOnce()
else:
    print("This function has been executed before!")
Using this Python code I get printed lines of file in UPPERCASE but file remains unchanged (lowercase.)
def open_f():
    """Ask for filenames until one opens in "r+" mode; "done" exits.

    Python 2 code (raw_input).  Returns the opened file handle.
    """
    while True:
        fname=raw_input("Enter filename:")
        if fname != "done":
            try:
                fhand=open(fname, "r+")
                break
            except IOError:  # only a failed open should trigger a re-prompt
                print("WRONG!!!")
                continue
        else: exit()
    return fhand


# Asker's loop, shown as posted: it reads from and writes to the same handle,
# which is the problem the question is about.
fhand=open_f()
for line in fhand:
    ss=line.upper().strip()
    print(ss)
    fhand.write(ss)
fhand.close()
Can you suggest please why files remain unaffected?
Code:
def file_reader(read_from_file):
    """Return the whole text content of the file at `read_from_file`."""
    with open(read_from_file, 'r') as f:
        return f.read()
def file_writer(read_from_file, write_to_file):
    """Overwrite `write_to_file` with the text content of `read_from_file`."""
    with open(write_to_file, 'w') as f:
        f.write(file_reader(read_from_file))
Usage:
Create a file named example.txt with the following content:
Hi my name is Dmitrii Gangan.
Create an empty file called file_to_be_written_to.txt
Add this as the last line file_writer("example.txt", "file_to_be_written_to.txt") of your .py python file.
python <your_python_script.py> from the terminal.
NOTE: They all must be in the same folder.
Result:
file_to_be_written_to.txt:
Hi my name is Dmitrii Gangan.
This program should do as you requested and allows for modifying the file as it is being read. Each line is read, converted to uppercase, and then written back to the source file. Since it runs on a line-by-line basis, the most extra memory it should need would be related to the length of the longest line.
Example 1
def main():
    """Uppercase a user-chosen text file in place, one line at a time."""
    with get_file('Enter filename: ') as file:
        while True:
            position = file.tell()     # remember beginning of line
            line = file.readline()     # get the next available line
            if not line:               # check if at end of the file
                break                  # program is finished at EOF
            file.seek(position)        # go back to the line's start
            # NOTE(review): this overwrites the line where it sits, so it
            # presumably relies on upper() not changing the encoded length.
            file.write(line.upper())   # write the line in uppercase
def get_file(prompt):
    """Prompt until a file opens in read/write text mode and return it.

    Exits the program when input reaches end-of-file; an OSError is printed
    and the prompt is repeated.
    """
    while True:
        try:                                    # run and catch any error
            return open(input(prompt), 'r+t')   # r+t = read, write, text
        except EOFError:                        # see if user is finished
            raise SystemExit()                  # exit the program if so
        except OSError as error:                # check for file problems
            print(error)                        # report operation errors


if __name__ == '__main__':
    main()
The following is similar to what you see up above but works in binary mode instead of text mode. Instead of operating on lines, it processes the file in chunks based on the given BUFFER_SIZE and can operate more efficiently. The code under the main loop may replace the code in the loop if you wish for the program to check that it is operating correctly. The assert statements check some assumptions.
Example 2
BUFFER_SIZE = 1 << 20  # bytes handled per read/write step


def main():
    """Uppercase a user-chosen file in place, BUFFER_SIZE bytes at a time."""
    with get_file('Enter filename: ') as file:
        while True:
            position = file.tell()
            buffer = file.read(BUFFER_SIZE)
            if not buffer:
                return
            file.seek(position)
            file.write(buffer.upper())
            # The following code will not run but can replace the code in the loop.
            start = file.tell()
            buffer = file.read(BUFFER_SIZE)
            if not buffer:
                return
            stop = file.tell()
            assert file.seek(start) == start
            assert file.write(buffer.upper()) == len(buffer)
            assert file.tell() == stop
def get_file(prompt):
    """Prompt until a file opens in read/write binary mode and return it.

    Exits the program when input reaches end-of-file; an OSError is printed
    and the prompt is repeated.
    """
    while True:
        try:
            return open(input(prompt), 'r+b')
        except EOFError:
            raise SystemExit()
        except OSError as error:
            print(error)


if __name__ == '__main__':
    main()
I suggest the following approach:
1) Read/close the file, return the filename and content
2) Create a new file with above filename, and content with UPPERCASE
def open_f():
    """Ask for filenames until one can be read; "done" exits.

    Python 2 code (raw_input).  Returns the filename and its full content.
    """
    while True:
        fname=raw_input("Enter filename:")
        if fname != "done":
            try:
                with open(fname, "r+") as fhand:
                    ss = fhand.read()
                break
            except IOError:  # only a failed open/read should re-prompt
                print("WRONG!!!")
                continue
        else: exit()
    return fname, ss


# The file was closed after reading, so it is safe to reopen it for writing.
fname, ss =open_f()
with open(fname, "w+") as fhand:
    fhand.write(ss.upper())
Like already alluded to in comments, you cannot successively read from and write to the same file -- the first write will truncate the file, so you cannot read anything more from the handle at that point.
Fortunately, the fileinput module offers a convenient inplace mode which works exactly like you want.
import fileinput
# With inplace=True, fileinput redirects standard output into the file being
# read, so each print() below replaces the line that was just read.
for line in fileinput.input(somefilename, inplace=True):
    print(line.upper().strip())
The error occurs at line 49 "fileSizeRemainingInBytes = os.path.getsize(inFile)"
inFile contains the file I want to gets size. From what I understood in the python documentation this should be correct. Can someone tell me what is the problem.
import sys, os
# Asker's script, shown as posted with its indentation restored and its
# print statements written so that they also parse on Python 3.
# NOTE: os.path.getsize() is given the file object here, not a path, and
# fileSizeRemainingInBytes is never decreased inside the loop.
buffer = 1000
try:
    #open file in binary mode for reading
    inFile = open(sys.argv[1],"rb")
    print("file name is:  " + inFile.name)
except IOError:
    #check for IOExceptions
    print("Eror opening file")
    sys.exit()
else:
    #create new directory for copying, create out file in new directory
    if (os.path.isdir("recv")):
        os.chdir("recv")
        try:
            outFile = open(inFile.name,"wb")
        except IOError:
            print("something went wrong creating the out file")
            sys.exit()
    else :
        os.mkdir("recv")
        os.chdir("recv")
        try:
            outFile = open(inFile.name,"wb")
        except IOError:
            print("something went wrong creating the out file")
            sys.exit()
#loop to copy bytes to new directory
fileSizeRemainingInBytes = os.path.getsize(inFile)
print("Initial size:  " + str(fileSizeRemainingInBytes))
while fileSizeRemainingInBytes > 0 :
    print(fileSizeRemainingInBytes)
    bytesToCopy = inFile.read(buffer);
    outFile.write(bytesToCopy);
inFile.close()
os.path.getsize takes a file path as an argument, not a file object. So you actually want to call os.path.getsize(inFile.name). Note that this won't give you the number of bytes remaining to copy; it'll just give you the size of the whole file every time it's evaluated. To get the number of bytes remaining, you'll have to keep track of the total number of bytes read and subtract this total from the file size.
Something like this should work:
import sys
import os
buffer = 1000  # bytes read per chunk


def copy_to_recv(src_path, dest_dir="recv", chunk_size=buffer):
    """Copy `src_path` into `dest_dir` chunk by chunk and report progress.

    src_path:   file to copy.
    dest_dir:   directory that receives the copy; created when missing.
    chunk_size: number of bytes read per step.

    Returns the path of the copy.
    """
    # Make the destination directory if it doesn't already exist
    if not os.path.isdir(dest_dir):
        os.mkdir(dest_dir)
    # Use only the file's base name: joining dest_dir with an absolute
    # source path would yield the source path itself, and opening that for
    # writing would truncate the file being copied.
    out_file_path = os.path.join(dest_dir, os.path.basename(src_path))
    bytes_to_read = os.path.getsize(src_path)
    bytes_read = 0
    with open(src_path, "rb") as in_file, open(out_file_path, "wb") as out_file:
        while True:
            chunk = in_file.read(chunk_size)
            if not chunk:  # an empty read means end of file
                break
            out_file.write(chunk)
            bytes_read += len(chunk)
            print("{} / {} bytes copied".format(bytes_read, bytes_to_read))
    return out_file_path


if __name__ == "__main__":
    copy_to_recv(sys.argv[1])