I want to convert this corpus hu.txt.xz 15GB which becomes around 60GB after unpacking to small versions of text files, each file with less than 1GB or 100000 lines
The expected output:
| siplit_1.txt
| siplit_2.txt
| siplit_3.txt
.....
| siplit_n.txt
I have this script on a local machine but doesn't work it just loads without process because bigdata as I think :
import fun
import sys
import os
import shutil
# //-----------------------
# Retrieve and return output file max lines from input
def how_many_lines_per_file():
try:
return int(input("Max lines per output file: "))
except ValueError:
print("Error: Please use a valid number.")
sys.exit(1)
# //-----------------------
# Retrieve input filename and return file pointer
def file_dir():
try:
filename = input("Input filename: ")
return open(filename, 'r')
except FileNotFoundError:
print("Error: File not found.")
sys.exit(1)
# //-----------------------
# Create output file
def create_output_file_dir(num, filename):
return open(f"./data/output_{filename}/split_{num}.txt", "a")
# //-----------------------
# Create output directory
def create_output_directory(filename):
output_path = f"./data/output_{filename}"
try:
if os.path.exists(output_path): # Remove directory if exists
shutil.rmtree(output_path)
os.mkdir(output_path)
except OSError:
print("Error: Failed to create output directory.")
sys.exit(1)
def ch_dir():
# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))
# Change the current working directory
os.chdir('./data')
# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))
# //-----------------------
def split_file():
try:
line_count = 0
split_count = 1
max_lines = how_many_lines_per_file()
# ch_dir()
input_file = fun.file_dir()
input_lines = input_file.readlines()
create_output_directory(input_file.name)
output_file = create_output_file_dir(split_count, input_file.name)
for line in input_lines:
output_file.write(line)
line_count += 1
# Create new output file if current output file's line count is greater than max line count
if line_count > max_lines:
split_count += 1
line_count = 0
output_file.close()
# Prevent creation of an empty file after splitting is finished
if not len(input_lines) == max_lines:
output_file = create_output_file_dir(split_count, input_file.name)
# Handle errors
except Exception as e:
print(f"An unknown error occurred: {e}")
# Success message
else:
print(f"Successfully split {input_file.name} into {split_count} output files!")
# //-----------------------
if __name__ == "__main__":
split_file()
Is there any python script or deep learning tool to split them for using the to next task
By calling readlines() on the input file handle, you are reading (or trying to) the whole file into memory at the same time. You can do this instead to process the file one line at a time, never having more than a single line in memory:
input_file = fun.file_dir()
...
for line in input_file:
...
Another issue to be aware of is that this line:
if not len(input_lines) == max_lines:
output_file = create_output_file_dir(split_count, input_file.name)
is likely not doing what you think it is. Neither input_lines or max_lines will ever change inside the loop, so this will either always create a new file or never will. Unless you happen to process a file with exactly max_lines lines in it, this will always be true. This is not a big deal, but I think as your code is now you're going to end up with an extra empty file. You need to change the logic anyway, so you'll have to rethink how to make this work.
UPDATE:
Here's how I would modify the logic to do the right thing regarding opening each of the output files:
input_file = fun.file_dir()
# output_file = create_output_file_dir(split_count, input_file.name)
output_file = None
...
for line in input_file:
# Open a new output file if we don't have one open
if not output_file:
output_file = create_output_file_dir(split_count, input_file.name)
output_file.write(line)
line_count += 1
# Close the current output file if the line count has reached its max
if line_count > max_lines:
split_count += 1
line_count = 0
output_file.close()
output_file = None
The key idea here is that you can't know if you need a new output file until you have tried to read the next line after closing the current output file. This logic only opens an output file when it has a line to write out and there is no open output file.
You're trying to allocate a big file into memory which is not possible.
Instead of reading all the content at once just read line by line and process it.
I've fixed the bug seen by #CryptoFool
import fun
import sys
import os
import shutil
# //-----------------------
# Retrieve and return output file max lines from input
def how_many_lines_per_file():
try:
return int(input("Max lines per output file: "))
except ValueError:
print("Error: Please use a valid number.")
sys.exit(1)
# //-----------------------
# Retrieve input filename and return file pointer
def file_dir():
try:
filename = input("Input filename: ")
return open(filename, 'r')
except FileNotFoundError:
print("Error: File not found.")
sys.exit(1)
# //-----------------------
# Create output file
def create_output_file_dir(num, filename):
return open(f"./data/output_{filename}/split_{num}.txt", "a")
# //-----------------------
# Create output directory
def create_output_directory(filename):
output_path = f"./data/output_{filename}"
try:
if os.path.exists(output_path): # Remove directory if exists
shutil.rmtree(output_path)
os.mkdir(output_path)
except OSError:
print("Error: Failed to create output directory.")
sys.exit(1)
def ch_dir():
# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))
# Change the current working directory
os.chdir('./data')
# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))
# //-----------------------
def split_file():
try:
line_count = 0
split_count = 1
max_lines = how_many_lines_per_file()
# ch_dir()
input_file = fun.file_dir()
create_output_directory(input_file.name)
output_file = None # No output file is created at first, we need to check if there's any line if it enters the for
for line in input_file:
# Open a new output file if we don't have one open
if not output_file:
output_file = create_output_file_dir(split_count, input_file.name)
output_file.write(line)
line_count += 1
# Close the current output file if the line count has reached its max
if line_count > max_lines:
split_count += 1
line_count = 0
output_file.close()
output_file = None
# Handle errors
except Exception as e:
print(f"An unknown error occurred: {e}")
# Success message
else:
print(f"Successfully split {input_file.name} into {split_count} output files!")
# //-----------------------
if __name__ == "__main__":
split_file()
Related
I have a question about the python read files(its format as txt ). I read a file and argv[1] is once read bytes number, and then put it sotre in a list, but when I write it on another file, it is not the same as the original files. how could i fix this?
readfile:
fh = open(file_name, "rb")
imfor = fh.read(mss)
file_content = []
file_content.append(imfor)
while (imfor):
file_content.append(imfor)
imfor = fh.read(mss)
fh.close()
write File
fh = open("test1R.txt", "wb")
for currContent in file_content:
fh.write(currContent)
fh.close
It is always better to use a the with open context manager to read and write files. You also don't need to manually append the contents to a list. file.readlines() does that for you.
Here is a some code to help you with that:
from sys import argv
# we first check if the file exists
try:
print("reading file")
with open(argv[1], "r") as input_file:
file_contents = input_file.readlines() # stores the file content in an array
except FileNotFoundError:
print(f"File {argv[1]} not found")
exit(1)
# check if the user provided a name for the output file
try:
out_file = argv[2]
except IndexError:
out_file = "outfile.txt"
# write to the new file.
with open(out_file, "w") as out:
try:
out.write("\n".join(file_contents)) # converts the array back to string
print(f"Wrote to {out_file}")
except FileExistsError:
print(f"{out_file} already exists.")
exit(1)
I'm attempting to read and parse a .txt file that is continually being updated throughout the day. I want to parse only lines that have not already been consumed. These are then to be sent to a Telegram group.
At present, every time I run the script it parses everything.
selections = []
msgList = []
urr = ""
name = ""
ourLines=len(selections)
while(True):
file1 = open(r'C:\\urlt\log.txt', 'r')
Lines = file1.readlines()
file1.close()
try:
while(True):
if(ourLines==len(Lines)):
break
else:
txt = Lines[ourLines].strip()
tlist = txt.split("&")
ourLines=ourLines+1
for subtxt in tlist:
if "eventurl=" in subtxt:
a = subtxt[9:len(subtxt) - 3]
url = "www.beefandtuna.com/%23"+a.replace("%23", "/").strip('(')
#print(url)
urr = url
elif "bet=" in subtxt:
name = urllib.parse.unquote(subtxt[4:len(subtxt)])
#print(name)
selections.append(url+name)
msg = url +" " '\n' "Name: "+ name
if msg not in msgList:
post_to_telegram(msg)
msgList.append(msg)
#time.sleep(0.5)
except:
pass
Assuming the new contents are appended to the end of the file: after you finish reading the file, create a copy of the file.
The next time you read the file, seek to the location that is the length of the copy.
import os
from shutil import copyfile
in_file_loc = r'C:\\SmartBet.io Bot\placerlog.txt'
backup_file_loc = in_file_loc + ".bak"
while True:
try:
file_backup_size = os.stat(backup_file_loc).st_size
except:
file_backup_size = 0
file1 = open(in_file_loc, 'r')
# move file position to the end of the old file
file1.seek(file_backup_size)
# Read all lines in the file after the position we seek-ed to
Lines = file1.readlines()
file1.close()
# copy current version of file to backup
copyfile(in_file_loc, backup_file_loc)
# Then do whatever you want to do with Lines
This is probably not the best way to do this because, as rici said in a comment below:
"make a copy" is not an atomic operation, and as the file grows copying will be successively slower. Any data appended to the log file during the copy will never be reported. Furthermore, the copy might happen to include a partial entry, in which case the next scan will start in the middle of an entry.
An alternative is to save the size of the current file in a different one:
in_file_loc = r'C:\\SmartBet.io Bot\placerlog.txt'
size_file_loc = in_file_loc + ".lastsize"
while True:
# read old size from file
try:
with open(size_file_loc, 'r') as f:
file_size = int(f.read())
except:
# if error, file size is zero
file_size = 0
file1 = open(in_file_loc, 'r')
file1.seek(file_size)
Lines = file1.readlines()
new_file_size = file1.tell() # Get the location of the current file marker
file1.close()
# write new size to file
with open(size_file_loc, 'w') as f:
f.write(str(new_file_size))
# Then do whatever you want to do with Lines
Using this Python code I get printed lines of file in UPPERCASE but file remains unchanged (lowercase.)
def open_f():
while True:
fname=raw_input("Enter filename:")
if fname != "done":
try:
fhand=open(fname, "r+")
break
except:
print "WRONG!!!"
continue
else: exit()
return fhand
fhand=open_f()
for line in fhand:
ss=line.upper().strip()
print ss
fhand.write(ss)
fhand.close()
Can you suggest please why files remain unaffected?
Code:
def file_reader(read_from_file):
with open(read_from_file, 'r') as f:
return f.read()
def file_writer(read_from_file, write_to_file):
with open(write_to_file, 'w') as f:
f.write(file_reader(read_from_file))
Usage:
Create a file named example.txt with the following content:
Hi my name is Dmitrii Gangan.
Create an empty file called file_to_be_written_to.txt
Add this as the last line file_writer("example.txt", "file_to_be_written_to.txt") of your .py python file.
python <your_python_script.py> from the terminal.
NOTE: They all must be in the same folder.
Result:
file_to_be_written_to.txt:
Hi my name is Dmitrii Gangan.
This program should do as you requested and allows for modifying the file as it is being read. Each line is read, converted to uppercase, and then written back to the source file. Since it runs on a line-by-line basis, the most extra memory it should need would be related to the length of the longest line.
Example 1
def main():
with get_file('Enter filename: ') as file:
while True:
position = file.tell() # remember beginning of line
line = file.readline() # get the next available line
if not line: # check if at end of the file
break # program is finished at EOF
file.seek(position) # go back to the line's start
file.write(line.upper()) # write the line in uppercase
def get_file(prompt):
while True:
try: # run and catch any error
return open(input(prompt), 'r+t') # r+t = read, write, text
except EOFError: # see if user if finished
raise SystemExit() # exit the program if so
except OSError as error: # check for file problems
print(error) # report operation errors
if __name__ == '__main__':
main()
The following is similar to what you see up above but works in binary mode instead of text mode. Instead of operating on lines, it processes the file in chunks based on the given BUFFER_SIZE and can operate more efficiently. The code under the main loop may replace the code in the loop if you wish for the program to check that it is operating correctly. The assert statements check some assumptions.
Example 2
BUFFER_SIZE = 1 << 20
def main():
with get_file('Enter filename: ') as file:
while True:
position = file.tell()
buffer = file.read(BUFFER_SIZE)
if not buffer:
return
file.seek(position)
file.write(buffer.upper())
# The following code will not run but can replace the code in the loop.
start = file.tell()
buffer = file.read(BUFFER_SIZE)
if not buffer:
return
stop = file.tell()
assert file.seek(start) == start
assert file.write(buffer.upper()) == len(buffer)
assert file.tell() == stop
def get_file(prompt):
while True:
try:
return open(input(prompt), 'r+b')
except EOFError:
raise SystemExit()
except OSError as error:
print(error)
if __name__ == '__main__':
main()
I suggest the following approach:
1) Read/close the file, return the filename and content
2) Create a new file with above filename, and content with UPPERCASE
def open_f():
while True:
fname=raw_input("Enter filename:")
if fname != "done":
try:
with open(fname, "r+") as fhand:
ss = fhand.read()
break
except:
print "WRONG!!!"
continue
else: exit()
return fname, ss
fname, ss =open_f()
with open(fname, "w+") as fhand:
fhand.write(ss.upper())
Like already alluded to in comments, you cannot successively read from and write to the same file -- the first write will truncate the file, so you cannot read anything more from the handle at that point.
Fortunately, the fileinput module offers a convenient inplace mode which works exactly like you want.
import fileinput
for line in fileinput.input(somefilename, inplace=True):
print(line.upper().strip())
I am stuck why the words.txt is not showing the full grid, below is the tasks i must carry out:
write code to prompt the user for a filename, and attempt to open the file whose name is supplied. If the file cannot be opened the user should be asked to supply another filename; this should continue until a file has been successfully opened.
The file will contain on each line a row from the words grid. Write code to read, in turn, each line of the file, remove the newline character and append the resulting string to a list of strings.After the input is complete the grid should be displayed on the screen.
Below is the code i have carried out so far, any help would be appreciated:
file = input("Enter a filename: ")
try:
a = open(file)
with open(file) as a:
x = [line.strip() for line in a]
print (a)
except IOError as e:
print ("File Does Not Exist")
Note: Always avoid using variable names like file, list as they are built in python types
while True:
filename = raw_input(' filename: ')
try:
lines = [line.strip() for line in open(filename)]
print lines
break
except IOError as e:
print 'No file found'
continue
The below implementation should work:
# loop
while(True):
# don't use name 'file', it's a data type
the_file = raw_input("Enter a filename: ")
try:
with open(the_file) as a:
x = [line.strip() for line in a]
# I think you meant to print x, not a
print(x)
break
except IOError as e:
print("File Does Not Exist")
You need a while loop?
while True:
file = input("Enter a filename: ")
try:
a = open(file)
with open(file) as a:
x = [line.strip() for line in a]
print (a)
break
except IOError:
pass
This will keep asking untill a valid file is provided.
The error occurs at line 49 "fileSizeRemainingInBytes = os.path.getsize(inFile)"
inFile contains the file I want to gets size. From what I understood in the python documentation this should be correct. Can someone tell me what is the problem.
import sys, os
buffer = 1000
try:
#open file in binary mode for reading
inFile = open(sys.argv[1],"rb")
print "file name is: ", inFile.name
except IOError:
#check for IOExceptions
print "Eror opening file"
sys.exit()
else:
#create new directory for copying, create out file in new directory
if (os.path.isdir("recv")):
os.chdir("recv")
try:
outFile = open(inFile.name,"wb")
except IOError:
print "something went wrong creating the out file"
sys.exit()
else :
os.mkdir("recv")
os.chdir("recv")
try:
outFile = open(inFile.name,"wb")
except IOError:
print "something went wrong creating the out file"
sys.exit()
#loop to copy bytes to new directory
fileSizeRemainingInBytes = os.path.getsize(inFile)
print "Initial size: ", fileSizeRemainingInBytes
while fileSizeRemainingInBytes > 0 :
print fileSizeRemainingInBytes
bytesToCopy = inFile.read(buffer);
outFile.write(bytesToCopy);
inFile.close()
os.path.getsize takes a file path as an argument, not a file object. So you actually want to call os.path.getsize(inFile.name). Note that this won't give you the number of bytes remaining to copy; it'll just give you the size of the whole file every time it's evaluated. To get the number of bytes remaining, you'll have to keep track of the total number of bytes read and subtract this total from the file size.
Something like this should work:
import sys
import os
buffer = 1000
with open(sys.argv[1], "rb") as in_file:
# Make your `recv` directory as a sub-directory
# or your current directory if it doesn't already exist
if not os.path.isdir("recv"):
os.mkdir("recv")
# Create the path to the file to which you
# want to copy. When opened, you'll have a file
# with the same file name as your input file,
# but it will be in your `recv` subdirectory
out_file_path = os.path.join("recv", in_file.name)
# Read the bytes
with open(out_file_path, "wb") as out_file:
bytes_read = 0
bytes_to_read = os.path.getsize(in_file.name)
while bytes_read < bytes_to_read:
out_file.write(in_file.read(buffer))
bytes_read += min(buffer, bytes_to_read - bytes_read)
print "{} / {} bytes copied".format(bytes_read, bytes_to_read)