Split large text file(around 50GB) into multiple files - python

I would like to split a large text file around size of 50GB into multiple files.
Data in the files are like this-[x= any integer between 0-9]
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
...............
...............
There might be few billions of lines in the file and i would like write for example 30/40 millions per file.
I guess the steps would be-
I've to open the file
then using readline() have to read the file line by line and write at the same time to a new file
and as soon as it hits the maximum number of lines it will create another file and
starts writing again.
I'm wondering, how to put all these steps together in a memory efficient and faster way. I've seen some examples in stack but none of them totally helping what i exactly need. I would really appreciate if anyone could help me out.

This working solution uses split command available in shell. Since the author has already accepted a possibility of a non-python solution, please do not downvote.
First, I created a test file with 1000M entries (15 GB) with
awk 'BEGIN{for (i = 0; i < 1000000000; i++) {print "123.123.123.123"} }' > t.txt
Then I used split:
split --lines=30000000 --numeric-suffixes --suffix-length=2 t.txt t
It took 5 min to produce a set of 34 small files with names t00-t33. 33 files are 458 MB each and the last t33 is 153 MB.

from itertools import chain, islice
def chunks(iterable, n):
"chunks(ABCDE,2) => AB CD E"
iterable = iter(iterable)
while True:
# store one line in memory,
# chain it to an iterator on the rest of the chunk
yield chain([next(iterable)], islice(iterable, n-1))
l = 30*10**6
file_large = 'large_file.txt'
with open(file_large) as bigfile:
for i, lines in enumerate(chunks(bigfile, l)):
file_split = '{}.{}'.format(file_large, i)
with open(file_split, 'w') as f:
f.writelines(lines)

I would use the Unix utility split, if it is available to you and your only task is to split the file. Here is however a pure Python solution:
import contextlib
file_large = 'large_file.txt'
l = 30*10**6 # lines per split file
with contextlib.ExitStack() as stack:
fd_in = stack.enter_context(open(file_large))
for i, line in enumerate(fd_in):
if not i % l:
file_split = '{}.{}'.format(file_large, i//l)
fd_out = stack.enter_context(open(file_split, 'w'))
fd_out.write('{}\n'.format(line))
If all of your lines have 4 3-digit numbers on them and you have multiple cores available, then you can exploit file seek and run multiple processes.

This class may solve your problem.
I've tested it on Linux and Windows operating system, and it's worked perfectly on both of them.
Also, I've tested binary and text file with different sizes each time and it was great.
Enjoy :)
import os
import math
class FileSpliter:
# If file type is text then CHUNK_SIZE is count of chars
# If file type is binary then CHUNK_SIZE is count of bytes
def __init__(self, InputFile, FileType="b", CHUNK_SIZE=524288, OutFile="outFile"):
self.CHUNK_SIZE = CHUNK_SIZE # byte or char
self.InputFile = InputFile
self.FileType = FileType # b: binary, t: text
self.OutFile = OutFile
self.FileSize = 0
self.Parts = None
self.CurrentPartNo = 0
self.Progress = 0.0
def Prepare(self):
if not(os.path.isfile(self.InputFile) and os.path.getsize(self.InputFile) > 0):
print("ERROR: The file is not exists or empty!")
return False
self.FileSize = os.path.getsize(self.InputFile)
if self.CHUNK_SIZE >= self.FileSize:
self.Parts = 1
else:
self.Parts = math.ceil(self.FileSize / self.CHUNK_SIZE)
return True
def Split(self):
if self.FileSize == 0 or self.Parts == None:
print("ERROR: File is not prepared for split!")
return False
with open(self.InputFile, "r" + self.FileType) as f:
while True:
if self.FileType == "b":
buf = bytearray(f.read(self.CHUNK_SIZE))
elif self.FileType == "t":
buf = f.read(self.CHUNK_SIZE)
else:
print("ERROR: File type error!")
if not buf:
# we've read the entire file in, so we're done.
break
of = self.OutFile + str(self.CurrentPartNo)
outFile = open(of, "w" + self.FileType)
outFile.write(buf)
outFile.close()
self.CurrentPartNo += 1
self.ProgressBar()
return True
def Rebuild(self):
self.CurrentPartNo = 0
if self.Parts == None:
return False
with open(self.OutFile, "w" + self.FileType) as f:
while self.CurrentPartNo < self.Parts:
If = self.OutFile + str(self.CurrentPartNo)
if not(os.path.isfile(If) and os.path.getsize(If) > 0):
print("ERROR: The file [" + If + "] is not exists or empty!")
return False
InputFile = open(If, "r" + self.FileType)
buf = InputFile.read()
if not buf:
# we've read the entire file in, so we're done.
break
f.write(buf)
InputFile.close()
os.remove(If)
self.CurrentPartNo += 1
self.ProgressBar()
return True
def ProgressBar(self, BarLength=20, ProgressIcon="#", BarIcon="-"):
try:
# You can't have a progress bar with zero or negative length.
if BarLength <1:
BarLength = 20
# Use status variable for going to the next line after progress completion.
Status = ""
# Calcuting progress between 0 and 1 for percentage.
self.Progress = float(self.CurrentPartNo) / float(self.Parts)
# Doing this conditions at final progressing.
if self.Progress >= 1.:
self.Progress = 1
Status = "\r\n" # Going to the next line
# Calculating how many places should be filled
Block = int(round(BarLength * self.Progress))
# Show this
Bar = "\r[{}] {:.0f}% {}".format(ProgressIcon * Block + BarIcon * (BarLength - Block), round(self.Progress * 100, 0), Status)
print(Bar, end="")
except:
print("\rERROR")
def main():
fp = FileSpliter(InputFile="inFile", FileType="b") #, CHUNK_SIZE=300000)
if fp.Prepare():
# Spliting ...
print("Spliting ...")
sr = fp.Split()
if sr == True:
print("The file splited successfully.")
print()
# Rebuilding ...
print("Rebuilding ...")
rr = fp.Rebuild()
if rr == True:
print("The file rebuilded successfully.")
if __name__ == "__main__":
main()

I am writing a Python3 code solution which I usually use to split files having size in MBs.
However, I have not yet tried for files having size in GBs.
TextFileSplitter.py
import traceback
#get a file name to be read
fileToRead = input("Enter file name : ")
# max lines you want to write in a single file
fileLineCount = 2000
lineCount = 0
fileCount = 1
try:
print('Start splitting...')
#read a file
fileReader = open(fileToRead)
line = fileReader.readline()
fileWriter = open(str(fileCount)+".txt","a")
while line != '':#empty is EOF
if lineCount == 0:
#create a file in append mode
fileWriter = open(str(fileCount)+".txt","a")
#increment file count, use it for new file name
fileCount += 1
#write a line
fileWriter.write(line+"\n")
lineCount += 1
if lineCount == fileLineCount:
lineCount = 0
fileWriter.close()
#read a line
line = fileReader.readline()
fileWriter.close()
except Exception as e:
#print the exception if any
print(e.__traceback__)
traceback.print_exc()
finally:
#close the file reader
fileReader.close()
o/p will look like, files, each having fileLineCount(i.e. 2000) lines, created in a same directory as :
1.txt
2.txt
3.txt
.
.
.
.
n.txt

Related

python3: Read huge file (~ 800 Gb), split the lines by condition, and append them to the end of new files in a memory-efficient way

I’m learning python 3, and I’m dealing with a huge txt file (~800Gb).
The enclosed function 'kmers_dic' while it read the main file, if the condition in if statement is satisfied, it should append the line in the previously created files (these files are 1024 and they are named with content of the kmers variable). The function work fine with a subset of the principal file, but when I run the code using the main file, my job is killed because I reached a memory usage limit.
def OpenFiles(i):
'''
A switch to handle file opening and reduce duplicated code
'''
open_method = {
"gz": gzip.open,
"norm": open
}
return open_method[i]
def rows(f, chunksize=102400, sep='\n'):
"""
Read a file where the row separator is '\n' lazily.
Default chunk size: 102400kB 100Mb.
Usage:
>>> with open('big.csv') as f:
>>> for r in rows(f):
>>> process(r)
"""
curr_row = ''
while True:
chunk = f.read(chunksize)
if chunk == '': # End of file
break
while True:
i = chunk.find(sep)
if i == -1:
break
yield curr_row + chunk[:i]
curr_row = ''
chunk = chunk[i+1:]
curr_row += chunk
def kmers_dic(input_file,kmers,out_dir):
'''
file writing by kmers
'''
#kmers_dic = set()
count_line=0
count_line_1=0
if input_file.endswith('.gz'):
nano_read = OpenFiles('gz')
else:
nano_read = OpenFiles('norm')
with nano_read(input_file, 'rt') as nano_f:
chunk = rows(nano_f,chunksize=2024,sep='\n')
for line in chunk:
count_line+=1
count_line_1+=1
sys.stdout.write('%s\r' % count_line)
sys.stdout.flush()
line = line.strip('\n')
line = line.split()
if line[2] in kmers:
kmer = line[2]
Out_f_name = out_dir+line[2]+'.lib'
file1 = open(Out_f_name, 'a')
##file1.write('\t'.join(line) + '\n') # print entire line
file1.write('\t'.join(line[1:4:]+line[6:9:]+line[9:13:]+line[15:]) + '\n')
file1.close()
print("lines: ",count_line_1)
I'm not understanding where is the issue.
Can you help me ?
Thanks in advance!
Best.
curr_row += chunk causes you keep all chunks in memory until you run out of free memory.

Python: Separating txt file to multiple files using a reoccuring symbol

I have a .txt file of amino acids separated by ">node" like this:
Filename.txt :
>NODE_1
MSETLVLTRPDDWHVHLRDGAALQSVVPYTARQFARAIAMPNLKPPITTAEQAQAYRERI
KFFLGTDSAPHASVMKENSVCGAGCFTALSALELYAEAFEAAGALDKLEAFASFHGADFY
GLPRNTTQVTLRKTEWTLPESVPFGEAAQLKPLRGGEALRWKLD*
>NODE_2
MSTWHKVQGRPKAQARRPGRKSKDDFVTRVEHDAKNDALLQLVRAEWAMLRSDIATFRGD
MVERFGKVEGEITGIKGQIDGLKGEMQGVKGEVEGLRGSLTTTQWVVGTAMALLAVVTQV
PSIISAYRFPPAGSSAFPAPGSLPTVPGSPASAASAP*
I want to separate this file into two (or as many as there are nodes) files;
Filename1.txt :
>NODE
MSETLVLTRPDDWHVHLRDGAALQSVVPYTARQFARAIAMPNLKPPITTAEQAQAYRERI
KFFLGTDSAPHASVMKENSVCGAGCFTALSALELYAEAFEAAGALDKLEAFASFHGADFY
GLPRNTTQVTLRKTEWTLPESVPFGEAAQLKPLRGGEALRWKLD*
Filename2.txt :
>NODE
MSTWHKVQGRPKAQARRPGRKSKDDFVTRVEHDAKNDALLQLVRAEWAMLRSDIATFRGD
MVERFGKVEGEITGIKGQIDGLKGEMQGVKGEVEGLRGSLTTTQWVVGTAMALLAVVTQV
PSIISAYRFPPAGSSAFPAPGSLPTVPGSPASAASAP*
With a number after the filename
This code works, however it deletes the ">NODE" line and does not create a file for the last node (the one without a '>' afterwards).
with open('FilePathway') as fo:
op = ''
start = 0
cntr = 1
for x in fo.read().split("\n"):
if x.startswith('>'):
if start == 1:
with open (str(cntr) + '.fasta','w') as opf:
opf.write(op)
opf.close()
op = ''
cntr += 1
else:
start = 1
else:
if op == '':
op = x
else:
op = op + '\n' + x
fo.close()
I can´t seem to find the mistake. Would be thankful if you could point it out to me.
Thank you for your help!
Hi again! Thank you for all the comments. With your help, I managed to get it to work perfectly. For anyone with similar problems, this is my final code:
import os
import glob
folder_path = 'FilePathway'
for filename in glob.glob(os.path.join(folder_path, '*.fasta')):
with open(filename) as fo:
for line in fo.readlines():
if line.startswith('>'):
original = line
content = [original]
fileno = 1
filename = filename
y = filename.replace(".fasta","_")
def writefasta():
global content, fileno
if len(content) > 1:
with open(f'{y}{fileno}.fasta', 'w') as fout:
fout.write(''.join(content))
content = [line]
fileno += 1
with open('FilePathway') as fin:
for line in fin:
if line.startswith('>NODE'):
writefasta()
else:
content.append(line)
writefasta()
You could do it like this:
def writefasta(d):
if len(d['content']) > 1:
with open(f'Filename{d["fileno"]}.fasta', 'w') as fout:
fout.write(''.join(d['content']))
d['content'] = ['>NODE\n']
d['fileno'] += 1
with open('test.fasta') as fin:
D = {'content': ['>NODE\n'], 'fileno': 1}
for line in fin:
if line.startswith('>NODE'):
writefasta(D)
else:
D['content'].append(line)
writefasta(D)
This would be better way. It is going to write only on odd iterations. So that, ">NODE" will be skipped and files will be created only for the real content.
with open('filename.txt') as fo:
cntr=1
for i,content in enumerate(fo.read().split("\n")):
if i%2 == 1:
with open (str(cntr) + '.txt','w') as opf:
opf.write(content)
cntr += 1
By the way, since you are using context manager, you dont need to close the file.
Context managers allow you to allocate and release resources precisely
when you want to. It opens the file, writes some data to it and then
closes it.
Please check: https://book.pythontips.com/en/latest/context_managers.html
with open('FileName') as fo:
cntr = 1
for line in fo.readlines():
with open (f'{str(cntr)}.fasta','w') as opf:
opf.write(line)
opf.close()
op = ''
cntr += 1
fo.close()

Extract IP addresses from text file without using REGEX

I am trying to extract IPv4 addresses from a text file and save them as a list to a new file, however, I can not use regex to parse the file, Instead, I have check the characters individually. Not really sure where to start with that, everything I find seems to have import re as the first line.
So far this is what I have,
#Opens and prints wireShark txt file
fileObject = open("wireShark.txt", "r")
data = fileObject.read()
print(data)
#Save IP adresses to new file
with open('wireShark.txt') as fin, open('IPAdressess.txt', 'wt') as fout:
list(fout.write(line) for line in fin if line.rstrip())
#Opens and prints IPAdressess txt file
fileObject = open("IPAdressess.txt", "r")
data = fileObject.read()
print(data)
#Close Files
fin.close()
fout.close()
So I open the file, and I have created the file that I will put the extracted IP's in, I just don't know ow to pull them without using REGEX.
Thanks for the help.
Here is a possible solution.
The function find_first_digit, position the index at the next digit in the text if any and return True. Else return False
The functions get_dot and get_num read a number/dot and, lets the index at the position just after the number/dot and return the number/dot as str. If one of those functions fails to get the number/dot raise an MissMatch exception.
In the main loop, find a digit, save the index and then try to get an ip.
If sucess, write it to output file.
If any of the called functions raises a MissMatch exception, set the current index to the saved index plus one and start over.
class MissMatch(Exception):pass
INPUT_FILE_NAME = 'text'
OUTPUT_FILE_NAME = 'ip_list'
def find_first_digit():
while True:
c = input_file.read(1)
if not c: # EOF found!
return False
elif c.isdigit():
input_file.seek(input_file.tell() - 1)
return True
def get_num():
num = input_file.read(1) # 1st digit
if not num.isdigit():
raise MissMatch
if num != '0':
for i in range(2): # 2nd 3th digits
c = input_file.read(1)
if c.isdigit():
num += c
else:
input_file.seek(input_file.tell() - 1)
break
return num
def get_dot():
if input_file.read(1) == '.':
return '.'
else:
raise MissMatch
with open(INPUT_FILE_NAME) as input_file, open(OUTPUT_FILE_NAME, 'w') as output_file:
while True:
ip = ''
if not find_first_digit():
break
saved_position = input_file.tell()
try:
ip = get_num() + get_dot() \
+ get_num() + get_dot() \
+ get_num() + get_dot() \
+ get_num()
except MissMatch:
input_file.seek(saved_position + 1)
else:
output_file.write(ip + '\n')

Hashing wordlist with big input output files in Python 3.8

I'm a beginner in coding and am trying to build a script that takes a txt file as an input, hash it and output to another txt file containing "string:hashedstring" in each line of it. The code is working properly. The problem I am facing now is that if the input file is big, it will consume all RAM and kill it. I tried to use chunks, but couldn't figure out how to use it with multiline input and output.
Any suggestions regarding other parts of the code other than the main subject here is very welcome, since I am just starting on this. Thanks.
import argparse
import hashlib
import os
import sys
def sofia_hash(msg):
h = ""
m = hashlib.md5()
m.update(msg.encode('utf-8'))
msg_md5 = m.digest()
for i in range(8):
n = (msg_md5[2*i] + msg_md5[2*i+1]) % 0x3e
if n > 9:
if n > 35:
n += 61
else:
n += 55
else:
n += 0x30
h += chr(n)
return h
top_parser = argparse.ArgumentParser(description='Sofiamass')
top_parser.add_argument('input', action="store", type=argparse.FileType('r', encoding='utf8'), help="Set input file")
top_parser.add_argument('output', action="store", help="Set output file")
args = top_parser.parse_args()
sofiainput = args.input.read().splitlines()
a = 0
try:
while a < len(sofiainput):
target_sofiainput = sofiainput[a]
etarget_sofiainput = (target_sofiainput).encode('utf-8')
try:
sofia_pass = sofia_hash(target_sofiainput)
x = True
except KeyboardInterrupt:
print ("\n[---]exiting now[---]")
if x == True:
with open(args.output, 'a') as sofiaoutput:
sofiaoutput.write(str(target_sofiainput) + ":" + str(sofia_pass) + "\n")
elif x == False:
print('error')
a += 1
except KeyboardInterrupt:
print ("\n[---]exiting now[---]")
except AttributeError:
pass
When you open the file with the open command, it creates a object called file handler. So, when you do:
with open('filepath.txt', 'r') as f:
for line in f:
print(line)
it only keeps the current line you are using in the RAM, thus achieving your objective to use as little as RAM as possible.

Splitting a text file

I have a folder with a set of text documents. I want to split each document to two or three documents, each one should be 45-70kb.
How сan I do it? I tried:
def split_file(filename, pattern, size):
with open(filename, 'rb') as f:
for index, line in enumerate(f, start=1):
with open(pattern.format(index), 'wb') as out:
n=0
for line in chain([line], f):
out.write(line)
n += len(line)
if n >= 450000 and n <=700000:
break
if __name__ == '__main__':
split_file('folderadress', 'part_{0:03d}.txt', 20000)
but it seems to me it's completely wrong.
This uses a different approach to yours. I have set the maximum size for each file to be 1000 bytes for testing purposes:
import glob
import os
dname = './gash' # directory name
unit_size = 1000 # maximum file size
for fname in glob.iglob("%s/*" % dname):
with open(fname, 'rb') as fo:
data = True
n = 1
while data:
# read returns "" (False) on EOF
data = fo.read(unit_size)
if data:
sub_fname = fname + str(n)
with open(sub_fname, 'wb') as out:
out.write(data)
n += 1
What this might do is to split a line between files, however you do not state if this could be an issue or not.

Categories

Resources