I am trying to read a file which contains a single 2.9 GB line, with values separated by commas. This code would read the file line by line, with each print stopping at '\n':
with open('eggs.txt', 'rb') as file:
for line in file:
print(line)
How can I instead iterate over "lines" that stop at ', ' (or any other character/string)?
I don't think there is a built-in way to achieve this. You will have to use file.read(block_size) to read the file block by block, split each block at commas, and rejoin strings that go across block boundaries manually.
Note that you still might run out of memory if you don't encounter a comma for a long time. (The same problem applies to reading a file line by line, when encountering a very long line.)
Here's an example implementation:
def split_file(file, sep=",", block_size=16384):
last_fragment = ""
while True:
block = file.read(block_size)
if not block:
break
block_fragments = iter(block.split(sep))
last_fragment += next(block_fragments)
for fragment in block_fragments:
yield last_fragment
last_fragment = fragment
yield last_fragment
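For illustration, a minimal usage sketch (it assumes the file from the question, opened in text mode):
# Hedged usage sketch: iterate over comma-separated fields from 'eggs.txt'
with open('eggs.txt', 'r') as f:
    for field in split_file(f, sep=','):
        print(field)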
Using buffered reading from the file (Python 3):
buffer_size = 2**12
delimiter = ','
with open(filename, 'r') as f:
# remember the characters after the last delimiter in the previously processed chunk
remaining = ""
while True:
# read the next chunk of characters from the file
chunk = f.read(buffer_size)
# end the loop if the end of the file has been reached
if not chunk:
break
# add the remaining characters from the previous chunk,
# split according to the delimiter, and keep the remaining
# characters after the last delimiter separately
*lines, remaining = (remaining + chunk).split(delimiter)
# print the parts up to each delimiter one by one
for line in lines:
print(line, end=delimiter)
# print the characters after the last delimiter in the file
if remaining:
print(remaining, end='')
Note that, the way this is currently written, it will just print the original file's contents exactly as they were. This is easily changed, though, e.g. by changing the end=delimiter parameter passed to the print() function in the loop, as sketched below.
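For example, a sketch of the same loop adapted to print one field per line instead of reproducing the file verbatim (the filename is a placeholder):
buffer_size = 2**12
delimiter = ','
with open('eggs.txt', 'r') as f:      # placeholder filename
    remaining = ""
    while True:
        chunk = f.read(buffer_size)
        if not chunk:
            break
        *fields, remaining = (remaining + chunk).split(delimiter)
        for field in fields:
            print(field)              # default end='\n': one field per line
    if remaining:
        print(remaining)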
Read the file a character at a time, and assemble the comma-separated lines:
def commaBreak(filename):
word = ""
with open(filename) as f:
while True:
char = f.read(1)
if not char:
print("End of file")
yield word
break
elif char == ',':
yield word
word = ""
else:
word += char
You may choose to do something like this with a larger number of characters, e.g. 1000, read at a time, as sketched below.
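A hedged sketch of that buffered variant (the function name and the 1000-character default are illustrative, not from the original answer):
def commaBreakBuffered(filename, chunk_size=1000):
    word = ""
    with open(filename) as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                yield word            # last field before EOF
                break
            parts = chunk.split(',')
            word += parts[0]          # continue the field from the previous chunk
            for part in parts[1:]:
                yield word
                word = part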
with open('eggs.txt', 'rb') as file:
    for line in file:
        str_line = line.decode()  # decode the bytes; str() would give "b'...'"
        words = str_line.split(', ')
        for word in words:
            print(word)
It yields one character from the file at a time, which means there is no memory overload.
def lazy_read():
    with open('eggs.txt', 'r') as file:
        item = file.read(1)
        while item:
            if item == ',':
                return  # stop at the first comma; raising StopIteration inside a generator is no longer allowed (PEP 479)
            yield item
            item = file.read(1)

print(''.join(lazy_read()))
Related
We have a huge .csv file but it doesn't seem to really be a csv.
The line endings are \tl\n.
The text between these line endings sometimes contains "real" newline characters; we don't want to split on those.
We currently do it using awk.
awk_code = r'BEGIN{ RS="""(\tl\n)"""; FS="\t"} { print "\42"$1"\42,\42"$2"\42,\42\42\42"$3"\42\42\42,\n";}'
bash_command_awk = f"awk '{awk_code}' {input_file_path} > {output_path}"
awk_command_output = subprocess.check_output(bash_command_awk,stderr=subprocess.STDOUT, shell=True)
I'm trying to find an efficient way of doing it directly in Python and tried passing a custom newline into the .open() command.
def process_without_putting_file_in_RAM(file_to_process):
with file_to_process.open(encoding="utf-8", newline="\tl\n") as csv_file:
for line in csv.reader(csv_file):
However, I quickly learned that the newline argument only accepts one of the default characters.
How can I efficiently process this file containing the weird line ending?
Here's a function which can handle a multi-character newline between chunks correctly:
def line_splitter(file, newline, chunk_size=4096):
tail = ''
while True:
chunk = file.read(chunk_size)
if not chunk:
if tail:
yield tail
break
lines = (tail + chunk).split(newline)
tail = lines.pop(0)
if lines:
yield tail
tail = lines.pop()
yield from lines
Here is another version which, although it doesn't make copies of whole chunks, didn't prove faster; it will only be marginally faster for large chunks. Do not use a chunk_size smaller than the length of the newline string :)
def line_splitter(file, newline, chunk_size=4096):
tail = ''
while True:
chunk = file.read(chunk_size)
if not chunk:
if tail:
yield tail
break
lines = chunk.split(newline)
tail = (tail + lines[0]).split(newline)
if len(tail) > 1:
lines[0] = tail[1]
else:
del lines[0]
tail = tail[0]
if lines:
yield tail
tail = lines.pop()
yield from lines
The caller should look like this:
with longabstract_file.open() as f:
for line in line_splitter(f, "\tl\n"):
if line: # ignore blank lines
print(line)
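A hedged sketch tying this back to the original problem, where each record ends with '\tl\n' and fields are tab-separated (the file name is a placeholder):
with open('huge_not_really_a.csv', encoding='utf-8') as f:
    for record in line_splitter(f, '\tl\n'):
        if not record:
            continue                  # skip blank records
        fields = record.split('\t')   # tab-separated fields, as in the awk FS="\t"
        print(fields)                 # process the fields here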
Assuming your csv is comma or space delimited, not tab delimited, what you were looking for is the lineterminator flag; but there's no need for it, since '\n' is automatically assumed to be a line break. From the doc:
Note: The reader is hard-coded to recognise either '\r' or '\n' as
end-of-line, and ignores lineterminator. This behavior may change in
the future.
so what you can do is use the string method .replace() to get rid of '\tl', like this:
import csv

def process_without_putting_file_in_RAM(file_to_process):
    with file_to_process.open(encoding="utf-8") as csv_file:
        for line in csv.reader(csv_file, delimiter=","):
            print(line[-1].replace('\tl', ''))
Why not use pandas? Specifically, pandas.read_csv with the lineterminator and chunksize parameters:
import pandas as pd
batch_size = 10000
new_line_str = '\tl\n'
iterator_df = pd.read_csv(file_to_process, chunksize=batch_size, lineterminator=new_line_str)
for chunk in iterator_df:
# process chunk here
At the risk of losing reputation, I did not know what else to do. My file is not showing any hidden characters and I have tried every .replace() and .strip() I can think of. My file is UTF-8 encoded and I am using Python 3.6.1.
I have a file with the format:
>header1
AAAAAAAA
TTTTTTTT
CCCCCCCC
GGGGGGGG
>header2
CCCCCC
TTTTTT
GGGGGG
AAAAAA
I am trying to remove the line breaks so that each sequence becomes one continuous string. (This file is actually thousands of lines long.)
My code is redundant in the sense that I typed in everything I could think of to remove line breaks:
fref = open(ref)
for line in fref:
sequence = 0
header = 0
if line.startswith('>'):
header = ''.join(line.splitlines())
print(header)
else:
sequence = line.strip("\n").strip("\r")
sequence = line.replace('\n', ' ').replace('\r', '').replace(' ', '').replace('\t', '')
print(len(sequence))
output is:
>header1
8
8
8
8
>header2
6
6
6
6
But if I manually go in and delete the line endings to make each sequence a continuous string, it is counted as one continuous string.
Expected output:
>header1
32
>header2
24
Thanks in advance for any help,
Dennis
There are several approaches to parsing this kind of input. In all cases, I would recommend isolating the open and print side-effects outside of a function that you can unit test to convince yourself of the proper behavior.
You could iterate over each line and handle the case of empty lines and end-of-file separately. Here, I use yield statements to return the values:
def parse(infile):
for line in infile:
if line.startswith(">"):
total = 0
yield line.strip()
elif not line.strip():
yield total
else:
total += len(line.strip())
if line.strip():
yield total
def test_parse():
with open("input.txt") as infile:
assert list(parse(infile)) == [
">header1",
32,
">header2",
24,
]
Or, you could handle both empty lines and end-of-file at the same time. Here, I use an output array to which I append headers and totals:
def parse(infile):
output = []
while True:
line = infile.readline()
if line.startswith(">"):
total = 0
header = line.strip()
elif line and line.strip():
total += len(line.strip())
else:
output.append(header)
output.append(total)
if not line:
break
return output
def test_parse():
with open("input.txt") as infile:
assert parse(infile) == [
">header1",
32,
">header2",
24,
]
Or, you could also split the whole input file into empty-line-separated blocks and parse them independently. Here, I use an output stream to which I write the output; in production, you could pass the sys.stdout stream for example:
import re
def parse(infile, outfile):
content = infile.read()
for block in re.split(r"\r?\n\r?\n", content):
header, *lines = re.split(r"\s+", block)
total = sum(len(line) for line in lines)
outfile.write("{header}\n{total}\n".format(
header=header,
total=total,
))
from io import StringIO
def test_parse():
    with open("input.txt") as infile:
outfile = StringIO()
parse(infile, outfile)
outfile.seek(0)
assert outfile.readlines() == [
">header1\n",
"32\n",
">header2\n",
"24\n",
]
Note that my tests use open("input.txt") for brevity but I would actually recommend passing a StringIO(...) instance instead to see the input being tested more easily, to avoid hitting the filesystem and to make the tests faster.
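For illustration, a minimal sketch of such a StringIO-based test, written against the first parse(infile) variant above (the inline sample assumes blank-line-separated records, as the parsers above do):
from io import StringIO

SAMPLE = (
    ">header1\nAAAAAAAA\nTTTTTTTT\nCCCCCCCC\nGGGGGGGG\n"
    "\n"
    ">header2\nCCCCCC\nTTTTTT\nGGGGGG\nAAAAAA\n"
)

def test_parse_from_string():
    # StringIO behaves like an open text file, so no filesystem access is needed
    assert list(parse(StringIO(SAMPLE))) == [">header1", 32, ">header2", 24]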
From my understanding of your question, you would like something like this. Note how the sequence is built over multiple iterations of the loop, since you want to combine multiple lines:
with open(ref) as f:
    sequence = ""                      # reset sequence
    header = None
    for line in f:
        if line.startswith('>'):
            if header:
                print(header)          # print previous header
                print(len(sequence))   # print previous sequence length
            sequence = ""              # reset sequence
            header = line.rstrip()     # store header line (keeps the leading '>')
        else:
            sequence += line.rstrip()  # append line to sequence
    if header:
        print(header)                  # print the final header
        print(len(sequence))           # print the final sequence length
I am currently working on an application which requires reading all the input from a file until a certain character is encountered.
By using the code:
file=open("Questions.txt",'r')
c=file.readlines()
c=[x.strip() for x in c]
Every time a '\n' is encountered, strip removes it and the line is stored as a separate string in the list c. So every line becomes its own element of c. But I want to build list entries that run up to the point where a special character is encountered, like this:
if the input file has the contents:
1.Hai
2.Bye\-1
3.Hello
4.OAPd\-1
then I want to get a list as
c=['1.Hai\n2.Bye','3.Hello\n4.OAPd']
Please help me in doing this.
The easiest way would be to read the file in as a single string and then split it across your separator:
with open('myFileName') as myFile:
text = myFile.read()
result = text.split(separator) # use your \-1 (whatever that means) here
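For example, with the literal separator '\-1' from the question (a hedged sketch; the strip/filter just removes the stray newlines around each separator):
with open('Questions.txt') as myFile:
    text = myFile.read()

c = [part.strip() for part in text.split('\\-1') if part.strip()]
print(c)   # ['1.Hai\n2.Bye', '3.Hello\n4.OAPd']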
In case your file is very large, holding the complete contents in memory as a single string for using .split() is maybe not desirable (and then holding the complete contents in the list after the split is probably also not desirable). Then you could read it in chunks:
CHUNK_SIZE = 4096  # read this many characters at a time

def each_chunk(stream, separator):
    buffer = ''
    while True:  # until EOF
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:  # EOF?
            yield buffer
            break
        buffer += chunk
        while True:  # until no separator is found
            try:
                part, buffer = buffer.split(separator, 1)
            except ValueError:
                break
            else:
                yield part
with open('myFileName') as myFile:
for chunk in each_chunk(myFile, separator='\\-1\n'):
print(chunk) # not holding in memory, but printing chunk by chunk
I used "*" instead of "-1", I'll let you make the appropriate changes.
s = '1.Hai\n2.Bye*3.Hello\n4.OAPd*'
temp = ''
results = []
for char in s:
    if char == '*':   # compare with ==, not 'is'
        results.append(temp)
        temp = ''     # reset to an empty string, not a list
    else:
        temp += char
if len(temp) > 0:
    results.append(temp)
The following code is doing most of what I want...
All I need is for that print to actually be a return so that I can dump the data into another txt file I'm writing (f2).
(Also, the spacing obtained with print letters is not what I want but I figure I'll deal with it later.)
Every time I replace print with return it just stops reading after the first line of the initial text file (f1).
def DNA2Prot(f1, f2="translated_fasta.txt"):
fin = open(f1, 'r')
for letters in fin:
if letters[0] != ">":
seqs = letters
codons = [ ]
protein = ''
for i in range(0, len(seqs), 3):
try:
codon = seqs[i:i+3]
codons = codon_table[codon]
protein = protein+codons
except KeyError:
protein += ""
print protein
else:
print letters
fin.close()
Use yield instead and treat your function as a generator. This way the caller can do whatever they please with the proteins the DNA2Prot function generates, and the file keeps being read until it is exhausted.
def DNA2Prot(f1, f2='translated_fasta.txt'):
# prefer using `with` to `open` and `close`
with open(f1, 'r') as fin:
for letters in fin:
if letters[0] != '>':
seqs = letters
codons = [ ]
protein = ''
for i in range(0, len(seqs), 3):
# no need for a try catch, because we can use `get`
# get will return None by default if the
# specified `codon` does not appear in
# `codon_table`
codon = seqs[i:i + 3]
codons = codon_table.get(codon)
if codons:
protein += codons
yield protein
else:
yield letters
Now you have to treat the DNA2Prot function as an Iterator:
with open('/path/to/outfile', 'w') as f:
for protein in DNA2Prot(f1):
# do something with protein
print protein
First things first: when you use the return statement you are telling your code to break out (i.e. leave) of the point where the return statement is located. This means that your code will start reading from fin, move on to the second for loop and, as soon as it is done with it (i.e. has read all the letters of the line), it will reach your return statement and break out of the DNA2Prot function.
Now, there are two things you can do when it comes to files. The first is to use the print function to redirect your output to a file (not recommended); the second is to properly open the files and write into them.
With regards to the first solution (and assuming you are using Python 2.7) you can simply do:
from __future__ import print_function
and when you want to use your print statement just write:
print(protein, file=fout), where fout is a file object opened for writing (not fin, which is your input file).
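A minimal sketch of that print-redirection idea (the output filename reuses the function's default f2; 'MSTN' is just a placeholder protein string):
from __future__ import print_function   # only needed on Python 2.7

protein = 'MSTN'                         # placeholder value
with open('translated_fasta.txt', 'w') as fout:
    print(protein, file=fout)            # each print call writes a line to fout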
However, if I were you I would go for a more elegant and clean solution that doesn't rely on unnecessary imports:
def DNA2Prot(f1, f2="translated_fasta.txt"):
    # Using the "with" statement you don't need to close the file objects yourself
    with open(f1, 'r') as fin, open(f2, 'w') as fout:
        for letters in fin:
            if letters[0] != ">":
                seqs = letters
                codons = []
                protein = ''
                for i in range(0, len(seqs), 3):
                    try:
                        codon = seqs[i:i + 3]
                        codons = codon_table[codon]
                        protein = protein + codons
                    except KeyError:
                        protein += ""
                fout.write(protein)   # write your data to the second file
            else:
                fout.write(letters)
I am trying to parse some text files and need to extract blocks of text. Specifically, the line that starts with "1:" and the 19 lines after it. The "1:" does not start on the same row in each file and there is only one instance of "1:". I would prefer to save the block of text and export it to a separate file. In addition, I need to preserve the formatting of the text in the original file.
Needless to say I am new to Python. I generally work with R but these files are not really compatible with R and I have about 100 to process. Any information would be appreciated.
The code that I have so far is:
tmp = open(files[0],"r")
lines = tmp.readlines()
tmp.close()
num = 0
a=0
for line in lines:
num += 1
if "1:" in line:
a = num
break
a = num is the line number for the block of text I want. I then want to save the next 19 lines to another file, but can't figure out how to do this. Any help would be appreciated.
Here is one option: read all lines from your file, iterate until you find your line, and return the next 19 lines, wrapped here in a function (find_block), since return is only valid inside one. You would still need to handle situations where your file doesn't contain 19 additional lines.
def find_block(filename):
    fh = open(filename, 'r')
    all_lines = fh.readlines()
    fh.close()
    for count, line in enumerate(all_lines):
        if "1:" in line:
            return all_lines[count + 1:count + 20]
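A hedged usage sketch that writes the returned block to a new file, as the question asks (find_block is the function above; the file names are placeholders):
block = find_block('yourfile.txt')
if block is not None:
    with open('block.txt', 'w') as out:
        out.writelines(block)   # the lines still carry their original '\n' endings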
Could be done in a one-liner...
open(files[0]).read().split('1:', 1)[1].split('\n')[:19]
or more readable
txt = open(files[0]).read() # read the file into a big string
before, after = txt.split('1:', 1) # split the file on the first "1:"
after_lines = after.split('\n') # create lines from the after text
lines_to_save = after_lines[:19] # grab the first 19 lines after "1:"
then join the lines with a newline (and add a newline to the end) before writing it to a new file:
out_text = "1:" # add back "1:"
out_text += "\n".join(lines_to_save) # add all 19 lines with newlines between them
out_text += "\n" # add a newline at the end
open("outputfile.txt", "w").write(out_text)
to comply with best practice for reading and writing files you should also be using the with statement to ensure that the file handles are closed as soon as possible. You can create convenience functions for it:
def read_file(fname):
"Returns contents of file with name `fname`."
with open(fname) as fp:
return fp.read()
def write_file(fname, txt):
"Writes `txt` to a file named `fname`."
with open(fname, 'w') as fp:
fp.write(txt)
then you can replace the first line above with:
txt = read_file(files[0])
and the last line with:
write_file("outputfile.txt", out_text)
I always prefer to read the file into memory first, but sometimes that's not possible. If you want to use iteration then this will work:
def process_file(fname):
with open(fname) as fp:
for line in fp:
if line.startswith('1:'):
break
else:
return # no '1:' in file
yield line # yield line containing '1:'
for i, line in enumerate(fp):
if i >= 19:
break
yield line
if __name__ == "__main__":
with open('output.txt', 'w') as fp:
for line in process_file('intxt.txt'):
fp.write(line)
It's using the else: clause on a for loop, which you don't see very often anymore, but it was created for just this purpose (the else clause is executed if the for loop doesn't break); a tiny illustration follows.
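A tiny standalone illustration of that for/else behaviour:
for line in ["alpha", "beta", "gamma"]:
    if line.startswith('1:'):
        break
else:
    print("no line starting with '1:' was found")   # runs only if the loop never breaks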