I am trying to parse a large FASTA file and I am running into out-of-memory errors. Some suggestions to improve the data handling would be appreciated. Currently the program correctly prints out the names, but partway through the file I get a MemoryError.
Here is the generator:
def readFastaEntry( fp ):
    name = ""
    seq = ""
    for line in fp:
        if line.startswith( ">" ):
            tmp = []
            tmp.append( name )
            tmp.append( seq )
            name = line
            seq = ""
            yield tmp
        else:
            seq = seq.join( line )  # NOTE: str.join uses seq as the separator between every
                                    # character of line, so seq grows multiplicatively on each
                                    # data line -- this is what exhausts memory
and here is the caller stub (more will be added after this part works):
import sys

fp = open( sys.argv[1], 'r' )
for seq in readFastaEntry( fp ):
    print seq[0]
For those not familiar with the FASTA format, here is an example:
>1 (PB2)
AATATATTCAATATGGAGAGAATAAAAGAACTAAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATAC
TCACCAAAACCACTGTGGACCACATGGCCATAATCAAAAAGTACACATCAGGAAGGCAAGAGAAGAACCC
TGCACTCAGGATGAAGTGGATGATG
>2 (PB1)
AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACC
ACATTTCCCTATACTGGAGACCCTCC
Each entry starts with a ">" line giving the name etc., and the next N lines are data. There is no defined end of the data other than the next line starting with ">".
Have you considered using Biopython? It has a sequence reader that can read FASTA files. And if you are interested in coding one yourself, you can take a look at Biopython's code.
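For illustration, a minimal sketch of the Biopython route (the file name is a placeholder); Bio.SeqIO parses records lazily, so a large FASTA file is never held in memory all at once:

from Bio import SeqIO

# iterate records one at a time; record.id is the header name,
# record.seq is the sequence
for record in SeqIO.parse("large.fasta", "fasta"):
    print(record.id)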
Edit: Code added
def read_fasta(fp):
    name, seq = None, []
    for line in fp:
        line = line.rstrip()
        if line.startswith(">"):
            if name: yield (name, ''.join(seq))
            name, seq = line, []
        else:
            seq.append(line)
    if name: yield (name, ''.join(seq))

with open('f.fasta') as fp:
    for name, seq in read_fasta(fp):
        print(name, seq)
A pyparsing parser for this format is only a few lines long. See the annotations in the following code:
data = """>1 (PB2)
AATATATTCAATATGGAGAGAATAAAAGAACTAAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATAC
TCACCAAAACCACTGTGGACCACATGGCCATAATCAAAAAGTACACATCAGGAAGGCAAGAGAAGAACCC
TGCACTCAGGATGAAGTGGATGATG
>2 (PB1)
AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACC
ACATTTCCCTATACTGGAGACCCTCC"""
from pyparsing import Word, nums, QuotedString, Combine, OneOrMore
# define some basic forms
integer = Word(nums)
key = QuotedString("(", endQuoteChar=")")
# sequences are "words" made up of the characters A, G, C, and T
# we want to match one or more of them, and have the parser combine
# them into a single string (Combine by default requires all of its
# elements to be adjacent within the input string, but we want to allow
# for the intervening end of lines, so we add adjacent=False)
sequence = Combine(OneOrMore(Word("AGCT")), adjacent=False)
# define the overall pattern to scan for - attach results names
# to each matched element
seqEntry = ">" + integer("index") + key("key") + sequence("sequence")
for seq, s, e in seqEntry.scanString(data):
    # just dump out the matched data
    print seq.dump()
    # could also access fields as seq.index, seq.key and seq.sequence
Prints:
['>', '1', 'PB2', 'AATATATTCAATATGGAGAGAATAAAAGAACTAAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATACTCACCAAAACCACTGTGGACCACATGGCCATAATCAAAAAGTACACATCAGGAAGGCAAGAGAAGAACCCTGCACTCAGGATGAAGTGGATGATG']
- index: 1
- key: PB2
- sequence: AATATATTCAATATGGAGAGAATAAAAGAACTAAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATACTCACCAAAACCACTGTGGACCACATGGCCATAATCAAAAAGTACACATCAGGAAGGCAAGAGAAGAACCCTGCACTCAGGATGAAGTGGATGATG
['>', '2', 'PB1', 'AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACCACATTTCCCTATACTGGAGACCCTCC']
- index: 2
- key: PB1
- sequence: AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACCACATTTCCCTATACTGGAGACCCTCC
Without having a great understanding of what you are doing, I would have written the code like this:
def readFastaEntry( fp ):
    name = ""
    while True:
        line = name or fp.readline()
        if not line:
            break
        seq = []
        while True:
            name = fp.readline()
            if not name or name.startswith(">"):
                break
            else:
                seq.append(name)
        yield (line, "".join(seq))
This gathers up the data after a starting line, up to the next starting line. Making seq a list means the string joining is deferred until the last possible moment. Yielding a tuple makes more sense than a list.
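A quick usage sketch for the generator above, mirroring the original caller stub (the file name comes from the command line, as in the question):

import sys

# assumes the readFastaEntry generator defined above
with open(sys.argv[1], "r") as fp:
    for header, sequence in readFastaEntry(fp):
        print(header.rstrip())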
def read_fasta(filename):
    name = None
    with open(filename) as file:
        for line in file:
            if line[0] == ">":
                if name:
                    yield (name, seq)
                name = line[1:-1].split("|")[0]
                seq = ""
            else:
                seq += line[:-1]
        if name:
            yield (name, seq)
I'm trying to append the multiple lines before a '>' character into one item so I can convert it to a value in a dictionary. For example, I'm trying to make:
> 1
AAA
CCC
> 2
become AAACCC.
The code is below:
def parse_fasta(path):
    with open(path) as thefile:
        label = []
        sequences = []
        for k, line in enumerate(thefile):
            if line.startswith('>'):
                labeler = line.strip('>').strip('\n')
                label.append(labeler)
            else:
                seqfix = ''.join(line.strip('\n'))
                sequences.append(seqfix)
        dict_version = {k: v for k, v in zip(label, sequences)}
        return dict_version

parse_fasta('small.fasta')
You can create the dictionary as you go. Here is a method for doing that.
EDIT: removed defaultdict (so no modules)
from pprint import pprint

dict_version = {}
with open('fasta_sample.txt', 'r') as f:
    for line in f:
        line = line.rstrip()
        if line.startswith('>'):
            key = line[1:]
        else:
            if key in dict_version:
                dict_version[key] += line
            else:
                dict_version[key] = line
pprint(dict_version)
The sample file:
>1FN3:A|PDBID|CHAIN|SEQUENCE
VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNAL
SALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR
>5OKT:A|PDBID|CHAIN|SEQUENCE
MGSSHHHHHHSSGLVPRGSHMELRVGNRYRLGRKIGSGSFGDIYLGTDIAAGEEVAIKLECVKTKHPQLHIESKIYKMMQ
GGVGIPTIRWCGAEGDYNVMVMELLGPSLEDLFNFCSRKFSLKTVLLLADQMISRIEYIHSKNFIHRDVKPDNFLMGLGK
KGNLVYIIDFGLAKKYRDARTHQHIPYRENKNLTGTARYASINTHLGIEQSRRDDLESLGYVLMYFNLGSLPWQGLKAAT
KRQKYERISEKKMSTPIEVLCKGYPSEFATYLNFCRSLRFDDKPDYSYLRQLFRNLFHRQGFSYDYVFDWNMLK*
>2PAB:A|PDBID|CHAIN|SEQUENCE
GPTGTGESKCPLMVKVLDAVRGSPAINVAVHVFRKAADDTWEPFASGKTSESGELHGLTTEEQFVEGIYKVEIDTKSYWK
ALGISPFHEHAEVVFTANDSGPRRYTIAALLSPYSYSTTAVVTNPKE*
>3IDP:B|PDBID|CHAIN|SEQUENCE
HHHHHHDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDVAVKMLNVTAPTPQQLQAFKNEVGVLRK
TRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHHLHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHED
LTVKIGDFGLATEKSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNINNRDQIIF
MVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARSLPKIHRS
>4QUD:A|PDBID|CHAIN|SEQUENCE
MENTENSVDSKSIKNLEPKIIHGSESMDSGISLDNSYKMDYPEMGLCIIINNKNFHKSTGMTSRSGTDVDAANLRETFRN
LKYEVRNKNDLTREEIVELMRDVSKEDHSKRSSFVCVLLSHGEEGIIFGTNGPVDLKKIFNFFRGDRCRSLTGKPKLFII
QACRGTELDCGIETDSGVDDDMACHKIPVEADFLYAYSTAPGYYSWRNSKDGSWFIQSLCAMLKQYADKLEFMHILTRVN
RKVATEFESFSFDATFHAKKQIPCIVSMLTKELYFYH
Pretty print of the dictionary created is:
{'1FN3:A|PDBID|CHAIN|SEQUENCE': 'VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR',
'2PAB:A|PDBID|CHAIN|SEQUENCE': 'GPTGTGESKCPLMVKVLDAVRGSPAINVAVHVFRKAADDTWEPFASGKTSESGELHGLTTEEQFVEGIYKVEIDTKSYWKALGISPFHEHAEVVFTANDSGPRRYTIAALLSPYSYSTTAVVTNPKE*',
'3IDP:B|PDBID|CHAIN|SEQUENCE': 'HHHHHHDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDVAVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHHLHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATEKSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNINNRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARSLPKIHRS',
'4QUD:A|PDBID|CHAIN|SEQUENCE': 'MENTENSVDSKSIKNLEPKIIHGSESMDSGISLDNSYKMDYPEMGLCIIINNKNFHKSTGMTSRSGTDVDAANLRETFRNLKYEVRNKNDLTREEIVELMRDVSKEDHSKRSSFVCVLLSHGEEGIIFGTNGPVDLKKIFNFFRGDRCRSLTGKPKLFIIQACRGTELDCGIETDSGVDDDMACHKIPVEADFLYAYSTAPGYYSWRNSKDGSWFIQSLCAMLKQYADKLEFMHILTRVNRKVATEFESFSFDATFHAKKQIPCIVSMLTKELYFYH',
'5OKT:A|PDBID|CHAIN|SEQUENCE': 'MGSSHHHHHHSSGLVPRGSHMELRVGNRYRLGRKIGSGSFGDIYLGTDIAAGEEVAIKLECVKTKHPQLHIESKIYKMMQGGVGIPTIRWCGAEGDYNVMVMELLGPSLEDLFNFCSRKFSLKTVLLLADQMISRIEYIHSKNFIHRDVKPDNFLMGLGKKGNLVYIIDFGLAKKYRDARTHQHIPYRENKNLTGTARYASINTHLGIEQSRRDDLESLGYVLMYFNLGSLPWQGLKAATKRQKYERISEKKMSTPIEVLCKGYPSEFATYLNFCRSLRFDDKPDYSYLRQLFRNLFHRQGFSYDYVFDWNMLK*'}
EDIT: To make the solution work following your attempt:
from pprint import pprint

def parse_fasta(path):
    with open(path) as thefile:
        label = []
        sequences = ''
        total_seq = []
        for line in thefile:
            line = line.strip()
            if len(line) == 0:
                continue
            if line.startswith('>'):
                line = line.strip('>')
                label.append(line)
                if len(sequences) > 0:
                    total_seq.append(sequences)
                    sequences = ''
            else:
                sequences += line
        total_seq.append(sequences)
        dict_version = {k: v for k, v in zip(label, total_seq)}
        return dict_version

d = parse_fasta('fasta_sample.txt')
pprint(d)
You'll see I made some changes to get the correct output. I added a list, total_seq, to hold the accumulated sequence for each header (you didn't have this, and it was a problem in your solution). The joins in your code were not doing anything, since each value was just a single string, although you had the right idea: in the revised code the sequence lines for one header ID are accumulated into one string of FASTA characters.
I tested for blank lines and did a continue if the line was blank (len(line) == 0).
There is a test, if len(sequences) > 0, to check whether any sequence lines have been seen yet, which they won't have been on the first record: the ID is seen before any sequences.
After the for loop completes, it is necessary to add the last sequence,
total_seq.append(sequences)
since every sequence except the last is added to total_seq when a new ID is detected.
I hope this explanation is helpful, as it more closely follows your code.
I have a text file like this example:
>chr9:128683-128744
GGATTTCTTCTTAGTTTGGATCCATTGCTGGTGAGCTAGTGGGATTTTTTGGGGGGTGTTA
>chr16:134222-134283
AGCTGGAAGCAGCGTGAATAAAACAGAATGGCCGGGACCTTAAAGGCTTTGCTTGGCCTGG
>chr16:134226-134287
GGAAGCAGCGTGGGAATCACAGAATGGACGGCCGATTAAAGGCTTTGCTTGGCCTGGATTT
>chr1:134723-134784
AAGTGATTCACCCTGCCTTTCCGACCTTCCCCAGAACAGAACACGTTGATCGTGGGCGATA
>chr16:135770-135831
GCCTGAGCAAAGGGCCTGCCCAGACAAGATTTTTTAATTGTTTAAAAACCGAATAAATGTT
This file is divided into parts, and every part has 2 rows. The 1st row starts with > (this row is called the ID) and the 2nd row is the sequence of letters.
I want to search for 2 short motifs (AATAAA and GGAC) in the sequence of letters, and if a sequence contains these motifs, I want to get the ID and sequence of that part.
But the point is that AATAAA should come 1st and GGAC after it. There is a distance between them, but this distance can be 2 letters or more.
expected output:
>chr16:134222-134283
AGCTGGAAGCAGCGTGAATAAAACAGAATGGCCGGGACCTTAAAGGCTTTGCTTGGCCTGG
I am trying to do that in Python using the following code:
infile = open('infile.txt', 'r')
mot1 = 'AATAAA'
mot2 = 'GGAC'
new = []
for line in range(len(infile)):
    if not infile[line].startswith('>'):
        for match in pattern.finder(mot1) and pattern.finder(mot2):
            new.append(infile[line-1])

with open('outfile.txt', "w") as f:
    for item in new:
        f.write("%s\n" % item)
This code does not return what I want. Do you know how to fix it?
You can group the ID with sequence, and then utilize re.findall:
import re

data = [i.strip('\n') for i in open('filename.txt')]
new_data = [[data[i], data[i+1]] for i in range(0, len(data), 2)]
final_result = [[a, b] for a, b in new_data if re.findall(r'AATAAA\w{2,}GGAC', b)]
Output:
[['>chr16:134222-134283', 'AGCTGGAAGCAGCGTGAATAAAACAGAATGGCCGGGACCTTAAAGGCTTTGCTTGGCCTGG']]
I'm not sure I got your idea about "this distance can be 2 letters or more", or whether it is obligatory to check, but the following code gives you the desired output:
mot1 = 'AATAAA'
mot2 = 'GGAC'

with open('infile.txt', 'r') as inp:
    last_id = None
    for line in inp:
        if line.startswith('>'):
            last_id = line
        else:
            if mot1 in line and mot2 in line:
                print(last_id)
                print(line)
You can redirect the output to a file if you want.
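For instance, a small variation of the same loop that writes matching records straight to a file instead of relying on shell redirection (the file names are placeholders):

mot1 = 'AATAAA'
mot2 = 'GGAC'

# same scan as above, but matching ID/sequence pairs go to outfile.txt
with open('infile.txt') as inp, open('outfile.txt', 'w') as out:
    last_id = None
    for line in inp:
        if line.startswith('>'):
            last_id = line
        elif mot1 in line and mot2 in line:
            out.write(last_id)
            out.write(line)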
You can use a regex and a dictionary comprehension:
import re

with open('test.txt', 'r') as f:
    lines = f.readlines()

data = dict(zip(lines[::2], lines[1::2]))
{k.strip(): v.strip() for k, v in data.items() if re.findall(r'AATAAA\w{2,}GGAC', v)}
Returns:
{'>chr16:134222-134283': 'AGCTGGAAGCAGCGTGAATAAAACAGAATGGCCGGGACCTTAAAGGCTTTGCTTGGCCTGG'}
You may slice off the irrelevant part of the string once mot1 is found in it. Here's a way to do it:
infile = open('infile.txt', 'r')
text = infile.readlines()
infile.close()

mot1 = 'AATAAA'
mot2 = 'GGAC'

# pair each header line with its sequence line (step by 2)
check = [(text[x], text[x+1]) for x in range(0, len(text) - 1, 2)]
result = [x.rstrip('\n') + '\n' + y.rstrip('\n') for (x, y) in check
          if mot1 in y and mot2 in y[(y.find(mot1)+len(mot1)+2):]]

with open('outfile.txt', "w") as f:
    for item in result:
        f.write("%s\n" % item)
If the file is not too big, you can read it at once, and use re.findall():
import re

with open("infile.txt") as finp:
    data = finp.read()

with open('outfile.txt', "w") as f:
    for item in re.findall(r">.+?[\r\n\f][AGTC]*?AATAAA[AGTC]{2,}GGAC[AGTC]*", data):
        f.write(item + "\n")
"""
+? and *? mean non-greedy matching;
>.+?[\r\n\f] matches a line starting with '>' and followed by any characters to the end of the line;
[AGTC]*?AATAAA matches any number of A,G,T,C characters, followed by the AATAAA pattern;
[AGTC]{2,} matches at least two or more characters of A,G,T,C;
GGAC matches the GGAC pattern;
[AGTC]* matches the empty string or any number of A,G,T,C characters.
"""
This might sound banal, but it has been a pain.
So I wrote code that parses lines. The .txt file has a line which matches my re.match and a line which doesn't.
cat file.txt
00.00.00 : Blabla
x
In this case I handle it by checking the first letter, "x".
def parser():
    path = "file.txt"
    with open(path, 'r+') as file:
        msg = {}
        list = []
        start = 0
        lines = file.readlines()
        for i in range(0, len(lines)):
            line = lines[i]
            if re.match('MY RULES', line) is not None:
                field['date'] = line[:8]
                msg['msg'] = line[start + 2:]
                print msg
            if line.startswith('x'):
                msg['msg'] += line
                list.append(msg)
    print chat
OUTPUT for 2 lines
{'date': '0.0.00', 'msg': 'BlaBla'}
{'msg': 'x'}
The problem is I can't append the second dict's msg['msg'] to the last message if the line starts with "x".
The expected output is:
{'date': '0.0.00', 'msg': 'BlaBlax'}
I tried this variant for changing the last appended entry:
else:
    list[len(list) - 1]['msg'] += line
but then I get the error:
IndexError: list index out of range
I also tried using next(infile) to peek at the next line, but then it outputs every other line.
How would you trick a nested loop to append a dict entry?
Cheers
First of all, do not use list as a name for a variable; it is a builtin and you are shadowing it.
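A quick demonstration of why that shadowing bites (hypothetical snippet):

list = ['shadowed']      # rebinds the name list in this scope

# any later use of the builtin constructor now fails:
chars = list('abc')      # TypeError: 'list' object is not callable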
Secondly, if I understand correctly, you would like to append to the last result.
Here:
if re.match('MY RULES', line) is not None:
    field['date'] = line[:8]
    msg['msg'] = line[start + 2:]
    print msg
if line.startswith('x'):
    msg['msg'] += line
You are analyzing the same line, and this msg['msg'] = line[start + 2:] in the next iteration overwrites the key 'msg' in the dictionary msg and clears the previous value. So this code
field['date'] = line[:8]
msg['msg'] = line[start + 2:]
print msg
always gets executed, even for a bare x line in your input file, and clears the previous value under the key 'msg'.
If you would like it to work you need an if/else, although I would recommend storing intermediate values in a different way than in a locally scoped variable.
Full example with code fix:
def parser():
    path = "file.txt"
    with open(path, 'r+') as file:
        msg = {}
        chat = []
        start = 0
        lines = file.readlines()
        for i in range(0, len(lines)):
            line = lines[i]
            if True:  # stand-in for your re.match('MY RULES', line) check
                if line.startswith('x'):
                    msg['msg'] += line
                else:
                    msg['date'] = line[:8]
                    msg['msg'] = line[12:]
                    chat.append(msg)
    print(chat)

parser()
Result:
[{'date': '00.00.00', 'msg': 'Blabla\nx'}]
Assuming that the test if re.match('MY RULES', line) is not None: is True for all the lines in the file, that is:
00.00.00 : Blabla
x
How about this:
path = "file.txt"
with open (path, 'r') as f:
msg = dict()
for line in f.readlines():
if line[0].isdigit():
tmp = line.split(':')
date = tmp[0].strip()
msg[date] = ' '.join(*[x.split() for x in tmp[1:]])
else:
msg[date] += ' ' + ' '.join(*[line.split()])
We go line by line; if the first letter of the line is a digit, we assume it is a date and add it to our dict - otherwise we append the string found to the last dict entry we made. str.split() makes sure you get rid of all the different whitespace characters.
You can for sure replace the if statement in the for loop with your regex... The issue I see with your implementation in general is that as soon as the input varies slightly (e.g. more whitespace characters than intended) your solution produces faulty results. Basic Python string manipulations are really powerful ;)
Update
This should produce the right output:
*file.txt*
00.00.00 : Blabla
x
00.00.00 : Blabla2
x2
path = "file.txt"
with open (path, 'r') as f:
lst = list()
for line in f.readlines():
if line[0].isdigit():
tmp = line.split(':')
date = tmp[0].strip()
msg = {date: ' '.join(*[x.split() for x in tmp[1:]])}
lst.append(msg)
else:
msg[date] += ' ' + ' '.join(*[line.split()])
print(lst)
>>> [{'00.00.00': 'Blabla x'}, {'00.00.00': 'Blabla2 x2'}]
I missed the part that you want to store each pair separately in a dict and append it to a list.
I am trying to set up a system for running various statistics on a text file. In this endeavor I need to open a file in Python (v2.7.10) and read it both as lines and as a string, for the statistical functions to work.
So far I have this:
import csv, json, re
from textstat.textstat import textstat
file = "Data/Test.txt"
data = open(file, "r")
string = data.read().replace('\n', '')
lines = 0
blanklines = 0
word_list = []
cf_dict = {}
word_dict = {}
punctuations = [",", ".", "!", "?", ";", ":"]
sentences = 0
This sets up the file and the preliminary variables. At this point, print textstat.syllable_count(string) returns a number. Further, I have:
for line in data:
    lines += 1
    if line.startswith('\n'):
        blanklines += 1
    word_list.extend(line.split())
    for char in line.lower():
        cf_dict[char] = cf_dict.get(char, 0) + 1

for word in word_list:
    lastchar = word[-1]
    if lastchar in punctuations:
        word = word.rstrip(lastchar)
    word = word.lower()
    word_dict[word] = word_dict.get(word, 0) + 1

for key in cf_dict.keys():
    if key in '.!?':
        sentences += cf_dict[key]

number_words = len(word_list)
num = float(number_words)
avg_wordsize = len(''.join([k*v for k, v in word_dict.items()]))/num
mcw = sorted([(v, k) for k, v in word_dict.items()], reverse=True)

print( "Total lines: %d" % lines )
print( "Blank lines: %d" % blanklines )
print( "Sentences: %d" % sentences )
print( "Words: %d" % number_words )
print('-' * 30)
print( "Average word length: %0.2f" % avg_wordsize )
print( "30 most common words: %s" % mcw[:30] )
But this fails: the line avg_wordsize = len(''.join([k*v for k, v in word_dict.items()]))/num raises ZeroDivisionError: float division by zero. However, if I comment out string = data.read().replace('\n', '') in the first piece of code, I can run the second piece without problems and get the expected output.
Basically, how do I set this up so that I can run the second piece of code on data, as well as textstat on string?
The call to data.read() places the file pointer at the end of the file, so you don't have anything more to read at that point. You either have to close and reopen the file or, more simply, reset the pointer to the beginning using data.seek(0).
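A minimal sketch of that fix applied to the setup from the question (path and variable names as in the question):

data = open("Data/Test.txt", "r")
string = data.read().replace('\n', '')  # first pass consumes the whole file

data.seek(0)   # rewind so the line-by-line pass sees the file again
for line in data:
    pass       # ... per-line statistics as in the question

data.close()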
First, see the line:
string = data.read().replace('\n', '')
You are reading from data once. Now the cursor is at the end of data.
Then see the line:
for line in data:
You are trying to read it again, but you can't, because there is nothing else in data; you are at the end of it. So len(word_list) returns 0.
You are dividing by it and getting the error:
ZeroDivisionError: float division by zero.
But when you comment out the first read, you are reading only once, which is valid, so the second portion of your code works.
Clear now?
So, what to do now?
Use data.seek(0) after data.read().
Demo:
>>> a = open('file.txt')
>>> a.read()
#output
>>> a.read()
#nothing
>>> a.seek(0)
>>> a.read()
#output again
Here is a simple fix. Replace the line for line in data: with:
data.seek(0)
for line in data.readlines():
    ...
It basically points back to the beginning of the file and reads it again line by line.
While this should work, you may want to simplify the code and read the file only once. Something like:
with open(file, "r") as fin:
    lines = fin.readlines()
string = ''.join(lines).replace('\n', '')
I have a text file in the following format:
DELIMITER1
extract me
extract me
extract me
DELIMITER2
I'd like to extract every block of extract me lines between DELIMITER1 and DELIMITER2 in the .txt file.
This is my current, non-working code:
import re

def GetTheSentences(file):
    fileContents = open(file)
    start_rx = re.compile('DELIMITER')
    end_rx = re.compile('DELIMITER2')
    line_iterator = iter(fileContents)
    start = False
    for line in line_iterator:
        if re.findall(start_rx, line):
            start = True
            break
    while start:
        next_line = next(line_iterator)
        if re.findall(end_rx, next_line):
            break
        print next_line
        continue
        line_iterator.next()
Any ideas?
You can simplify this to one regular expression using re.S, the DOTALL flag.
import re

def GetTheSentences(infile):
    with open(infile) as fp:
        for result in re.findall('DELIMITER1(.*?)DELIMITER2', fp.read(), re.S):
            print result

# extract me
# extract me
# extract me
This also makes use of the non-greedy operator .*?, so multiple non-overlapping blocks of DELIMITER1-DELIMITER2 pairs will all be found.
If the delimiters are within a line:
def get_sentences(filename):
    with open(filename) as file_contents:
        d1, d2 = '.', ','  # just example delimiters
        for line in file_contents:
            i1, i2 = line.find(d1), line.find(d2)
            if -1 < i1 < i2:
                yield line[i1+1:i2]

sentences = list(get_sentences('path/to/my/file'))
If they are on their own lines:
def get_sentences(filename):
    with open(filename) as file_contents:
        d1, d2 = '.', ','  # just example delimiters
        results = []
        for line in file_contents:
            if d1 in line:
                results = []
            elif d2 in line:
                yield results
                results = []  # reset so later lines don't mutate the list just yielded
            else:
                results.append(line)

sentences = list(get_sentences('path/to/my/file'))
This should do what you want:
import re

def GetTheSentences(file):
    start_rx = re.compile('DELIMITER1')  # must not also match 'DELIMITER2'
    end_rx = re.compile('DELIMITER2')
    start = False
    output = []
    with open(file, 'rb') as datafile:
        for line in datafile.readlines():
            if re.match(start_rx, line):
                start = True
                continue  # skip the delimiter line itself
            elif re.match(end_rx, line):
                start = False
            if start:
                output.append(line)
    return output
Your previous version looks like it's supposed to be an iterator function. Do you want your output returned one item at a time? That's slightly different.
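If one item at a time is the goal, here is a sketch of the same logic as a generator (same DELIMITER1/DELIMITER2 assumption; iter_sentences is a made-up name):

def iter_sentences(path):
    # yield each line found between DELIMITER1 and DELIMITER2, one at a time
    inside = False
    with open(path) as datafile:
        for line in datafile:
            if line.startswith('DELIMITER1'):
                inside = True
            elif line.startswith('DELIMITER2'):
                inside = False
            elif inside:
                yield line.rstrip('\n')

Usage would then be: for sentence in iter_sentences('file.txt'): print(sentence).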
This is a good job for list comprehensions; no regex required. The first list comp scrubs the typical \n from the list of lines you get when opening a txt file. The second just uses the in operator to drop the delimiter lines.
def extract_lines(file):
    scrubbed = [x.strip('\n') for x in open(file, 'r')]
    return [x for x in scrubbed if x not in ('DELIMITER1', 'DELIMITER2')]