I have a text file in the following format:
DELIMITER1
extract me
extract me
extract me
DELIMITER2
I'd like to extract every block of "extract me" lines between DELIMITER1 and DELIMITER2 in the .txt file.
This is my current, non-performing code:
import re

def GetTheSentences(file):
    fileContents = open(file)
    start_rx = re.compile('DELIMITER')
    end_rx = re.compile('DELIMITER2')
    line_iterator = iter(fileContents)
    start = False
    for line in line_iterator:
        if re.findall(start_rx, line):
            start = True
            break
    while start:
        next_line = next(line_iterator)
        if re.findall(end_rx, next_line):
            break
        print next_line
        continue
        line_iterator.next()
Any ideas?
You can simplify this to one regular expression using re.S, the DOTALL flag.
import re

def GetTheSentences(infile):
    with open(infile) as fp:
        for result in re.findall('DELIMITER1(.*?)DELIMITER2', fp.read(), re.S):
            print(result)

# extract me
# extract me
# extract me
This also makes use of the non-greedy operator .*?, so multiple non-overlapping blocks of DELIMITER1-DELIMITER2 pairs will all be found.
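For illustration, here is a quick sketch (with a made-up input string) of what the non-greedy operator changes:

import re

text = "DELIMITER1 a DELIMITER2 x DELIMITER1 b DELIMITER2"
print(re.findall('DELIMITER1(.*?)DELIMITER2', text, re.S))  # [' a ', ' b '] - two separate blocks
print(re.findall('DELIMITER1(.*)DELIMITER2', text, re.S))   # [' a DELIMITER2 x DELIMITER1 b '] - greedy spans both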
If the delimiters are within a line:
def get_sentences(filename):
    with open(filename) as file_contents:
        d1, d2 = '.', ','  # just example delimiters
        for line in file_contents:
            i1, i2 = line.find(d1), line.find(d2)
            if -1 < i1 < i2:
                yield line[i1+1:i2]

sentences = list(get_sentences('path/to/my/file'))
If they are on their own lines:
def get_sentences(filename):
    with open(filename) as file_contents:
        d1, d2 = '.', ','  # just example delimiters
        results = []
        for line in file_contents:
            if d1 in line:
                results = []
            elif d2 in line:
                yield results
            else:
                results.append(line)

sentences = list(get_sentences('path/to/my/file'))
This should do what you want:
import re

def GetTheSentences(file):
    start_rx = re.compile('DELIMITER1')
    end_rx = re.compile('DELIMITER2')
    start = False
    output = []
    with open(file, 'r') as datafile:
        for line in datafile:
            if re.match(start_rx, line):
                start = True
            elif re.match(end_rx, line):
                start = False
            elif start:
                output.append(line)
    return output
Your previous version looks like it's supposed to be an iterator function. Do you want your output returned one item at a time? That's slightly different.
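If you do want the results one at a time, a generator variant of the loop above might look like this (just a sketch; the function name iter_sentences is made up, and it assumes the delimiters sit on their own lines):

import re

def iter_sentences(file):
    start_rx = re.compile('DELIMITER1')
    end_rx = re.compile('DELIMITER2')
    inside = False
    with open(file) as datafile:
        for line in datafile:
            if start_rx.match(line):
                inside = True
            elif end_rx.match(line):
                inside = False
            elif inside:
                yield line

for sentence in iter_sentences('myfile.txt'):
    print(sentence)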
This is a good job for list comprehensions; no regex required. The first list comprehension strips the trailing \n from each line read from the text file. The second just uses the in operator to filter out the delimiter lines.
def extract_lines(file):
    scrubbed = [x.strip('\n') for x in open(file, 'r')]
    return [x for x in scrubbed if x not in ('DELIMITER1', 'DELIMITER2')]
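Usage would just be something like the following (hypothetical path). Note that this keeps every line that is not itself a delimiter, so any text outside the DELIMITER1/DELIMITER2 blocks is kept as well.

lines = extract_lines('path/to/my/file')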
I have a string with elements one below the other
For example:
print res  (where res is the result of a regular expression match)
abc
xyz
But I want to create a list with these elements,
final output: lst = ['abc','xyz']
I tried the code below, but it's not working. Any help would be appreciated.
lst = []
res = ""
with open(filename, "rt") as myfile:
    for line in myfile:
        if pattern.search(line) != None:
            res = [sub[:-2] for sub in line.split(' ')[6:len(line)-1]]
            print res[0]
            for h in res:
                print lst.append(h)
Not sure what your original file really looks like, but the easy way is to split() on newlines:
with open(filename, 'r') as f:
    s = f.read()

data = s.split('\n')
print(data)
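As a side note, if the file ends with a newline, split('\n') leaves a trailing empty string in the list; splitlines() avoids that:

with open(filename, 'r') as f:
    data = f.read().splitlines()
print(data)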
I have a text file like this example:
example:
>chr9:128683-128744
GGATTTCTTCTTAGTTTGGATCCATTGCTGGTGAGCTAGTGGGATTTTTTGGGGGGTGTTA
>chr16:134222-134283
AGCTGGAAGCAGCGTGAATAAAACAGAATGGCCGGGACCTTAAAGGCTTTGCTTGGCCTGG
>chr16:134226-134287
GGAAGCAGCGTGGGAATCACAGAATGGACGGCCGATTAAAGGCTTTGCTTGGCCTGGATTT
>chr1:134723-134784
AAGTGATTCACCCTGCCTTTCCGACCTTCCCCAGAACAGAACACGTTGATCGTGGGCGATA
>chr16:135770-135831
GCCTGAGCAAAGGGCCTGCCCAGACAAGATTTTTTAATTGTTTAAAAACCGAATAAATGTT
This file is divided into parts, and every part has 2 rows: the 1st row starts with > (this row is called the ID) and the 2nd row is the sequence of letters.
I want to search for 2 short motifs (AATAAA and GGAC) in the sequence of letters, and if a sequence contains both motifs, I want to get the ID and sequence of that part.
The point is that AATAAA should come first and GGAC should come after it. There is a distance between them, but this distance can be 2 letters or more.
expected output:
>chr16:134222-134283
AGCTGGAAGCAGCGTGAATAAAACAGAATGGCCGGGACCTTAAAGGCTTTGCTTGGCCTGG
I am trying to do that in Python using the following code:
infile = open('infile.txt', 'r')
mot1 = 'AATAAA'
mot2 = 'GGAC'
new = []
for line in range(len(infile)):
    if not infile[line].startswith('>'):
        for match in pattern.finder(mot1) and pattern.finder(mot2):
            new.append(infile[line-1])

with open('outfile.txt', "w") as f:
    for item in new:
        f.write("%s\n" % item)
This code does not return what I want. Do you know how to fix it?
You can group each ID with its sequence, and then use re.findall:
import re
data = [i.strip('\n') for i in open('filename.txt')]
new_data = [[data[i], data[i+1]] for i in range(0, len(data), 2)]
final_result = [[a, b] for a, b in new_data if re.findall('AATAAA\w{2,}GGAC', b)]
Output:
[['>chr16:134222-134283', 'AGCTGGAAGCAGCGTGAATAAAACAGAATGGCCGGGACCTTAAAGGCTTTGCTTGGCCTGG']]
Not sure I've understood your point about the distance being 2 letters or more, or whether it is obligatory to check, but the following code gives you the desired output:
mot1 = 'AATAAA'
mot2 = 'GGAC'

with open('infile.txt', 'r') as inp:
    last_id = None
    for line in inp:
        if line.startswith('>'):
            last_id = line
        else:
            if mot1 in line and mot2 in line:
                print(last_id)
                print(line)
You can redirect the output to a file if you want.
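If the ordering and the minimum gap of two letters do need to be enforced, a small variation using str.find would work (a sketch under that assumption):

mot1 = 'AATAAA'
mot2 = 'GGAC'

with open('infile.txt', 'r') as inp:
    last_id = None
    for line in inp:
        if line.startswith('>'):
            last_id = line
        else:
            i = line.find(mot1)
            # require mot2 to start at least 2 characters after the end of mot1
            if i != -1 and line.find(mot2, i + len(mot1) + 2) != -1:
                print(last_id)
                print(line)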
You can use a regex and a dictionary comprehension:
import re

with open('test.txt', 'r') as f:
    lines = f.readlines()

data = dict(zip(lines[::2], lines[1::2]))
{k.strip(): v.strip() for k, v in data.items() if re.findall(r'AATAAA\w{2,}GGAC', v)}
Returns:
{'>chr16:134222-134283': 'AGCTGGAAGCAGCGTGAATAAAACAGAATGGCCGGGACCTTAAAGGCTTTGCTTGGCCTGG'}
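Since only a yes/no answer is needed for each sequence, re.search is arguably the more direct tool here than re.findall, e.g.:

{k.strip(): v.strip() for k, v in data.items() if re.search(r'AATAAA\w{2,}GGAC', v)}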
You may slice the irrelevant part of the string if mot1 is found in it. Here's a way to do it:
mot1 = 'AATAAA'
mot2 = 'GGAC'

with open('infile.txt', 'r') as infile:
    text = infile.readlines()

# pair each ID line with the sequence line that follows it
check = [(text[x], text[x + 1]) for x in range(0, len(text) - 1, 2)]
# keep a pair only if mot2 occurs at least 2 characters after the end of mot1
result = [(x.rstrip('\n') + '\n' + y.rstrip('\n')) for (x, y) in check
          if mot1 in y and mot2 in y[(y.find(mot1) + len(mot1) + 2):]]

with open('outfile.txt', "w") as f:
    for item in result:
        f.write("%s\n" % item)
If the file is not too big, you can read it at once, and use re.findall():
import re

with open("infile.txt") as finp:
    data = finp.read()

with open('outfile.txt', "w") as f:
    for item in re.findall(r">.+?[\r\n\f][AGTC]*?AATAAA[AGTC]{2,}GGAC[AGTC]*", data):
        f.write(item + "\n")

"""
+? and *? mean non-greedy matching;
>.+?[\r\n\f] matches a line starting with '>' followed by any characters up to the end of the line;
[AGTC]*?AATAAA matches any number of A, G, T, C characters, followed by the AATAAA pattern;
[AGTC]{2,} matches at least two A, G, T, C characters;
GGAC matches the GGAC pattern;
[AGTC]* matches the empty string or any number of A, G, T, C characters.
"""
I would like to search for strings that match a pattern in a text file and export only the matched strings
import re

k = ''
regex = re.compile(r'[a-zA-Z]{2}\d{8}')
with open(file, 'r') as f:
    for line in f:
        line = line.replace(',', '')
        line = line.replace('.', '')
        k = regex.findall(line)
        #k.append(line)
        if not k == '':
            position = True
        else:
            position = False
        if position == True:
            print(k)
Somehow my code doesn't work, it always returns the following output:
[] [] [] [] [] [] [] ['AI13933231'] [] [] [] [] []
I want the output to contain only the matched strings. Thank you!
The reason there are empty list literals [] in the output is that each of those lines exists but is either empty (containing just \n) or does not match the regex '[a-zA-Z]{2}\d{8}'. Also note that regex.findall(line) returns a list, so if the regex finds no match, it returns an empty list.
Your main error is in this check: if not k == ''. Note that k is a list, not a string.
Consider this code:
import re

k = ''
regex = re.compile(r'[a-zA-Z]{2}\d{8}')
with open("omg.txt", 'r') as f:
    for line in f:
        line = line.replace(',', '')
        line = line.replace('.', '')
        k = regex.findall(line)
        #k.append(line)
        position = False
        if str(k) != '[]':  # The `[]` is just the string representation of an empty list
            position = True
            print(k)
        else:
            position = False
Given the file below (text after # is ignored; it is not part of the file):
AZ23153133
# Empty line
AB12355342
gz # No match
XY93312344
The output would be
['AZ23153133']
['AB12355342']
['XY93312344']
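As an aside on the check in the code above: an empty list is falsy in Python, so the string comparison can be replaced by a plain truthiness test, e.g. (a minimal sketch using the same hypothetical file):

import re

regex = re.compile(r'[a-zA-Z]{2}\d{8}')
with open("omg.txt", 'r') as f:
    for line in f:
        k = regex.findall(line)
        if k:  # an empty list is falsy, so this only fires when there is a match
            print(k)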
I'm having some difficulty with writing a program in Python. I would like the program to read lines between a set of characters, reverse the order of the lines and then write them into a new file. The input is:
AN10 G17 G21 G90
N20 '2014_12_08_Banding_Test_4
N30 M3 S1B
N40G00X0.000Y0.000Z17.000
N50 G00X0.001Y0.001Z17.000
N60 G01Z0.000F3900.0
N70 G01X0.251
N80 G01X149.999
N90 G01Y0.251
N100 G01X149.749
N110 G01X149.499Z-8.169
N120 G01X148.249Z-8.173
N130 G01X146.999Z-8.183
N140 G01X145.499Z-8.201
...
N3140 G01Y0.501
So far my code is:
with open('Source.nc') as infile, open('Output.nc', 'w') as outfile:
    copy = False
    strings_A = ("G01Y", ".251")
    strings_B = ("G01Y", ".501")
    content = infile.readlines()
    for lines in content:
        lines.splitlines(1)
        if all(x in lines for x in strings_A):
            copy = True
        elif all(x in lines for x in strings_B):
            copy = False
        elif copy:
            outfile.writelines(reversed(lines))
I think I am failing to understand something about the difference between a list of lines and a multiline string. I would really appreciate some help here!
Thanks in advance, Arthur
A string has multiple lines if it contains newline characters \n.
You can think of a file as either one long string that contains newline characters:
s = infile.read()
Or you can treat it like a list of lines:
lines = infile.readlines()
If you have a multiline string you can split it into a list of lines:
lines = s.splitlines(False)
# which is basically a special form of:
lines = s.split('\n')
If you want to process a file line by line, all of the following methods are equivalent (in effect, if not in efficiency):
with open(filename, 'r') as f:
    s = f.read()
    lines = s.splitlines()
    for line in lines:
        # do something
        pass

with open(filename, 'r') as f:
    lines = f.readlines()
    for line in lines:
        # do something
        pass

# this last option is the most pythonic one,
# it uses the fact that any file object can be treated as a list of lines
with open(filename, 'r') as f:
    for line in f:
        # do something
        pass
EDIT: Now the solution to your problem:
with open('Source.nc') as infile, open('Output.nc', 'w') as outfile:
    copy = False
    strings_A = ("G01Y", ".251")
    strings_B = ("G01Y", ".501")
    target_lines = []
    for line in infile:
        if copy and all(x in line for x in strings_B):
            outfile.writelines(reversed(target_lines))
            break
        if copy:
            target_lines.append(line)
        if all(x in line for x in strings_A):
            copy = True
This will copy all lines between a line that matches all(x in line for x in strings_A) and a line that matches all(x in line for x in strings_B) into the outfile in reversed order. The identifying lines are NOT included in the output (I hope that was the intent).
The order of the if clauses is deliberate to achieve that.
Also be aware that the identification tests you use (all(x in line for x in strings_A)) work as a substring search, not a word match; again, I don't know if that was your intent.
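For example (with a made-up line), the test fires even though the Y value is not actually 0.251, because ".251" occurs inside "10.2515":

strings_A = ("G01Y", ".251")
line = "N90 G01Y10.2515"
print(all(x in line for x in strings_A))  # True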
EDIT2 In response to comment:
with open('Source.nc') as infile, open('Output.nc', 'w') as outfile:
    strings_A = ("G01Y", ".251")
    strings_B = ("G01Y", ".501")
    do_reverse = False
    lines_to_reverse = []
    for line in infile:
        if all(x in line for x in strings_B):
            do_reverse = False
            outfile.writelines(reversed(lines_to_reverse))
            outfile.write(line)
            continue
        if do_reverse:
            lines_to_reverse.append(line)
            continue
        else:
            outfile.write(line)
        if all(x in line for x in strings_A):
            do_reverse = True
            lines_to_reverse = []
I am trying to parse a large FASTA file and I am encountering out-of-memory errors. Some suggestions to improve the data handling would be appreciated. Currently the program correctly prints out the names; however, partway through the file I get a MemoryError.
Here is the generator:
def readFastaEntry( fp ):
    name = ""
    seq = ""
    for line in fp:
        if line.startswith( ">" ):
            tmp = []
            tmp.append( name )
            tmp.append( seq )
            name = line
            seq = ""
            yield tmp
        else:
            seq = seq.join( line )
and here is the caller stub; more will be added after this part works:
fp = open( sys.argv[1], 'r' )

for seq in readFastaEntry( fp ) :
    print seq[0]
For those not familiar with the FASTA format, here is an example:
>1 (PB2)
AATATATTCAATATGGAGAGAATAAAAGAACTAAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATAC
TCACCAAAACCACTGTGGACCACATGGCCATAATCAAAAAGTACACATCAGGAAGGCAAGAGAAGAACCC
TGCACTCAGGATGAAGTGGATGATG
>2 (PB1)
AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACC
ACATTTCCCTATACTGGAGACCCTCC
Each entry starts with a ">" line stating the name etc., and the next N lines are data. There is no defined end of the data other than the next line starting with a ">".
Have you considered using BioPython? It has a sequence reader that can read FASTA files. And if you are interested in coding one yourself, you can take a look at BioPython's code.
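Roughly, reading a FASTA file with Biopython's SeqIO module looks like this (a minimal sketch, assuming Biopython is installed; the filename is made up):

from Bio import SeqIO

for record in SeqIO.parse("f.fasta", "fasta"):
    print(record.id, len(record.seq))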
Edit: Code added
def read_fasta(fp):
    name, seq = None, []
    for line in fp:
        line = line.rstrip()
        if line.startswith(">"):
            if name: yield (name, ''.join(seq))
            name, seq = line, []
        else:
            seq.append(line)
    if name: yield (name, ''.join(seq))

with open('f.fasta') as fp:
    for name, seq in read_fasta(fp):
        print(name, seq)
A pyparsing parser for this format is only a few lines long. See the annotations in the following code:
data = """>1 (PB2)
AATATATTCAATATGGAGAGAATAAAAGAACTAAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATAC
TCACCAAAACCACTGTGGACCACATGGCCATAATCAAAAAGTACACATCAGGAAGGCAAGAGAAGAACCC
TGCACTCAGGATGAAGTGGATGATG
>2 (PB1)
AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACC
ACATTTCCCTATACTGGAGACCCTCC"""
from pyparsing import Word, nums, QuotedString, Combine, OneOrMore
# define some basic forms
integer = Word(nums)
key = QuotedString("(", endQuoteChar=")")
# sequences are "words" made up of the characters A, G, C, and T
# we want to match one or more of them, and have the parser combine
# them into a single string (Combine by default requires all of its
# elements to be adjacent within the input string, but we want to allow
# for the intervening end of lines, so we add adjacent=False)
sequence = Combine(OneOrMore(Word("AGCT")), adjacent=False)
# define the overall pattern to scan for - attach results names
# to each matched element
seqEntry = ">" + integer("index") + key("key") + sequence("sequence")
for seq, s, e in seqEntry.scanString(data):
    # just dump out the matched data
    print(seq.dump())
    # could also access fields as seq.index, seq.key and seq.sequence
Prints:
['>', '1', 'PB2', 'AATATATTCAATATGGAGAGAATAAAAGAACTAAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATACTCACCAAAACCACTGTGGACCACATGGCCATAATCAAAAAGTACACATCAGGAAGGCAAGAGAAGAACCCTGCACTCAGGATGAAGTGGATGATG']
- index: 1
- key: PB2
- sequence: AATATATTCAATATGGAGAGAATAAAAGAACTAAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATACTCACCAAAACCACTGTGGACCACATGGCCATAATCAAAAAGTACACATCAGGAAGGCAAGAGAAGAACCCTGCACTCAGGATGAAGTGGATGATG
['>', '2', 'PB1', 'AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACCACATTTCCCTATACTGGAGACCCTCC']
- index: 2
- key: PB1
- sequence: AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACCACATTTCCCTATACTGGAGACCCTCC
Without having a great understanding of what you are doing, I would have written the code like this:
def readFastaEntry( fp ):
    name = ""
    while True:
        line = name or fp.readline()
        if not line:
            break
        seq = []
        while True:
            name = fp.readline()
            if not name or name.startswith(">"):
                break
            else:
                seq.append(name)
        yield (line, "".join(seq))
This gathers up the data after a starting line up to the next starting line. Making seq an array means that you minimize the string joining until the last possible moment. Yielding a tuple makes more sense than a list.
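To make that point concrete, a toy comparison of the two accumulation styles (made-up data):

chunks = ["ACGT"] * 100000

# repeated concatenation re-copies the growing string on each step
# and can degrade badly on large inputs
s = ""
for c in chunks:
    s += c

# collecting the pieces and joining once at the end is the usual idiom
s = "".join(chunks)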
def read_fasta(filename):
    name = None
    seq = ""
    with open(filename) as file:
        for line in file:
            if line[0] == ">":
                if name:
                    yield (name, seq)
                name = line[1:-1].split("|")[0]
                seq = ""
            else:
                seq += line[:-1]
    if name:
        yield (name, seq)
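Usage would be along these lines (hypothetical filename):

for name, seq in read_fasta('f.fasta'):
    print(name, len(seq))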