Attribute error list object has no attribute strip - python

I am writing some code to tag a file, which looks at the previous line to see if there is a SCI tag, and if so, tag the current line with SCI_NXT in a fifth column (in a tab delimited file).
However, I get an AttributeError saying that I am trying to strip a list (at the line previous_line = split_line(previous_line)) as soon as the previous line held in that variable is the first one that is not a one-item line. I understand that this is because the lines are being written as lists, not as strings, but I do not understand how I might go about rectifying this. I have tried using "extend", but this resulted in the first line being written with each character as a separate element, which is also not what I am looking to do.
Here is the test text I am working on:
</s>
<s>
Diptera NP Diptera-n SCI
was VBD be-v
the DT the-x
most RBS most-a
common JJ common-j
prey NN prey-n
among IN among-i
the DT the-x
insects NNS insect-n
potentially RB potentially-a
available JJ available-j
to IN to-i
Here is the code:
"""Tags a file with NEXT_SCI in extra feature column. Reads and writes vert files.
"""
import json
#from pip._vendor.pyparsing import line
VFILE = 'test_next.vert'
def split_line(line):
"""Split a line into five parts, word, tag, lempos, ti, sci"""
# TODO: Speak to Diana about the spaces in the vert file - do they mean
# anything?
line = line.strip().split()
if len(line) == 1:
word = line[0]
pos, lempos, tag = None, None, None
elif len(line) == 3:
word, pos, lempos = line
tag = None
elif len(line) == 4:
word, pos, lempos, tag = line
return [word, pos, lempos, tag]
def tag_next_sci(lines):
"""Loops through lines of original document to add to new file (tagged)
"""
taggedlines = []
for line in lines:
taggedlines.append(tagline_next_sci(line, taggedlines))
return taggedlines
def tagline_next_sci(line, taggedlines):
"""Assigns an indicator tag to a line
"""
#<> are structural and do not need to be considered for feature tags so can be committed directly
if line.startswith('<'):
return line
#look back at previous line to see if SCI, if so tag current line
previous_line = taggedlines[-1]
previous_line = split_line(previous_line)
line = split_line(line)
#look at last column. if SCI, print line, go to next line and add tag in final column ("\t\t\tNXT_SCI\n")
if previous_line[-1] == "SCI":
if len(line) == 3:
print(line + "\t\t\tSCI_MOD\n")
return(line + "\t\t\tSCI_MOD\n")
if len(line) == 4:
print(line + "\t\tSCI_MOD\n")
return(line + "\t\tSCI_MOD\n")
return line
def read_vfile(fname):
"""Reads a vert file
"""
with open(fname, 'r') as vfile:
lines = vfile.readlines()
return lines
def write_vfile(fname, taggedlines):
"""Writes a vert file
"""
# write to file
with open(fname, 'w') as outfile:
outfile.writelines(taggedlines)
def tag_vert_sci_next(fname, fname_out):
"""Creates a new file with tags
"""
# read vertical file
lines = read_vfile(fname)
# tag file
taggedlines = tag_next_sci(lines)
# call write file
write_vfile(fname_out, taggedlines)
def main(fname, fname_out):
#call sci_next tagging
tag_vert_sci_next('test_next.vert', fname_out)
if __name__ == "__main__":
main('test_next.vert', 'zenodo_tagged_SCI_MOD.vert')
And the traceback error:
Traceback (most recent call last):
File "/home/sandra/git/trophic/tagging/tagging_NEXT.py", line 123, in <module>
main('test_next.vert', 'zenodo_tagged_SCI_MOD.vert')
File "/home/sandra/git/trophic/tagging/tagging_NEXT.py", line 120, in main
tag_vert_sci_next('test_next.vert', fname_out)
File "/home/sandra/git/trophic/tagging/tagging_NEXT.py", line 78, in tag_vert_sci_next
taggedlines = tag_next_sci(lines)
File "/home/sandra/git/trophic/tagging/tagging_NEXT.py", line 31, in tag_next_sci
taggedlines.append(tagline_next_sci(line, taggedlines))
File "/home/sandra/git/trophic/tagging/tagging_NEXT.py", line 43, in tagline_next_sci
previous_line = split_line(previous_line)
File "/home/sandra/git/trophic/tagging/tagging_NEXT.py", line 14, in split_line
line = line.strip().split()
AttributeError: 'list' object has no attribute 'strip'

Your issue seems to be that tagline_next_sci sometimes returns a list and not a string. For example, I tried putting a print inside the function to see what was going on;
...
def tagline_next_sci(line, taggedlines):
print('taggedlines', taggedlines)
"""Assigns an indicator tag to a line
"""
...
and got the output
taggedlines []
taggedlines ['</s>\n']
taggedlines ['</s>\n', '<s>\n']
taggedlines ['</s>\n', '<s>\n', ['Diptera', 'NP', 'Diptera-n', 'SCI']]
So you should check at the bottom of the function to make sure you always return a string, and maybe do a "\t".join(line) if you need to puzzle together your list to a string, with something like
return line if isinstance(line, str) else "\t".join(line)

Thank you all for your help. Here is the code I ended up with:
"""Tags a file with SCI_MOD in extra feature column. Reads and writes vert files.
"""
import json
VFILE = 'zenodotaggedWS_ALL.vert'
def split_line(line):
    """Split a vert-file line into its whitespace-separated columns.

    The result is laid out as ``[word, pos, lempos, tag]``. Columns missing
    from the line are filled with empty strings, so the list always has at
    least four entries; any columns beyond the fourth are kept at the end
    of the list rather than discarded.

    Args:
        line: One line of text, with or without its trailing newline.

    Returns:
        A list of at least four strings.
    """
    fields = line.strip().split()
    # Pad rather than branch on exact lengths: the previous 1/3/4-column
    # branches left the result names unbound for every other column count
    # (blank line, two columns, five or more), raising UnboundLocalError.
    fields.extend([""] * (4 - len(fields)))
    return fields
def tag_next_sci(lines):
    """Tag each line of the original document in order.

    Every line is handed to ``tagline_next_sci`` together with the list of
    results produced so far, and the outcome is appended to that list.

    Returns:
        The list of tagged lines, one entry per input line.
    """
    tagged = []
    for current in lines:
        outcome = tagline_next_sci(current, tagged)
        tagged.append(outcome)
    return tagged
def tagline_next_sci(line, taggedlines):
    """Return *line* as an output line, tagged with SCI_MOD when it follows SCI-n.

    Structural lines (those starting with ``<``) are returned unchanged.
    Any other line is split with ``split_line`` and re-joined with tabs; if
    the third column of the previously tagged line is ``"SCI-n"``, an extra
    ``SCI_MOD`` column is appended.

    Args:
        line: The current input line.
        taggedlines: The output lines produced so far; only the last entry
            is inspected.

    Returns:
        The output line as a string ending in a newline (structural lines
        are returned exactly as given).
    """
    # <> are structural and do not need to be considered for feature tags,
    # so they can be committed directly.
    if line.startswith('<'):
        return line
    columns = split_line(line)
    # The very first line of a file has no predecessor; treat that as
    # "previous line is not SCI-n" instead of indexing an empty list.
    previous_lempos = split_line(taggedlines[-1])[2] if taggedlines else ""
    if previous_lempos == "SCI-n":
        tagged = "\t".join(columns) + "\tSCI_MOD\n"
        print(tagged)
        return tagged
    # columns is always a list here, so it is always joined back to text.
    return "\t".join(columns) + "\n"
def read_vfile(fname):
    """Read the vert file *fname* and return its lines as a list.

    Each entry keeps its trailing newline, as produced by ``readlines``.
    """
    with open(fname, 'r') as source:
        return source.readlines()
def write_vfile(fname, taggedlines):
    """Write *taggedlines* to the vert file *fname*, replacing its contents.

    The entries are written back to back; no newlines are added.
    """
    with open(fname, 'w') as sink:
        for entry in taggedlines:
            sink.write(entry)
def tag_vert_sci_next(fname, fname_out):
    """Create a tagged copy of a vert file.

    Reads the lines of *fname*, tags them with ``tag_next_sci`` and writes
    the result to *fname_out*.
    """
    original_lines = read_vfile(fname)
    write_vfile(fname_out, tag_next_sci(original_lines))
def main(fname, fname_out):
    """Run the SCI_MOD tagging on *fname* and write the result to *fname_out*."""
    # Pass the caller's input path through; it used to be hard-coded here,
    # so the fname argument was silently ignored.
    tag_vert_sci_next(fname, fname_out)
if __name__ == "__main__":
main('zenodotaggedWS_ALL.vert', 'zenodo_tagged_SCIMOD2.vert')

Related

How Can I read a string from a txt file line by line and run it with a function?

I have a txt file named a.txt. This file has one string per line. I want to append these strings line by line to the keyword = {} dict and run my double_letter function for each line of string. How can I do it?
my double_letter function:
keyword = {}
def double_letter():
print("\nDouble Letter:\n")
idx = random.randint(0, len(keyword) - 1)
keyword = keyword[:idx] + keyword[idx] + keyword[idx:]
print(keyword)
You can open, read and print the contents of a txt file as follows:
f = open("a.txt", "r")
for line in f:
print(line)
You can add in your function for each run through the for loop, i.e. calling it during each line of the text:
f = open("a.txt", "r")
for line in f:
print(line)
double_letter()
IIUC
Code
import random
def double_letter(line):
    """Return *line* with one randomly chosen character repeated.

    Blank (falsy) input is handed back unchanged.
    """
    if not line:
        return line  # nothing to double on a blank line
    pos = random.randint(0, len(line) - 1)
    # Keep everything up to and including the chosen character, then
    # continue from that same character so it appears twice.
    return line[:pos + 1] + line[pos:]
with open("a.txt", "r") as f: # with preferred with open file
keyword = {} # setup results dictionary
for line in f:
line = line.rstrip() # remove the '\n' at the end of each line
keyword[line] = double_letter(line) # add line with it's repeat to dictionary
print(keyword)
File a.txt
Welcome
To
Stackoverflow
Output
{'Welcome': 'Welcomee', 'To': 'Too', 'Stackoverflow': 'Stackoverfloow'}

python unpacking a text file into separate variables

I have a .txt file formatted like this
60
4
20
YF
X : YF+XF+Y
Y : XF-YF-X
I need each line to be a separate variable and the last two to be broken into a key and value in a dictionary. I currently have this:
class LSystem:
def __init__(self,filename):
#complete this method
self.rules = {}
file = open(filename)
for i, line in enumerate(filename):
if i == 0:
self.angle = line
elif i == 1:
self.iteration = line
elif i == 2:
self.distance = line
elif i == 3:
self.axiom = line
elif i >= 4:
(key,val)= line.split
self.rules[key] = val
file.close()
This gives me this error:
Traceback (most recent call last):
File "lab10.py", line 65, in <module>
main()
File "lab10.py", line 10, in main
sys = lsystem.LSystem("arrowheadcurve.txt")
File "/Users/alongo/Dropbox/Freshman Fall Semester/CS 110/Labs/lab-10-fall18-antmelon/lsystem.py", line 17, in __init__
(key,val)= line.split
TypeError: cannot unpack non-iterable builtin_function_or_method object
How do you go about fixing this?
As @Carcigenicate commented, line.split does not actually call the split() function. You need to invoke it by including the parentheses:
(key,val) = line.split()
But note that split() will split on white space characters. For your input this will result in a list containing three items, and unpacking that into only two variables will also fail.
I assume that you should be splitting on the : (further assuming that : cannot be present elsewhere in the expression). Try this:
(key, val) = line.split(' : ')
I have included the surrounding spaces in the delimiter so that the leading and trailing spaces are not present in the result. If the white space is inconsistent you can handle it like this:
key, val = [s.strip() for s in line.split(':')]
Also, fix the file iteration by using the file object, not the filename string, and open it in a with statement (so that it will be guaranteed to be properly closed):
with open(filename) as f:
for i, line in enumerate(f):
line = line.strip() # remove leading and trailing white space
if i == 0:
self.angle = line
You've missed a few small things that I'm commenting on in the following code:
class LSystem:
    """L-system description loaded from a text file.

    The file is read positionally: line 1 is the angle, line 2 the
    iteration, line 3 the distance and line 4 the axiom. Every later
    non-blank line is a rewrite rule of the form ``key : value`` and is
    stored in ``self.rules``. All values are kept as strings with the
    surrounding whitespace (including the newline) removed.
    """

    def __init__(self, filename):
        """Load the L-system definition stored in *filename*.

        Raises:
            ValueError: If a rule line does not contain a ``:`` separator.
        """
        self.rules = {}
        # The with-block closes the file even if parsing fails; the earlier
        # version called close() on a name that was never defined.
        with open(filename) as source:
            # Enumerate the file object, not its name, to get real lines.
            for i, raw_line in enumerate(source):
                line = raw_line.strip()
                if i == 0:
                    self.angle = line
                elif i == 1:
                    self.iteration = line
                elif i == 2:
                    self.distance = line
                elif i == 3:
                    self.axiom = line
                elif line:
                    # Split on the first colon only and trim each side, so
                    # inconsistent spacing around the separator is tolerated.
                    key, val = (part.strip() for part in line.split(':', 1))
                    self.rules[key] = val

Replace string in line without adding new line?

I want to replace string in a line which contain patternB, something like this:
from:
some lines
line contain patternA
some lines
line contain patternB
more lines
to:
some lines
line contain patternA
some lines
line contain patternB xx oo
more lines
I have code like this:
inputfile = open("d:\myfile.abc", "r")
outputfile = open("d:\myfile_renew.abc", "w")
obj = "yaya"
dummy = ""
item = []
for line in inputfile:
dummy += line
if line.find("patternA") != -1:
for line in inputfile:
dummy += line
if line.find("patternB") != -1:
item = line.split()
dummy += item[0] + " xx " + item[-1] + "\n"
break
outputfile.write(dummy)
It does not replace the line containing "patternB" as expected, but adds a new line below it, like:
some lines
line contain patternA
some lines
line contain patternB
line contain patternB xx oo
more lines
What can I do with my code?
Of course it does, since you append line to dummy at the beginning of the for loop and then append the modified version again in the "if" statement. Also, why check for patternA if you treat it the same way you treat everything else?
# Copy d:\myfile.abc to d:\myfile_renew.abc, rewriting every line that
# contains "patternB" as "<first word> xx <last word>".
obj = "yaya"
item = []
pieces = []
# The with-block closes both files, which also guarantees the output is
# flushed to disk; raw strings keep the backslashes in the paths literal.
with open(r"d:\myfile.abc", "r") as inputfile, \
        open(r"d:\myfile_renew.abc", "w") as outputfile:
    for line in inputfile:
        if "patternB" in line:
            item = line.split()
            pieces.append(item[0] + " xx " + item[-1] + "\n")
        else:
            pieces.append(line)
    # Join once at the end instead of growing a string with += per line.
    dummy = "".join(pieces)
    outputfile.write(dummy)
The simplest will be:
1. Read all File into string
2. Call string.replace
3. Dump string to file
If you want to keep line by line iterator
(for a big file)
for line in inputfile:
if line.find("patternB") != -1:
dummy = line.replace('patternB', 'patternB xx oo')
outputfile.write(dummy)
else:
outputfile.write(line)
This is slower than other responses, but enables big file processing.
This should work
import os
def replace():
    """Interactively replace one word throughout d:\\myfile.abc.

    Prompts for the word to replace and for its replacement, writes the
    converted text to a temporary sibling file, then swaps that file in
    place of the original. Only whole whitespace-separated words that equal
    the search word are replaced. Words on each output line are separated
    by single spaces, so runs of whitespace in the input are normalised.
    """
    source_path = r"d:\myfile.abc"
    renewed_path = r"d:\myfile_renew.abc"
    # raw_input exists only on Python 2; fall back to input on Python 3.
    try:
        ask = raw_input
    except NameError:
        ask = input
    ow = ask("Enter word you wish to replace:")
    nw = ask("Enter new word:")
    # The with-block closes both files even if reading or writing fails.
    with open(source_path, "r") as f1, open(renewed_path, "w") as f2:
        for line in f1:
            words = [nw if word == ow else word for word in line.split()]
            # Re-insert the separators; writing the words back to back
            # fused every word on the line into one.
            f2.write(" ".join(words))
            f2.write('\n')
    os.remove(source_path)
    os.rename(renewed_path, source_path)
replace()
You can use str.replace:
s = '''some lines
line contain patternA
some lines
line contain patternB
more lines'''
print(s.replace('patternB', 'patternB xx oo'))

parsing .xml blast output with re

I'm trying to parse BLAST output in XML format using re, have never done it before, below is my code.
However, since some hits have more than one Hsp_num, I get more results for query_from and query_to and fewer for query_len. How can I specify that, if Hsp_num is more than 1, query_len should be printed for it again? Thank you.
import re
output = open('result.txt','w')
n = 0
with open('file.xml','r') as xml:
for line in xml:
if re.search('<Hsp_query-from>', line) != None:
line = line.strip()
line = line.rstrip()
line = line.strip('<Hsp_query-from>')
line = line.rstrip('</')
query_from = line
if re.search('<Hsp_query-to>', line) != None:
line = line.strip()
line = line.rstrip()
line = line.strip('<Hsp_query-to>')
line = line.rstrip('</')
query_to = line
if re.search('<Hsp_num>', line) != None:
line = line.strip()
line = line.rstrip()
line = line.strip('<Hsp_num>')
line = line.rstrip('</')
Hsp_num = line
print >> output, Hsp_num+'\t'+query_from+'\t'+query_to
output.close()
I did query_len in a separate file, since it didn't work..
with open('file.xml','r') as xml:
for line in xml:
if re.search('<Iteration_query-len>', line) != None:
line = line.strip()
line = line.rstrip()
line = line.strip('<Iteration_query-len>')
line = line.rstrip('</')
query_len = line
Are you familiar with Biopython? Its Bio.Blast.NCBIXML module may be just what you need. Chapter 7 of the Tutorial and Cookbook is all about BLAST, and section 7.3 deals with parsing. You'll get an idea of how it works, and it will be a lot easier than using regex to parse XML, which will only lead to tears and mental breakdowns.

python record separator file iteration

I have a very very large text file (much larger than can fit in memory). What I would like to do is use something similar to:
for record in myFile:
process_record();
with the added trick that my records are separated by blank lines (with all kinds of stuff in between). For example...
data1
data2,data3,moredata
anotherrecord,otherstuff
yippee
kaiyay
mom
aThird,record:here
How would one iterate through the file in python where each loop iteration accesses a single record from the file?
You can do it with a generator function:
def records(textfile):
    """Yield the blank-line-separated records of *textfile* one at a time.

    Args:
        textfile: Any iterable of lines (for example an open text file).
            Lines are consumed lazily, so the whole input never has to be
            held in memory.

    Yields:
        Each record as a single string: its lines concatenated with their
        original line endings. Separator lines are not included, and empty
        records are never produced.
    """
    record_lines = []
    for line in textfile:
        # A separator is any line with nothing but whitespace, which also
        # covers "\r\n" endings and stray spaces.
        if line.strip():
            record_lines.append(line)
        elif record_lines:
            # Only emit when something was collected, so consecutive blank
            # lines do not produce empty records.
            yield ''.join(record_lines)
            record_lines = []
    # Emit the final record unless the input ended on a separator.
    if record_lines:
        yield ''.join(record_lines)
for record in records(the_file):
process(record)
You could create an iterator that joins the lines until you find a blank line.
class MyIter:
    """Iterator over the blank-line-separated records of a line source.

    Each record is returned as one string: the stripped, non-empty lines of
    the record joined with commas.
    """

    def __init__(self, infile):
        """Wrap *infile*, an iterable of lines such as an open text file."""
        self.infile = infile
        # One shared line iterator, so each record resumes where the
        # previous one stopped even when infile is a list and not a file.
        self._lines = iter(infile)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next record, or raise StopIteration when none is left."""
        lines = []
        # Read from the instance's own source; the bare name ``infile`` is
        # not defined inside this method.
        for line in self._lines:
            line = line.strip()
            if line:
                lines.append(line)
            elif lines:
                # A blank line closes a record only once it has content;
                # blank lines ahead of a record are skipped, so consecutive
                # separators no longer end the iteration early.
                break
        if not lines:
            raise StopIteration
        return ",".join(lines)

    # Python 2 spelling of the iterator protocol.
    next = __next__
and try it with
for line in MyIter(infile):
print line

Categories

Resources