python unpacking a text file into separate variables - python

I have a .txt file formatted like this
60
4
20
YF
X : YF+XF+Y
Y : XF-YF-X
I need each line to be a separate variable and the last two to be broken into a key and value in a dictionary. I currently have this:
class LSystem:
def __init__(self,filename):
#complete this method
self.rules = {}
file = open(filename)
for i, line in enumerate(filename):
if i == 0:
self.angle = line
elif i == 1:
self.iteration = line
elif i == 2:
self.distance = line
elif i == 3:
self.axiom = line
elif i >= 4:
(key,val)= line.split
self.rules[key] = val
file.close()
This gives me this error:
Traceback (most recent call last):
File "lab10.py", line 65, in <module>
main()
File "lab10.py", line 10, in main
sys = lsystem.LSystem("arrowheadcurve.txt")
File "/Users/alongo/Dropbox/Freshman Fall Semester/CS 110/Labs/lab-10-fall18-antmelon/lsystem.py", line 17, in __init__
(key,val)= line.split
TypeError: cannot unpack non-iterable builtin_function_or_method object
How do you go about fixing this?

As @Carcigenicate commented, line.split does not actually call the split() function; it only refers to the method object. You need to invoke it by including the parentheses:
(key,val) = line.split()
But note that split() will split on white space characters. For your input this will result in a list containing three items, and unpacking that into only two variables will also fail.
I assume that you should be splitting on the : (further assuming that : can not be present else where in the expression). Try this:
(key, val) = line.split(' : ')
I have included the surrounding spaces in the delimiter so that the leading and trailing spaces are not present in the result. If the white space is inconsistent you can handle it like this:
key, val = [s.strip() for s in line.split(':')]
Also, fix the file iteration by using the file object, not the filename string, and open it in a with statement (so that it will be guaranteed to be properly closed):
with open(filename) as f:
for i, line in enumerate(f):
line = line.strip() # remove leading and trailing white space
if i == 0:
self.angle = line

You've missed a few small things, which I'm commenting on in the following code:
class LSystem:
    """L-system description loaded from a plain-text definition file.

    The file layout is positional: line 1 is the angle, line 2 the iteration
    count, line 3 the distance, line 4 the axiom, and every following
    non-blank line is a rewrite rule written as ``KEY : REPLACEMENT``.

    All values are stored as whitespace-stripped strings; convert them with
    ``int()`` / ``float()`` where numbers are needed.
    """

    def __init__(self, filename):
        """Parse *filename* and populate the L-system attributes.

        Args:
            filename: Path of the definition file to read.

        Raises:
            ValueError: If a rule line does not contain a ``:`` separator.
        """
        self.rules = {}
        # Enumerate the open file object, not the filename string; the
        # ``with`` block closes the file even if parsing raises.
        # (``file`` is avoided as a name because it shadows a Python 2 builtin.)
        with open(filename) as source:
            for i, raw_line in enumerate(source):
                line = raw_line.strip()  # drop the trailing newline
                if i == 0:
                    self.angle = line
                elif i == 1:
                    self.iteration = line
                elif i == 2:
                    self.distance = line
                elif i == 3:
                    self.axiom = line
                elif line:  # rule lines; blank lines are ignored
                    # partition() splits on the first ':' only and tolerates
                    # inconsistent spacing around it.
                    key, sep, val = line.partition(':')
                    if not sep:
                        raise ValueError(
                            "line {}: expected 'KEY : VALUE', got {!r}".format(
                                i + 1, line))
                    self.rules[key.strip()] = val.strip()

Related

python error: Traceback (most recent call last), IndexError: list index out of range

I'm trying to run the below python script (vcf2treemix.py) with the command
<./vcf2treemix.py -vcf allsamples14_filtered_1_autosomes38_bisnps.vcf.gz -pop allsamples14.clust.pop>
I got this error with both python 2 and 3
######### error ###
Traceback (most recent call last):
File "./vcf2treemix.py", line 99, in <module>
main()
File "./vcf2treemix.py", line 95, in main
pop_obj = get_pops(pop_file)
File "./vcf2treemix.py", line 34, in get_pops
pops[fields[0]] = fields[1].split()
IndexError: list index out of range
######### vcf2treemix.py ###
#!/usr/bin/python
# vcf2treemix.py
# Converts a vcf file into TreeMix input
import argparse
from collections import OrderedDict
parser = argparse.ArgumentParser(description="Parsing statistical output of"
" VCFtools")
parser.add_argument("-vcf", dest="vcf_file", help="/mnt/ursus/GROUP-sbifh3/c1845371/whole_genome/data_dog/align_out/treemix/allsamples14_filtered_1_autosomes38_bisnps_main.vcf.gz",
required=True)
parser.add_argument("-pop", dest="pop_file", help="/mnt/ursus/GROUP-sbifh3/c1845371/whole_genome/data_dog/align_out/treemix/allsamples14.clust.pop",
required=True)
arg = parser.parse_args()
def get_pops(pop_file):
    """
    Returns a dictionary with pop identifier as key and taxa as a list of
    strings. In the pop file, each population should be in one line, starting
    with the pop name, a colon and the corresponding taxa separated by
    whitespace. Blank lines are ignored.
    E.g.:
        pop1: taxon1 taxon2 taxon3

    Raises ValueError (naming the offending line) when a non-blank line has
    no colon, instead of failing with a bare IndexError.
    """
    pops = OrderedDict()
    with open(pop_file) as fh:
        for lineno, line in enumerate(fh, 1):
            line = line.strip()
            # A trailing empty line is common in hand-edited files.
            if not line:
                continue
            name, sep, taxa = line.partition(":")
            if not sep:
                raise ValueError(
                    "{}: line {}: expected 'pop: taxon1 taxon2 ...', "
                    "got {!r}".format(pop_file, lineno, line))
            pops[name.strip()] = taxa.split()
    return pops
def vcf2treemix(vcf_file, pop_obj):
    """
    Converts a vcf file into treemix format.

    vcf_file is the path of a VCF file; it is opened with the plain open(),
    so a compressed .vcf.gz must be decompressed first. pop_obj maps each
    population name to the list of its sample names (as returned by
    get_pops). The result is written next to the input, with a trailing
    ".vcf" replaced by ".tmix" (or ".tmix" appended when there is no such
    suffix). Returns None.

    Raises ValueError if a data line appears before the #CHROM header line.
    """
    # str.strip(".vcf") removes any of the characters '.', 'v', 'c', 'f' from
    # both ends rather than the suffix, so remove the suffix explicitly.
    if vcf_file.endswith(".vcf"):
        output_name = vcf_file[:-len(".vcf")] + ".tmix"
    else:
        output_name = vcf_file + ".tmix"

    taxa_pos = None  # column names taken from the #CHROM header line

    # The with block closes both files even if a line fails to parse.
    with open(vcf_file) as vcf_fh, open(output_name, "w") as output_fh:
        # Write header for tmix file
        output_fh.write("{}\n".format(" ".join(pop_obj)))

        for line in vcf_fh:
            line = line.strip()
            # Skip meta-information header and empty lines
            if not line or line.startswith("##"):
                continue
            # Get taxon positions
            if line.startswith("#CHROM"):
                taxa_pos = line.split()
                continue

            fields = line.split()
            # Ignore loci whose ALT column is longer than one character
            # (more than two alleles)
            if len(fields[4]) > 1:
                continue
            if taxa_pos is None:
                raise ValueError(
                    "{}: data line found before the #CHROM header".format(
                        vcf_file))

            # Get allele counts for each population: [count of 0, count of 1]
            temp_pop = OrderedDict((pop, [0, 0]) for pop in pop_obj)
            for pop, taxa in pop_obj.items():
                for taxon in taxa:
                    # Keep only the text before the first ':' so digits in
                    # later sub-fields (e.g. "0/1:10,5") are not counted as
                    # alleles. NOTE(review): assumes the genotype (GT) is the
                    # first FORMAT sub-field -- confirm against the VCF spec.
                    gen = fields[taxa_pos.index(taxon)].split(":", 1)[0]
                    # Skip if gen is missing data
                    if gen in ("./.", ".|.", "."):
                        continue
                    temp_pop[pop][0] += gen.count("0")
                    temp_pop[pop][1] += gen.count("1")

            # Write current locus to file
            output_fh.write("{}\n".format(" ".join(
                "{},{}".format(ref, alt) for ref, alt in temp_pop.values())))
def main():
# Args
vcf_file = arg.vcf_file
pop_file = arg.pop_file
pop_obj = get_pops(pop_file)
vcf2treemix(vcf_file, pop_obj)
main()
I have zero experience with python and I just run the script to manipulate genetic data.
Any help will be highly appreciable.
Thanks
Ali
I tried python 2 and 3 and I expect the script to work straightforward. I think there is no problem with the input data.

Attribute error list object has no attribute strip

I am writing some code to tag a file, which looks at the previous line to see if there is a SCI tag, and if so, tag the current line with SCI_NXT in a fifth column (in a tab delimited file).
However, I get an attribute error saying that I am trying to strip a list (at the line previous_line = split_line(previous_line)) when the previous line is the first line that is not a one-item line. I understand this is because the lines are being stored as lists, not as strings, but I do not understand how I might go about rectifying this. I have tried using "extend", but this resulted in the first line being written with each character as a separate element, which is also not what I am looking to do.
Here is the test text I am working on:
</s>
<s>
Diptera NP Diptera-n SCI
was VBD be-v
the DT the-x
most RBS most-a
common JJ common-j
prey NN prey-n
among IN among-i
the DT the-x
insects NNS insect-n
potentially RB potentially-a
available JJ available-j
to IN to-i
Here is the code:
"""Tags a file with NEXT_SCI in extra feature column. Reads and writes vert files.
"""
import json
#from pip._vendor.pyparsing import line
VFILE = 'test_next.vert'
def split_line(line):
"""Split a line into five parts, word, tag, lempos, ti, sci"""
# TODO: Speak to Diana about the spaces in the vert file - do they mean
# anything?
line = line.strip().split()
if len(line) == 1:
word = line[0]
pos, lempos, tag = None, None, None
elif len(line) == 3:
word, pos, lempos = line
tag = None
elif len(line) == 4:
word, pos, lempos, tag = line
return [word, pos, lempos, tag]
def tag_next_sci(lines):
"""Loops through lines of original document to add to new file (tagged)
"""
taggedlines = []
for line in lines:
taggedlines.append(tagline_next_sci(line, taggedlines))
return taggedlines
def tagline_next_sci(line, taggedlines):
"""Assigns an indicator tag to a line
"""
#<> are structural and do not need to be considered for feature tags so can be committed directly
if line.startswith('<'):
return line
#look back at previous line to see if SCI, if so tag current line
previous_line = taggedlines[-1]
previous_line = split_line(previous_line)
line = split_line(line)
#look at last column. if SCI, print line, go to next line and add tag in final column ("\t\t\tNXT_SCI\n")
if previous_line[-1] == "SCI":
if len(line) == 3:
print(line + "\t\t\tSCI_MOD\n")
return(line + "\t\t\tSCI_MOD\n")
if len(line) == 4:
print(line + "\t\tSCI_MOD\n")
return(line + "\t\tSCI_MOD\n")
return line
def read_vfile(fname):
"""Reads a vert file
"""
with open(fname, 'r') as vfile:
lines = vfile.readlines()
return lines
def write_vfile(fname, taggedlines):
"""Writes a vert file
"""
# write to file
with open(fname, 'w') as outfile:
outfile.writelines(taggedlines)
def tag_vert_sci_next(fname, fname_out):
"""Creates a new file with tags
"""
# read vertical file
lines = read_vfile(fname)
# tag file
taggedlines = tag_next_sci(lines)
# call write file
write_vfile(fname_out, taggedlines)
def main(fname, fname_out):
#call sci_next tagging
tag_vert_sci_next('test_next.vert', fname_out)
if __name__ == "__main__":
main('test_next.vert', 'zenodo_tagged_SCI_MOD.vert')
And the traceback error:
Traceback (most recent call last):
File "/home/sandra/git/trophic/tagging/tagging_NEXT.py", line 123, in <module>
main('test_next.vert', 'zenodo_tagged_SCI_MOD.vert')
File "/home/sandra/git/trophic/tagging/tagging_NEXT.py", line 120, in main
tag_vert_sci_next('test_next.vert', fname_out)
File "/home/sandra/git/trophic/tagging/tagging_NEXT.py", line 78, in tag_vert_sci_next
taggedlines = tag_next_sci(lines)
File "/home/sandra/git/trophic/tagging/tagging_NEXT.py", line 31, in tag_next_sci
taggedlines.append(tagline_next_sci(line, taggedlines))
File "/home/sandra/git/trophic/tagging/tagging_NEXT.py", line 43, in tagline_next_sci
previous_line = split_line(previous_line)
File "/home/sandra/git/trophic/tagging/tagging_NEXT.py", line 14, in split_line
line = line.strip().split()
AttributeError: 'list' object has no attribute 'strip'
Your issue seems to be that tagline_next_sci sometimes returns a list and not a string. For example, I tried putting a print inside the function to see what was going on;
...
def tagline_next_sci(line, taggedlines):
print('taggedlines', taggedlines)
"""Assigns an indicator tag to a line
"""
...
and got the output
taggedlines []
taggedlines ['</s>\n']
taggedlines ['</s>\n', '<s>\n']
taggedlines ['</s>\n', '<s>\n', ['Diptera', 'NP', 'Diptera-n', 'SCI']]
So you should check at the bottom of the function to make sure you always return a string, and maybe do a "\t".join(line) if you need to puzzle together your list to a string, with something like
return line if isinstance(line, str) else "\t".join(line)
Thank you all for your help. Here is the code I ended up with:
"""Tags a file with SCI_MOD in extra feature column. Reads and writes vert files.
"""
import json
VFILE = 'zenodotaggedWS_ALL.vert'
def split_line(line):
    """Split a line into its parts.

    The line is split on whitespace into at most four columns and returned as
    ``[word, pos, lempos, tag]``; columns missing from the line are returned
    as empty strings, so the result always has exactly four items.

    Raises ValueError if the line has more than four columns.
    """
    fields = line.strip().split()
    if len(fields) > 4:
        raise ValueError(
            "expected at most 4 columns, got {}: {!r}".format(len(fields), line))
    # Pad instead of branching on each length, so lines with 0 or 2 columns
    # no longer hit unassigned local variables.
    return fields + [""] * (4 - len(fields))
def tag_next_sci(lines):
"""Loops through lines of original document to add to new file (tagged)
"""
taggedlines = []
for line in lines:
taggedlines.append(tagline_next_sci(line, taggedlines))
return taggedlines
def tagline_next_sci(line, taggedlines):
    """Assigns an indicator tag to a line

    line is the raw input line; taggedlines is the list of output lines
    produced so far (strings). Returns the output line as a string:
    structural and blank lines are returned unchanged, every other line is
    re-joined with tabs, with an extra "SCI_MOD" column appended when the
    third column of the previous output line is "SCI-n".
    """
    #<> are structural and do not need to be considered for feature tags so can be committed directly
    # (blank lines carry no columns either, so they pass through too)
    if line.startswith('<') or not line.strip():
        return line
    fields = split_line(line)
    #look back at previous line to see if SCI, if so tag current line
    # Split the previous output line directly: it may be absent (first line
    # of the file) or already carry an extra SCI_MOD column, and only its
    # third column matters here.
    previous_fields = taggedlines[-1].split() if taggedlines else []
    previous_lempos = previous_fields[2] if len(previous_fields) > 2 else ""
    if previous_lempos == "SCI-n":
        tagged = "\t".join(fields) + "\tSCI_MOD\n"
        print(tagged)
        return tagged
    # split_line always returns a list, so always join it back into a string
    return "\t".join(fields) + "\n"
def read_vfile(fname):
"""Reads a vert file
"""
with open(fname, 'r') as vfile:
lines = vfile.readlines()
return lines
def write_vfile(fname, taggedlines):
"""Writes a vert file
"""
# write to file
with open(fname, 'w') as outfile:
outfile.writelines(taggedlines)
def tag_vert_sci_next(fname, fname_out):
"""Creates a new file with tags
"""
# vertical file location
# make list of species names
# read vertical file
lines = read_vfile(fname)
# tag file
taggedlines = tag_next_sci(lines)
# call write file
write_vfile(fname_out, taggedlines)
def main(fname, fname_out):
#call sci_next tagging
tag_vert_sci_next('zenodotaggedWS_ALL.vert', fname_out)
if __name__ == "__main__":
main('zenodotaggedWS_ALL.vert', 'zenodo_tagged_SCIMOD2.vert')

Unhashable type: list

I am working on a program that parses through log files and returns the top hits for IP addresses and a couple other things. Currently I am having trouble and I cannot interpret any of the answers to this problem to what I have going on right now. This is all of my code:
import gzip
from collections import Counter
logFileName = open('C:\\Users\\Pawlaczykm\\Desktop\\fileNames.txt', 'r')
ipAdd = []
landingPages = []
ALL_ipAdd = []
ALL_landingPages = []
# everything after this line gets done to all files
for line in logFileName.readlines():
# rstrip removes a blank line from output
# print 'Summary of: ' + line.rstrip()
# use gzip to decompress the file
with gzip.open('C:\\Users\\Pawlaczykm\\Desktop\\logFiles\\' + line.rstrip() + '.gz', 'rb') as f:
# we extract the ip addresses in lines 15-18
for eachLine in f:
parts = eachLine.split('\t')
if len(parts) > 1:
ipAdd.append(parts[2])
ALL_ipAdd.append(ipAdd)
# use gzip to decompress the file
with gzip.open('C:\\Users\\Pawlaczykm\\Desktop\\logFiles\\' + line.rstrip() + '.gz', 'rb') as f:
# we extract the landing pages
for eachLine in f:
parts = eachLine.split('\t')
if len(parts) > 1:
variable = parts[8].split('?')[0]
landingPages.append(variable)
v): (-v, k))[:10]
ALL_landingPages.append(landingPages)
ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
sortedALL_ipAdd = sorted(ALL_ipAddDict.iteritems(), key=lambda (k, v): (-v, k))[:10]
print 'Top IPs of all files'
print(sortedALL_ipAdd)
ALL_LandingPageDict = dict(Counter(ALL_landingPages).most_common())
sortedALL_LandingPage = sorted(ALL_LandingPageDict.iteritems(), key=lambda (k, v): (-v, k))[:10]
print 'Top landing pages of all files'
print (sortedALL_LandingPage)
Now where I am having trouble is in the following line:
ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
The output when I run the whole program is this:
Traceback (most recent call last):
File "C:/Users/Pawlaczykm/PycharmProjects/LogParse/parseText.py", line 35, in <module>
ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
File "C:\Python27\lib\collections.py", line 477, in __init__
self.update(*args, **kwds)
File "C:\Python27\lib\collections.py", line 567, in update
self[elem] = self_get(elem, 0) + 1
TypeError: unhashable type: 'list'
Can somebody help me? This is frustrating.
From your code ALL_ipAdd = [] and ipAdd = [] and ALL_ipAdd.append(ipAdd) we can conclude that ALL_ipAdd is a list of lists. Counter is a subtype of dict, which hashes its items before it counts them. Lists cannot be hashed because they are mutable (if the list changed, the hash would change), and thus lists can't be counted by Counter objects.
To solve this you can convert the inner lists to tuples before counting them:
ALL_ipAddDict = dict(Counter(map(tuple, ALL_ipAdd)).most_common())
That's normal. ALL_ipAdd is a list of lists. Counter needs an iterable of hashable items (strings, numbers, tuples, ...), and lists are not hashable :)

Replace string in line without adding new line?

I want to replace string in a line which contain patternB, something like this:
from:
some lines
line contain patternA
some lines
line contain patternB
more lines
to:
some lines
line contain patternA
some lines
line contain patternB xx oo
more lines
I have code like this:
inputfile = open("d:\myfile.abc", "r")
outputfile = open("d:\myfile_renew.abc", "w")
obj = "yaya"
dummy = ""
item = []
for line in inputfile:
dummy += line
if line.find("patternA") != -1:
for line in inputfile:
dummy += line
if line.find("patternB") != -1:
item = line.split()
dummy += item[0] + " xx " + item[-1] + "\n"
break
outputfile.write(dummy)
It does not replace the line containing "patternB" as expected, but adds a new line below it, like this:
some lines
line contain patternA
some lines
line contain patternB
line contain patternB xx oo
more lines
What can I do with my code?
Of course it does, since you append line to dummy at the beginning of the for loop and then append the modified version again in the "if" statement. Also, why check for patternA if you treat it the same way as everything else?
inputfile = open("d:\myfile.abc", "r")
outputfile = open("d:\myfile_renew.abc", "w")
obj = "yaya"
dummy = ""
item = []
for line in inputfile:
if line.find("patternB") != -1:
item = line.split()
dummy += item[0] + " xx " + item[-1] + "\n"
else:
dummy += line
outputfile.write(dummy)
The simplest will be:
1. Read all File into string
2. Call string.replace
3. Dump string to file
If you want to keep line by line iterator
(for a big file)
for line in inputfile:
if line.find("patternB") != -1:
dummy = line.replace('patternB', 'patternB xx oo')
outputfile.write(dummy)
else:
outputfile.write(line)
This is slower than other responses, but enables big file processing.
This should work
import os
def replace(path=r"d:\myfile.abc", ow=None, nw=None):
    """Replace every whole-word occurrence of a word in a text file, in place.

    A "word" is a run of non-whitespace characters; it is replaced only when
    it equals ``ow`` exactly. Spacing and line endings are kept as they are
    (writing the tokens back without a separator would glue the words of each
    line together).

    Args:
        path: File to rewrite. Defaults to the original hard-coded location.
        ow: Word to replace. Prompted for interactively when None.
        nw: Replacement word. Prompted for interactively when None.
    """
    import re  # local import: only this function needs it

    if ow is None or nw is None:
        try:
            ask = raw_input  # Python 2
        except NameError:
            ask = input  # Python 3
        if ow is None:
            ow = ask("Enter word you wish to replace:")
        if nw is None:
            nw = ask("Enter new word:")

    def swap(match):
        # Leave every token that is not exactly the word being replaced.
        token = match.group(0)
        return nw if token == ow else token

    renewed = path + ".renew"
    # Both files are closed before the rename, even if a write fails.
    with open(path, "r") as f1, open(renewed, "w") as f2:
        for line in f1:
            f2.write(re.sub(r"\S+", swap, line))
    # Remove first: os.rename() refuses to overwrite an existing file on Windows.
    os.remove(path)
    os.rename(renewed, path)
replace()
You can use str.replace:
s = '''some lines
line contain patternA
some lines
line contain patternB
more lines'''
print(s.replace('patternB', 'patternB xx oo'))

Why is this writing part of the text to a new line? (Python)

I'm adding some new bits to one of the lines in a text file and then writing it along with the rest of the lines in the file to a new file. Referring to the 2nd if statement in the while loop, I want that to be all on the same line:
path = raw_input("Enter the name of the destination folder: ")
source_file = open("parameters")
lnum=1
for line in source_file:
nums = line.split()
if (lnum==10):
mTot = float(nums[0])
if (lnum==11):
qinit = float(nums[0])
if (lnum==12):
qfinal = float(nums[0])
if (lnum==13):
qgrowth = float(nums[0])
if (lnum==14):
K = float(nums[0])
lnum = lnum+1
q = qinit
m1 = mTot/(1+qinit)
m2 = (mTot*qinit)/(1+qinit)
taua = (1/3.7)*(mTot**(-4.0/3.0))
taue = taua/K
i = 1
infname = 'parameters'
while (q <= qfinal):
outfname = path+'/'+str(i)
oldfile = open(infname)
lnum=1
for line in oldfile:
if (lnum==17):
line = "{0:.2e}".format(m1)+' '+line
if (lnum==18):
line = "{0:.2e}".format(m2)+' '+line+' '+"{0:.2e}".format(taua)+' '+" {0:.2e}".format(taue)
newfile = open(outfname,'a')
newfile.write(line)
lnum=lnum+1
oldfile.close()
newfile.close()
i=i+1
q = q + q*(qgrowth)
m1 = mTot/(1+q)
m2 = (mTot*q)/(1+q)
but taua and taue are being written on the line below the rest of it. What am I missing here?
That is because line still contains the trailing newline, and when you concatenate it you are also including the newline.
Insert a
line = line.strip()
right after the if (lnum==18): but before you put the longer line together to get rid of the newline.
Note that write will not add a newline automatically, so you'll want to add a trailing newline of your own.
UPDATE:
This is untested, but I think unless I messed up, you could just use this instead of your longer line:
line = line.strip()
# Number every field explicitly: str.format() raises ValueError when "{}" is
# mixed with "{0}", and repeating {0} would print x three times.
line = "{0:.2e} {1} {2:.2e} {3:.2e}\n".format(x, line, y, z)
If you use line = line.rstrip() on line before you change it, it will trim the newline (as well as any other trailing whitespace).

Categories

Resources