read fasta file and compute DNA gc content - python

First, I want to say that I am new to python programming. I spent nearly 20 hours for figuring this out, however, my code is still frustrating. I have a fasta file which comprise of ID and DNA sequence. I would like to read in the FASTA data and do some computational work.
The FASTA file reads like this:
>1111886
AACGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGA…
>1111885
AGAGTTTGATCCTGGCTCAGAATGAACGCTGGCGGCGTGCCT…
>1111883
GCTGGCGGCGTGCCTAACACATGTAAGTCGAACGGGACTGGG…
I wrote the following code to read in the fasta data and do some analysis such as compute gc content by ID, average sequence length ,etc. I put the detailed description for what I want to do in docstring. I appreciate anyone who can improve my code, especially how to get gc content for each ID.
class fasta(object):
def __init__(self, filename):
self.filename = filename
self.num_sequences = None
self.sequences = {} #{seq_id} = sequence
def parse_file(self):
**"""Reads in the sequence data contained in the filename associated with this instance of the class.
Stores both the sequence and the optional comment for each ID."""**
with open(self.filename) as f:
return f.read().split('>')[1:]
def get_info(self):
**"""Returns a description of the class in a pretty string format.
The description should include the filename for the instance and the number of sequences."""**
for line in file(self.filename, 'r'):
if line.startswith('>'):
self.num_sequences += 1
return self.num_sequences
def compute_gc_content(self,some_id):
**"""compute the gc conent for sequence ID some_id. If some_id, return an appropriate error values"""**
baseFrequency = {}
for line in file(self.filename, 'r'):
if not line.startswith(">"):
for base in sequence:
baseFrequency[base] = baseFrequency.get(base,0)+1
items = baseFrequency.items()
items.sort()
for i in items:
gc=(baseFrequency['G'] + baseFrequency['C'])/float(len(sequence))
return gc
def sequence_statistics(self):
**"""returns a dictionary containing
The average sequence length
The average gc content"""**
baseFrequency = {}
for line in file(self.filename, 'r'):
if not line.startswith(">"):
for base in sequence:
baseFrequency[base] = baseFrequency.get(base,0)+1
items = baseFrequency.items()
items.sort()
for i in items:
gc=(baseFrequency['G'] + baseFrequency['C'])/float(len(sequence))
aveseq=sum(len(sequence))/float(self.count)
return (gc,aveseq)
def get_all_kmers(self, k=8):
**"""get all kmer counts of size k in fasta file. Returns a dictionary with keys equal to the kmers
and values equal to the counts"""**
t={}
for x in range (len(self.sequence)+1-k):
kmer=self.sequence[x:x+k]
t[kmer]=f.get(kmer,0)+1
kmers = get_all_kmers(k=8)
return(t,len(kmers))
def query_sequence_id(self, some_id):
**"""query sequence ids for some_id. If some_id does not exist in the class, return
a string error message"""**
for line in file(self.filename, 'r'):
if id in line:
print "The id exists"
else:
print "The id does not exist"

This should be able to read and parse your file. saving it in a dictionary with the id as keys, and info\data as a sub dictionary for each id.
Please ask if you don't understand all of this.
def parse_file(self):
"""Reads in the sequence data contained in the filename associated with this instance of the class.
Stores both the sequence and the optional comment for each ID."""
def parser(filename):
seq = []
with open(filename, 'r') as f:
for line in f:
if line.startswith('>'):
if seq:
yield seqId, seqInfo, ''.join(seq)
line = line.split()
seqId = line[0][1:]
if len(line) > 1:
seqInfo = ' '.join(line[1:])
else:
seqInfo = ''
seq = []
else:
seq.append(line.replace('\n', ''))
if seq:
yield seqId, seqInfo, ''.join(seq)
sequences = parser(self.filename)
self.sequences = {sequenceId: {'info': sequenceInfo, 'data': sequenceData} for (sequenceId, sequenceInfo, sequenceData) in sequences}

Related

Write parsed fasta file back to fasta format from a dictionary

I have created a function that parses a Fasta file because I needed to remove some odd characters. Now I have a dictionary and want to turn it back to a fasta format. I am new to Fasta files so I don't know how to proceed.
The dictionary has this format:
{'NavAb:/1126': 'TNIVESSFFTKFIIYLIVLNGITMGLETSKTFMQSFGVYTTLFNQIVITIFTIEIILRIYVHRISFFKDPWSLFDFFVVAISLVPTSSGFEILRVLRVLRLFRLVTAVPQMRKI', 'Shaker:/1656': 'SSQAARVVAIISVFVILLSIVIFCLETLEDEVPDITDPFFLIETLCIIWFTFELTVRFLACPLNFCRDVMNVIDIIAIIPYFITTLNLLRVIRLVRVFRIFKLSRHSKGLQIL', .....
The function:
def parse_file(input_file):
parsed_seqs = {}
curr_seq_id = None
curr_seq = []
for line in newfile:
line = line.strip()
line = line.replace('-', '')
if line.startswith(">"):
if curr_seq_id is not None:
parsed_seqs[curr_seq_id] = ''.join(curr_seq)
curr_seq_id = line[1:]
curr_seq = []
continue
curr_seq.append(line)
parsed_seqs[curr_seq_id] = ''.join(curr_seq)
return parsed_seqs
newfile = open("file")
parsed_seqs = parse_file(newfile)
print(parsed_seqs)
If you can use an existing library for this task, you may use Biotite:
import biotite.sequence.io.fasta as fasta
seq_dict = {
'NavAb:/1126': 'TNIVESSFFTKFIIYLIVLNGITMGLETSKTFMQSFGVYTTLFNQIVITIFTIEIILRIYVHRISFFKDPWSLFDFFVVAISLVPTSSGFEILRVLRVLRLFRLVTAVPQMRKI',
'Shaker:/1656': 'SSQAARVVAIISVFVILLSIVIFCLETLEDEVPDITDPFFLIETLCIIWFTFELTVRFLACPLNFCRDVMNVIDIIAIIPYFITTLNLLRVIRLVRVFRIFKLSRHSKGLQIL'
}
fasta_file = fasta.FastaFile()
for header, seq_str in seq_dict.items():
fasta_file[header] = seq_str
fasta_file.write("path/to/file.fasta")
path/to/file.fasta:
>NavAb:/1126
TNIVESSFFTKFIIYLIVLNGITMGLETSKTFMQSFGVYTTLFNQIVITIFTIEIILRIYVHRISFFKDPWSLFDFFVVA
ISLVPTSSGFEILRVLRVLRLFRLVTAVPQMRKI
>Shaker:/1656
SSQAARVVAIISVFVILLSIVIFCLETLEDEVPDITDPFFLIETLCIIWFTFELTVRFLACPLNFCRDVMNVIDIIAIIP
YFITTLNLLRVIRLVRVFRIFKLSRHSKGLQIL
Note that I belong to the developers of this package. There are also solutions in a multitude of other packages, such as Biopython.

How can I print a dictionary key while in a for loop in python

I have a problem, I have a dictionary and i need to pass a function to each value of the dictionary, I need my output to include the value key + the result of the function.
The program itself should identify open reading frames in a given DNA sequences, I have this part working but i need to print the name of sequence + the open reading frames.
Im new to programming and i got stuck. All help will be greatly appreciated.
#converts a fasta file into a dictionary.
import re
myfile = input("Enter a file name and directory:")
try:
f=open(myfile)
except IOError:
print("File doesn't exist!")
seqs = {}
for line in f:
line=line.rstrip()
#Gets rid of triling empy spaces.
if line[0]=='>':
words=line.split()
name=words[0][1:]
seqs[name] = ''
else:
seqs[name] = seqs[name] + line
print("Number of entries:",len(seqs.keys()))
length_seqs = {key:len(seq)for key, seq in seqs.items()}
sorted_length_seqs = sorted(length_seqs.items(), key=lambda kv:kv[1])
print("Entries by length:",sorted_length_seqs)
#finds the ORF in the dictionary sequences.
def find_ORFs(DNA):
ORFs = []
if 'ATG' in DNA:
for startMatch in re.finditer('ATG',DNA):
remaining = DNA[startMatch.start():]
for stopMatch in re.finditer('TAA|TGA|TAG',remaining):
substring = remaining[:stopMatch.end()]
if len(substring) % 3 == 0:
ORFs.append(substring)
break
else:
print("There are no ORFs in your sequence")
ORFs.sort(key=len, reverse=True)
print(ORFs)
for ORF in ORFs:
print(ORF,'ORF lenght',len(ORF))
#passes the function to the dictionary values.
#Here i need to pass the function to each of the values of the dictionary but i cant manage to make it print the the dictionary key of each value.
for seq in seqs:
DNA = seqs[seq]
ORF = find_ORFs(DNA)
You could pass the name of the sequence to the function as well:
def find_ORFs(name, DNA):
ORFs = []
if 'ATG' in DNA:
for startMatch in re.finditer('ATG',DNA):
remaining = DNA[startMatch.start():]
for stopMatch in re.finditer('TAA|TGA|TAG',remaining):
substring = remaining[:stopMatch.end()]
if len(substring) % 3 == 0:
ORFs.append(substring)
break
else:
print("There are no ORFs in your sequence: {}".format(name))
ORFs.sort(key=len, reverse=True)
print(ORFs)
for ORF in ORFs:
print("{}, {}, {}".format(name, len(ORF), ORF))
for name, DNA in seqs.items():
ORF = find_ORFs(name, DNA)
You should also take a look at some external libraries that can help you with parsing FASTA files, such as SeqIO from BioPython.

Trying to only look at specific lines in txt file TypeError: 'in <string>' requires string as left operand, not dict raises when I try

ask questions for clarification
I think it would be best if I just posted my code for better understanding. So first I just simply ask the user to input a file name, just to see if it exists. And if it doesn't it exits the program.
I have a separate file containing a list of keywords, which I put into a dict
then I check if another file exists... similar to before. and with this file, Im checking to see if the keywords in the first file are contained in the second one. and calculating the sentiment value. But this is not what I need help with I just wanted to explain a little before hand
So my question is, in the section below:
for line in open('tweets.txt'):
line = line.split(" ")
lat = float(line[0][1:-1]) #Stripping the [ and the ,
long = float(line[1][:-1]) #Stripping the ]
if eastern.contains(lat, long):
eastScore += score(line)
elif central.contains(lat, long):
centralScore += score(line)
elif mountain.contains(lat, long):
mountainScore += score(line)
elif pacific.contains(lat, long):
pacificScore += score(line)
else:
continue
how would I be able to focus on only lines in the file that contain keywords and not the entire file?
like you see in this part where I ignore lines without values?
with open('tweets.txt') as f:
for line in f:
values = Counter(word for word in line.split() if word in sentiments)
if not values:
continue
I tried methods such as creating a new file and writing the lines that contain keywords into the new file, but that raised
TypeError: 'in <string>' requires string as left operand, not dict
but it wasnt really what I wanted to do anyways. So my first question is, how would I be able to focus on only lines that contain keywords for that section above?
[41.923916200000001, -88.777469199999999] 6 2011-08-28 19:24:18 My life is a moviee.
from collections import Counter
try:
keyW_Path = input("Enter file named keywords: ")
keyFile = open(keyW_Path, "r")
except IOError:
print("Error: file not found.")
exit()
# Read the keywords into a list
keywords = {}
wordFile = open('keywords.txt', 'r')
for line in wordFile.readlines():
word = line.replace('\n', '')
if not(word in keywords.keys()): #Checks that the word doesn't already exist.
keywords[word] = 0 # Adds the word to the DB.
wordFile.close()
# Read the file name from the user and open the file.
try:
tweet_path = input("Enter file named tweets: ")
tweetFile = open(tweet_path, "r")
except IOError:
print("Error: file not found.")
exit()
#Calculating Sentiment Values
with open('keywords.txt') as f:
sentiments = {word: int(value) for word, value in (line.split(",") for line in f)}
with open('tweets.txt') as f:
for line in f:
values = Counter(word for word in line.split() if word in sentiments)
if not values:
continue
happyScore_Tweet = (sum(values[word]*sentiments[word] for word in values)) // (len(values))
print(happyScore_Tweet)
def score(tweet):
total = 0
for word in tweet:
if word in sentiments:
total += 1
return total
#Classifying the regions
class Region:
def __init__(self, lat_range, long_range):
self.lat_range = lat_range
self.long_range = long_range
def contains(self, lat, long):
return self.lat_range[0] <= lat and lat < self.lat_range[1] and\
self.long_range[0] <= long and long < self.long_range[1]
eastern = Region((24.660845, 49.189787), (-87.518395, -67.444574))
central = Region((24.660845, 49.189787), (-101.998892, -87.518395))
mountain = Region((24.660845, 49.189787), (-115.236428, -101.998892))
pacific = Region((24.660845, 49.189787), (-125.242264, -115.236428))
eastScore = 0
centralScore = 0
pacificScore = 0
mountainScore = 0
happyScoreE = 0
for line in open('tweets.txt'):
line = line.split(" ")
lat = float(line[0][1:-1]) #Stripping the [ and the ,
long = float(line[1][:-1]) #Stripping the ]
if eastern.contains(lat, long):
eastScore += score(line)
elif central.contains(lat, long):
centralScore += score(line)
elif mountain.contains(lat, long):
mountainScore += score(line)
elif pacific.contains(lat, long):
pacificScore += score(line)
else:
continue
Use regex to extract the lat and long.
import re
text = open(filename, 'r')
matches = re.findall("(\-?\d+\.\d+?),\s*(\-?\d+\.\d+)", text.read())
Matches will return a list of strings containing only your lat and long.
Also, there are some very good tools in python you can use for spatial queries, you should look them up.

Reading files into dictionaries

I am trying to read from a file into a dictionary. The lane.split() method will not work as I am formatting my file over separate lines, with too many spaces.
in inventory2
(item, description) = line.split()
ValueError: too many values to unpack
Here is my text file. Key \n Value.
Key
A rusty old key, you used it to gain entry to the manor.
A stick
You found it on your way in, it deals little damage.
Health potion
A health potion, it can restore some health.
Any solutions to this would be much appreciated.
def inventory2():
inventory_file = open("inventory_test.txt", "r")
inventory = {}
for line in inventory_file:
(item, description) = line.split()
inventory[(item)] = description
#invenory = {inventory_file.readline(): inventory_file.readline()}
print(line)
inventory_file.close
You are looping over each line in the file, so there will never be a line with both key and value. Use the next() function to get the next line for a given key instead:
def inventory2():
with open("inventory_test.txt", "r") as inventory_file:
inventory = {}
for line in inventory_file:
item = line.strip()
description = next(inventory_file).strip()
inventory[item] = description
return inventory
or, more compact with a dict comprehension:
def inventory2():
with open("inventory_test.txt", "r") as inventory_file:
return {line.strip(): next(inventory_file).strip() for line in inventory_file}
Here is another way:
def inventory2():
inventory_file = open("inventory_test.txt", "r")
inventory = {}
lines = inventory_file.readlines()
x = 0
while (x < len(lines)):
item = lines[x].strip()
description = lines[x+1].strip()
inventory[item] = description
x += 2
print inventory
return inventory
Outputs:
{'Health potion': 'A health potion, it can restore some health.', 'A stick': 'You found it on your way in, it deals little damage.', 'Key': 'A rusty old key, you used it to gain entry to the manor.'}

Optimizing python file search?

I'm having some trouble optimizing this part of code.
It works, but seems unnecessary slow.
The function searches after a searchString in a file starting on line line_nr and returns the line number for first hit.
import linecache
def searchStr(fileName, searchString, line_nr = 1, linesInFile):
# The above string is the input to this function
# line_nr is needed to search after certain lines.
# linesInFile is total number of lines in the file.
while line_nr < linesInFile + 1:
line = linecache.getline(fileName, line_nr)
has_match = line.find(searchString)
if has_match >= 0:
return line_nr
break
line_nr += 1
I've tried something along these lines, but never managed to implement the "start on a certain line number"-input.
Edit: The usecase. I'm post processing analysis files containing text and numbers that are split into different sections with headers. The headers on line_nr are used to break out chunks of the data for further processing.
Example of call:
startOnLine = searchStr(fileName, 'Header 1', 1, 10000000):
endOnLine = searchStr(fileName, 'Header 2', startOnLine, 10000000):
Why don't you start with simplest possible implementation ?
def search_file(filename, target, start_at = 0):
with open(filename) as infile:
for line_no, line in enumerate(infile):
if line_no < start_at:
continue
if line.find(target) >= 0:
return line_no
return None
I guess your file is like:
Header1 data11 data12 data13..
name1 value1 value2 value3...
...
...
Header2 data21 data22 data23..
nameN valueN1 valueN2 valueN3..
...
Does the 'Header' string contains any constant formats(i.e: all start with '#' or sth). If so, you can read the line directly, judge if the line contains this format (i.e: if line[0]=='#') and write different code for different kinds of lines(difination line and data line in your example).
Record class:
class Record:
def __init__(self):
self.data={}
self.header={}
def set_header(self, line):
...
def add_data(self, line):
...
iterate part:
def parse(p_file):
record = None
for line in p_file:
if line[0] == "#":
if record : yield record
else:
record = Record()
record.set_header(line)
else:
record.add_data(line)
yield record
main func:
data_file = open(...)
for rec in parse(data_file):
...

Categories

Resources