I am trying to solve the "Consensus and Profile" challenge on Rosalind.
The challenge instructions are as follows:
Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.
Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)
My code is as follows (I got most of it from another user on this website). My only issue is that some of the DNA strands are broken down into multiple separate lines, so they are being appended to the "allstrings" list as separate strings. I am trying to figure out how to write each consecutive line that does not contain ">" as a single string.
import numpy as np
seq = []
allstrings = []
temp_seq = []
matrix = []
C = []
G = []
T = []
A = []
P = []
consensus = []
position = 1
file = open("C:/Users/knigh/Documents/rosalind_cons (3).txt", "r")
conout = open("C:/Users/knigh/Documents/consensus.txt", "w")
# Right now, this is reading and writing each as an individual line. Thus, it
# is splitting each sequence into multiple small sequences. You need to figure
# out how to read this in FASTA format to prevent this from occurring
desc = file.readlines()
for line in desc:
allstrings.append(line)
for string in range(1, len(allstrings)):
if ">" not in allstrings[string]:
temp_seq.append(allstrings[string])
else:
seq.insert(position, temp_seq[0])
temp_seq = []
position += 1
# This last insertion into the sequence must be performed after the loop to empty
# out the last remaining string from temp_seq
seq.insert(position, temp_seq[0])
for base in seq:
matrix.append([pos for pos in base])
M = np.array(matrix).reshape(len(seq), len(seq[0]))
for base in range(len(seq[0])):
A_count = 0
C_count = 0
G_count = 0
T_count = 0
for pos in M[:, base]:
if pos == "A":
A_count += 1
elif pos == "C":
C_count += 1
elif pos == "G":
G_count += 1
elif pos == "T":
T_count += 1
A.append(A_count)
C.append(C_count)
G.append(G_count)
T.append(T_count)
profile_matrix = {"A": A, "C": C, "G": G, "T": T}
P.append(A)
P.append(C)
P.append(G)
P.append(T)
profile = np.array(P).reshape(4, len(A))
for pos in range(len(A)):
if max(profile[:, pos]) == profile[0, pos]:
consensus.append("A")
elif max(profile[:, pos]) == profile[1, pos]:
consensus.append("C")
elif max(profile[:, pos]) == profile[2, pos]:
consensus.append("G")
elif max(profile[:, pos]) == profile[3, pos]:
consensus.append("T")
conout.write("".join(consensus) + "\n")
for k, v in profile_matrix.items():
conout.write(k + ": " + " ".join(str(x) for x in v) + "\n")
conout.close()
There are a couple of ways that you can iterate a FASTA file as records. You can use a prebuilt library or write your own.
A widely used library for working with sequence data is biopython. This code snippet will create a list of strings.
from Bio import SeqIO
file = "path/to/your/file.fa"
sequences = []
with open(file, "r") as file_handle:
for record in SeqIO.parse(file_handle, "fasta"):
sequences.append(record.seq)
Alternatively, you can write your own FASTA parser. Something like this should work:
def read_fasta(fh):
# Iterate to get first FASTA header
for line in fh:
if line.startswith(">"):
name = line[1:].strip()
break
# This list will hold the sequence lines
fa_lines = []
# Now iterate to find the get multiline fasta
for line in fh:
if line.startswith(">"):
# When in this block we have reached
# the next FASTA record
# yield the previous record's name and
# sequence as tuple that we can unpack
yield name, "".join(fa_lines)
# Reset the sequence lines and save the
# name of the next record
fa_lines = []
name = line[1:].strip()
# skip to next line
continue
fa_lines.append(line.strip())
yield name, "".join(fa_lines)
You can use this function like so:
file = "path/to/your/file.fa"
sequences = []
with open(file, "r") as file_handle:
for name, seq in read_fasta(file_handle):
sequences.append(seq)
Related
How would I find the size of every function in a file using python?
For context, I'm learning pyplot as well as scipy, and I wanted to measure the size of the functions in a programming file, and then measure the frequency of that size.
What I'm looking for is a way to read a file, identify a function, count the lines of the function, and then add it to a list, I don't really care if the function returns a list, I can always write a function that condenses the list of lists into one list.
Example of what I'm trying to implement:
Function A has a length of 10 lines
Function B has a length of 16 lines
Function C has a length of 8 lines
Function D has a length of 5 lines
Given the above data, I want to be able to condense that into a list of [10,16,8,5].
Additionally: I'm going to be testing this on a couple of beginner C projects, and only want to test the files with the .c extension, not .h.
Code I have so far:
# Counts the number of lines in the file
def line_counter(file_name):
tot_line = 0
with open(file_name, 'r') as f:
for line in f:
tot_line += 1
return (tot_line)
# Counts the number of lines that end with a semicolon
def semi_counter(file_name):
tot_semi = 0
with open(file_name, 'r') as f:
for line in f:
sline = line.strip()
if len(sline) > 0 and sline[-1] == ';':
tot_semi += 1
return (tot_semi)
# Returns the maximum nesting depth of the file
def max_depth(file_name):
max_dep = 0
dep = 0
with open(file_name, 'r') as f:
for line in f:
for ch in line:
if ch == '{':
dep += 1
if ch == '}':
dep -= 1
if dep > max_dep:
max_dep = dep
return max_dep
# Counts the number of characters in a file
def char_counter(file_name):
tot_chars = 0
with open(file_name, 'r') as f:
for line in f:
tot_chars += len(line)
return (tot_chars)
# Counts the number of comments in the file
def comm_counter(file_name):
comm_chars = 0
with open(file_name, 'r') as f:
block_comment = False
for line in f:
lch = None
line_comment = False
for ch in line:
if lch == '/' and ch == '*':
block_comment = True
if lch == '*' and ch == '/':
block_comment = False
if lch == '/' and ch == '/':
line_comment = True
lch = ch
if line_comment or block_comment:
comm_chars += 1
return comm_chars
# Counts the number of root level functions
def block_counter(file_name):
block_tot = 0
dep = 0
with open(file_name, 'r') as f:
for line in f:
for ch in line:
if ch == '{':
dep += 1
if ch == '}':
dep -= 1
if dep == 0:
block_tot += 1
return block_tot
Beginner Coding problem I am supposed to write a code that reverses the contents of a file and then inserts a number of random characters based on a strength the user chooses. It then creates a new file containing the obstructed file.
For example, if the user chooses strength = 2, it will insert 2 random characters between each letter in the text file: The cat sits ---> sgyt6gilns t7faxdc e3dh1kT
Right now my program inserts too many characters in between and I can't figure out why.
This is what it's doing:
input: CAT
Output of strength = 1: TeAEADQoC
import string
import random
def getRandomChar():
alpha = string.ascii_letters + string.digits
return random.choice(alpha)
def randomString(EncrypStrength):
count = 0
result = ''
while count < len(EncrypStrength):
result += getRandomChar()
count += 1
return result
def ReverseString(OrigFile):
return OrigFile[::-1]
def LineEncrypt(line, EncrypStrength):
EncrypStrength = ReverseString(line)
index = 0
newline = EncrypStrength[index]
index += 1
while index < len(EncrypStrength):
newline += randomString(EncrypStrength)
newline += EncrypStrength[index]
index += 1
return newline
def main():
OrigFile =input('Original File Name:')
EncryptedFile = input("obfuscated File Name:")
EncrypStrength = int(input('Enter the Encryption Strength:'))
Orig = open(OrigFile, 'r')
Encrypted = open(EncryptedFile, 'w')
line = Orig.readline()
while line!= '':
encryptLine = LineEncrypt(line, EncrypStrength)
Encrypted.write(encryptLine +"\n")
line = Orig.readline()
Orig.close()
Encrypted.close()
if __name__=="__main__":
main()
In Line Encrypt method you are using incorrectly Encrypt Strength, you are overriding the number of characters to put as EncryptStrength with reversed line.
def LineEncrypt(line, EncrypStrength):
reversedString = ReverseString(line)
index = 0
newline = reversedString[index]
index += 1
while index < len(reversedString):
newline += randomString(EncrypStrength)
newline += reversedString[index]
index += 1
You are confusing EncrypStrength and overriding it as Ritesh mentioned.
Here is the full corrected code, I hope it will work as you expected.
import string
import random
def getRandomChar():
alpha = string.ascii_letters + string.digits
return random.choice(alpha)
def randomString(EncrypStrength):
count = 0
result = ''
while count < EncrypStrength:
result += getRandomChar()
count += 1
return result
def ReverseString(OrigFile):
return OrigFile[::-1]
def LineEncrypt(line, EncrypStrength):
RevStr = ReverseString(line)
index = 0
newline = RevStr[index]
index += 1
while index < len(RevStr):
newline += randomString(EncrypStrength)
newline += RevStr[index]
index += 1
return newline
def main():
OrigFile =input('Original File Name:')
EncryptedFile = input("obfuscated File Name:")
EncrypStrength = int(input('Enter the Encryption Strength:'))
Orig = open(OrigFile, 'r')
Encrypted = open(EncryptedFile, 'w')
line = Orig.readline()
while line!= '':
encryptLine = LineEncrypt(line, EncrypStrength)
Encrypted.write(encryptLine +"\n")
line = Orig.readline()
Orig.close()
Encrypted.close()
if __name__=="__main__":
main()
I've been writing a Countdown program in Python, and in it. I've written this:
#Letters Game
global vowels, consonants
from random import choice, uniform
from time import sleep
from itertools import permutations
startLetter = ""
words = []
def check(word, startLetter):
fileName = startLetter + ".txt"
datafile = open(fileName)
for line in datafile:
print("Checking if", word, "is", line.lower())
if word == line.lower():
return True
return False
def generateLetters():
lettersLeft = 9
output = []
while lettersLeft >= 1:
lType = input("Vowel or consonant? (v/c)")
sleep(uniform(0.5, 1.5))
if lType not in ("v", "c"):
print("Please input v or c")
continue
elif lType == "v":
letter = choice(vowels)
print("Rachel has picked an", letter)
vowels.remove(letter)
output.append(letter)
elif lType == "c":
letter = choice(consonants)
print("Rachel has picked a", letter)
consonants.remove(letter)
output.append(letter)
print("Letters so far:", output)
lettersLeft -= 1
return output
def possibleWords(letters, words):
for i in range(1,9):
print(letters)
print(i)
for item in permutations(letters, i):
item = "".join(list(item))
startLetter = list(item)[0]
if check(item, startLetter):
print("\n\n***Got one***\n", item)
words.append(item)
return words
vowels = ["a"]*15 + ["e"]*21 + ["i"]*13 + ["o"]*13+ ["u"]*5
consonants = ["b"]*2 + ["c"]*3 + ["d"]*6 + ["f"]*2 + ["g"]*3 +["h"]*2 +["j"]*1 +["k"]*1 +["l"]*5 +["m"]*4 +["n"]*8 +["p"]*4 +["q"]*1 +["r"]*9 +["s"]*9 +["t"]*9 + ["v"]*1 +["w"]*1 +["x"]*1 +["y"]*1 +["z"]*1
print("***Let's play a letters game!***")
sleep(3)
letters = generateLetters()
sleep(uniform(1, 1.5))
print("\n\n***Let's play countdown***\n\n\n\n\n")
print(letters)
for count in reversed(range(1, 31)):
print(count)
sleep(1)
print("\n\nStop!")
print("All possible words:")
print(possibleWords(letters, words))
'''
#Code for sorting the dictionary into files
alphabet = "abcdefghijklmnopqrstuvwxyz"
alphabet = list(alphabet)
for letter in alphabet:
allFile = open("Dictionary.txt", "r+")
filename = letter + ".txt"
letterFile = open(filename, "w")
for line in allFile:
if len(list(line.lower())) <= 9:
if list(line.lower())[0] == letter:
print("Writing:", line.lower())
letterFile.write(line.lower())
allFile.close()
letterFile.close()
I have 26 text files called a.txt, b.txt, c.txt... to make the search quicker
(Sorry it's not very neat - I haven't finished it yet)
However, instead of returning what I expect (pan), it returns all words with pan in it (pan, pancake, pans, pandemic...)
Is there any way in Python you can only return the line if it's EXACTLY the same as the string? Do I have to .read() the file first?
Thanks
Your post is strangely written so excuse me if I missmatch
Is there any way in Python you can only return the line if it's EXACTLY the same as the string? Do I have to .read() the file first?
Yes, there is!!!
file = open("file.txt")
content = file.read() # which is a str
lines = content.split('\n') # which is a list (containing every lines)
test_string = " pan "
positive_match = [l for l in lines if test_string in l]
This is a bit hacky since we avoid getting pancake for pan (for instance) but using spaces (and then, what about cases like ".....,pan "?). You should have a look at tokenization function. As pythonists, we hve one of the best library for this: nltk
(because, basically, you are reinventing the wheel)
I keep getting the error when running my code:
TypeError: object of type '_io.TextIOWrapper' has no len() function
How do I get it to open/read the file and run it through the loop?
Here's a link to the file that I am trying to import:
download link of the DNA sequence
def mostCommonSubstring():
dna = open("dna.txt", "r")
mink = 4
maxk = 9
count = 0
check = 0
answer = ""
k = mink
while k <= maxk:
for i in range(len(dna)-k+1):
sub = dna[i:i+k]
count = 0
for i in range(len(dna)-k+1):
if dna[i:i+k] == sub:
count = count + 1
if count >= check:
answer = sub
check = count
k=k+1
print(answer)
print(check)
The problem occurs due to the way you are opening the text file.
You should add dna = dna.read() to your code.
so your end code should look something like this:
def mostCommonSubstring():
dna = open("dna.txt", "r")
dna = dna.read()
mink = 4
maxk = 9
count = 0
check = 0
answer = ""
k = mink
while k <= maxk:
for i in range(len(dna)-k+1):
sub = dna[i:i+k]
count = 0
for i in range(len(dna)-k+1):
if dna[i:i+k] == sub:
count = count + 1
if count >= check:
answer = sub
check = count
k=k+1
print(answer)
print(check)
#tfabiant : I suggest this script to read and process a DNA sequence.
To run this code, in the terminal: python readfasta.py fastafile.fasta
import string, sys
##########I. To Load Fasta File##############
file = open(sys.argv[1])
rfile = file.readline()
seqs = {}
##########II. To Make fasta dictionary####
tnv = ""#temporal name value
while rfile != "":
if ">" in rfile:
tnv = string.strip(rfile)
seqs[tnv] = ""
else:
seqs[tnv] += string.strip(rfile)
rfile = file.readline()
##############III. To Make Counts########
count_what = ["A", "T", "C", "G", "ATG"]
for s in seqs:
name = s
seq = seqs[s]
print s # to print seq name if you have a multifasta file
for cw in count_what:
print cw, seq.count(cw)# to print counts by seq
I have a file where each line is ordered alphabetically. The file is 12Gb, which means I can't simply read it line by line. The data looks like this:
brown 0 1 0 1 2
fox 3 5 0 0 1
jumped 2 0 6 1 0
The words at the beginning of each line are unique. The word and the numbers on each line are separated by tabs. I want to be able to query the file for specific keywords. For example, if I query "fox", the program should return "fox 3 5 0 0 1".
It seems that a good candidate for this would be the bisect module: https://docs.python.org/3.0/library/bisect.html
I found a post which uses bisect to find out the line number of a keyword: How do I perform binary search on a text file to search a keyword in python?
This is what the code looks like:
import bisect
import os
class Query(object):
def __init__(self, query, index=5):
self.query = query
self.index = index
def __lt__(self, comparable):
return self.query < comparable[self.index:]
class FileSearcher(object):
def __init__(self, file_pointer, record_size=35):
self.file_pointer = file_pointer
self.file_pointer.seek(0, os.SEEK_END)
self.record_size = record_size + len(os.linesep)
self.num_bytes = self.file_pointer.tell()
self.file_size = (self.num_bytes // self.record_size)
def __len__(self):
return self.file_size
def __getitem__(self, item):
self.file_pointer.seek(item * self.record_size)
return self.file_pointer.read(self.record_size)
with open('myfile') as file_to_search:
query = 'fox\t' #token to query
wrapped_query = Query(query)
searchable_file = FileSearcher(file_to_search)
linepos = bisect.bisect(searchable_file, wrapped_query)
print "Located # line: ", linepos
#print content of line?
However, I can't figure out how to actually print the content of the line. I should at least add a read statement somewhere, but I don't know where.
Is it possible to print the content of the line with the bisect module?
If you want go with Python solution, you can do the following:
Read file by small chunks of MAX_LINE bytes, each time moving forward by fixed offset
That offset determines block size
For each such read, determine the key (first word in a line)
These keys serve as delimiters of blocks
Construct the list of such keys. The list would be sorted as keys are ordered
You may persist such list somewhere via pickle/json.dumps/...
When quering, find via bisect the index of a block where you key is located
Read that block entirely and find the key with data
Here is the example file bigfile:
abc 4
bar 2
baz 3
egg 6
foo 1
god 8
ham 5
sex 7
The code:
import os
from bisect import bisect
MAX_LINE = 7
BLOCK_SIZE = 10
def parse_chunks(filename):
size = os.path.getsize(filename)
chunks = []
with open(filename, 'rb') as file:
block = str(file.read(MAX_LINE*2))
first_line = block[:block.find('\n') + 1]
chunks.append(first_line.split()[0])
pos = BLOCK_SIZE
while pos < size:
file.seek(pos)
block = str(file.read(MAX_LINE*2))
first_eol = block.find('\n')
second_eol = block.find('\n', first_eol + 1)
if first_eol == -1 or second_eol == -1:
break
line = block[first_eol + 1:second_eol]
key = line.split()[0]
chunks.append(key)
pos += BLOCK_SIZE
return chunks
if __name__ == '__main__':
BLOCK_SIZE = 10
filename = 'bigfile'
chunks = parse_chunks(filename)
query = 'abc'
pos_before = bisect(chunks, query) - 1
with open(filename, 'rb') as file:
file.seek(pos_before*BLOCK_SIZE)
block = str(file.read(BLOCK_SIZE + MAX_LINE))
line_start = block.find(query)
line_end = block.find('\n', line_start + 1)
line = block[line_start:line_end]
print(line)
In this toy example I use block size of 10 bytes, in your case of 12GB file I'd suggest you to start with 1M.
The following recursive function should be able to narrow the search interval. I'm not sure that you can modify it so that it returns a match or None for no match.
def bisearch(f, word, i, j)
if (j-1)<1E6: return i,j
k = (i+j)/2
f.seek(k)
while k<j:
c = f.read(1)
k = k+1
if c == '\n': break
else:
# ??? no match ??? I'm not sure
w = []
while 1:
c = f.read(1)
if c == '\t': break
w.append(c)
w = "".join(w)
if w == word:
return k, k
if w < word:
return bisearch(f, word, k, j)
else:
return bisearch(f, word, i, k)
and here an example of usage
word = ...
f = open(...)
i,j = bisearch(f, word, 0, len_f)
f.seek(i)
if i==j:
line = f.readline()
else:
#################### EDIT ################
# OLD
# buffer = f.read(1E6)
# NEW
buffer = f.read(j-i)
lenw = len(word)
for line in buffer.split('\n'):
if line[:lenw] == word: break
else:
# no matches, SOS
result = process(line)
Try seeking to the line in question and using readline.
print "Located # line: ", linepos
file_to_search.seek(linepos)
line = file_to_search.readline()
This is assuming linepos is the position of the line, counted in bytes from the beginning of the file. If it's the position counted in line numbers, you'll need to multiply by the number of bytes per line before seeking.
print "Located # line: ", linepos
file_to_search.seek(linepos * searchable_file.record_size)
line = file_to_search.readline()