How to select sequences using substrings from several dictionaries - python

I have four dictionaries that contain substrings:
fw1={'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1_rc={'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw2={'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2_rc={'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
and a fasta file:
>MN00153:75:000H37WNG:1:11102:8051:1085
NNNNNNNNCCCFFFNNNNNGGGTTTNNNNNNN
>MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
>MN00153:75:000H37WNG:1:11102:16389:1090
NNNHHHFFFNNNNNNNOOOPPPNNNNNNN
>MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
I want to select sequences if two substrings are from specific dictionaries. The order of substrings is not important.
In other words, I want my code to select reads if one substring is from fw1 and another one is from fw2_rc dictionary OR one substring is from fw1_rc and another one is from fw2 dictionary.
This is my code; It selects correct reads but repeats outputs many times:
from Bio import SeqIO
# Running total of selected reads; incremented at the `count+=1` line below.
count=0
with open('file.fasta','r') as f:
# Walk every record of the FASTA file.
for record in SeqIO.parse(f,'fasta'):
# NOTE(review): presumably each of the four loops below is nested inside the previous one,
# so the two checks run once per (fw1, fw1_rc, fw2, fw2_rc) combination — confirm.
for k1,Fw1 in fw1.items():
for k2,Fw1_rc in fw1_rc.items():
for k3,Fw2 in fw2.items():
for k4,Fw2_rc in fw2_rc.items():
# Case 1: one substring from fw1 and one from fw2_rc.
# This test only reads Fw1 and Fw2_rc, so under the nesting assumed above it gives the
# same answer for every Fw1_rc/Fw2 combination and the same hit is printed repeatedly.
if Fw1 in record.seq and Fw2_rc in record.seq:
# The + 1 turns the index returned by find() into a 1-based position.
pos1 = record.seq.find(Fw1) + 1
pos2 = record.seq.find(Fw2_rc) + 1
# `distance` is assigned here but not read anywhere in the lines shown.
if pos1 < pos2:
distance = pos2 - pos1
if pos1 > pos2:
distance = pos1 - pos2
print("sample_2")
print(record.id)
print(record.seq)
print(str(k1) + " : " + str(Fw1) + " - The position is " + str(pos1))
print(str(k4) + " : " + str(Fw2_rc) + " - The position is " + str(pos2))
print('\n')
# Case 2: one substring from fw1_rc and one from fw2.
# This test only reads Fw1_rc and Fw2, so the same repetition applies.
if Fw1_rc in record.seq and Fw2 in record.seq:
pos1 = record.seq.find(Fw1_rc) + 1
pos2 = record.seq.find(Fw2) + 1
if pos1 < pos2:
distance = pos2 - pos1
if pos1 > pos2:
distance = pos1 - pos2
# Unlike case 1, no "sample_2" line is printed before the record id.
print(record.id)
print(record.seq)
print(str(k2) + " : " + str(Fw1_rc) + " - The position is " + str(pos1))
print(str(k3) + " : " + str(Fw2) + " - The position is " + str(pos2))
print('\n')
# NOTE(review): the nesting level of this increment is not visible here; it decides
# whether reads or loop iterations are counted — confirm.
count+=1
print("The total number of reads that have both 21nt protein-specific sequences is " + str(count))
The desired output should be:
sample_2
MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
PLAU_fw1 : CCCFFF - The position is 7
PLAU_fw2_rc : rrreee - The position is 18
sample_2
MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
PLAU_fw1_rc : cccfff - The position is 1
MIF_fw2 : KKKZZZ - The position is 14
The total number of reads that have both 21nt protein-specific sequences is 2

I did not add the count. I also inverted the key/value pairs of each dictionary so that the substring is the key and the name is the value; this allows looking up a name from the substring found in a read, which your desired output requires.
Also, I wasn't able to use Bio, so this version just reads from a text file, but the code could easily be adapted to use the Bio record's seq and id.
import re

# Name -> substring tables, inverted right away so that a substring captured
# from a read can be mapped back to its name.
fw1 = {'PLAU_fw1': 'CCCFFF', 'EPCAM_fw1': 'GGGTTT', 'MIF_fw1': 'HHHFFF'}
fw1 = dict(zip(fw1.values(), fw1.keys()))
fw1_rc = {'PLAU_fw1_rc': 'cccfff', 'EPCAM_fw1_rc': 'gggttt', 'MIF_fw1_rc': 'hhhfff'}
fw1_rc = dict(zip(fw1_rc.values(), fw1_rc.keys()))
fw2 = {'PLAU_fw2': 'RRREEE', 'EPCAM_fw2': 'OOOPPP', 'MIF_fw2': 'KKKZZZ'}
fw2 = dict(zip(fw2.values(), fw2.keys()))
fw2_rc = {'PLAU_fw2_rc': 'rrreee', 'EPCAM_fw2_rc': 'oooppp', 'MIF_fw2_rc': 'kkkzzz'}
fw2_rc = dict(zip(fw2_rc.values(), fw2_rc.keys()))


def _alternation(substrings):
    """Return a capturing group that matches any of *substrings* literally."""
    # re.escape keeps a substring containing regex metacharacters from
    # changing the meaning of the pattern.
    return '(' + '|'.join(re.escape(text) for text in substrings) + ')'


one_upcase = _alternation(fw1)
one_locase = _alternation(fw1_rc)
two_upcase = _alternation(fw2)
two_locase = _alternation(fw2_rc)

# One rule per accepted combination: (pattern, names for group 1, names for
# group 2). The two lookaheads make the order of the substrings in the read
# irrelevant. Patterns are compiled once instead of being rebuilt per line.
pair_rules = [
    (re.compile(f'(?=.*{one_upcase})(?=.*{two_locase})'), fw1, fw2_rc),
    (re.compile(f'(?=.*{one_locase})(?=.*{two_upcase})'), fw1_rc, fw2),
]

with open('f1.txt', 'r') as f:
    _id = ''
    for line in f:
        line = line.rstrip()
        if line.startswith('>'):
            # Header line: remember the id for the sequence that follows.
            _id = line[1:]
            continue
        for pattern, first_names, second_names in pair_rules:
            match = pattern.search(line)
            if match is None:
                continue
            print(_id)
            print(line)
            # Pair each captured substring with the table of its own group
            # rather than guessing the table from the letter case.
            for item, names in zip(match.groups(), (first_names, second_names)):
                idx = 1 + line.index(item)  # 1-based position in the read
                print(names[item], ': ', end='')
                print(item, 'The position is', idx)
            print()
            break  # a read is reported by the first rule that matches only
The output matches your output:
MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
PLAU_fw1 : CCCFFF The position is 7
PLAU_fw2_rc : rrreee The position is 18
MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
PLAU_fw1_rc : cccfff The position is 1
MIF_fw2 : KKKZZZ The position is 14
UPDATE
Here is the solution not using regular expressions
(Also, I installed Bio to use it for this solution)
from Bio import SeqIO

# Name -> substring tables, inverted right away: the fasta substring becomes
# the key and the original name becomes the value, so a substring found in a
# read can be mapped back to its name.
fw1 = {'PLAU_fw1': 'CCCFFF', 'EPCAM_fw1': 'GGGTTT', 'MIF_fw1': 'HHHFFF'}
fw1 = dict(zip(fw1.values(), fw1.keys()))
fw1_rc = {'PLAU_fw1_rc': 'cccfff', 'EPCAM_fw1_rc': 'gggttt', 'MIF_fw1_rc': 'hhhfff'}
fw1_rc = dict(zip(fw1_rc.values(), fw1_rc.keys()))
fw2 = {'PLAU_fw2': 'RRREEE', 'EPCAM_fw2': 'OOOPPP', 'MIF_fw2': 'KKKZZZ'}
fw2 = dict(zip(fw2.values(), fw2.keys()))
fw2_rc = {'PLAU_fw2_rc': 'rrreee', 'EPCAM_fw2_rc': 'oooppp', 'MIF_fw2_rc': 'kkkzzz'}
fw2_rc = dict(zip(fw2_rc.values(), fw2_rc.keys()))

# store fasta substrings in lists
one_upcase = list(fw1)
one_locase = list(fw1_rc)
two_upcase = list(fw2)
two_locase = list(fw2_rc)


def _first_pair(seq, first_tokens, second_tokens):
    """Return the first (a, b) with a from first_tokens and b from second_tokens both in seq.

    Tokens are tried in list order, first_tokens in the outer position.
    Returns None when no such pair exists.
    """
    for first in first_tokens:
        if first not in seq:
            continue  # no need to try any partner for an absent token
        for second in second_tokens:
            if second in seq:
                return first, second
    return None


# Accepted combinations, tried in this order:
# (tokens 1, names 1, tokens 2, names 2).
combinations = (
    (one_upcase, fw1, two_locase, fw2_rc),
    (one_locase, fw1_rc, two_upcase, fw2),
)

with open('fasta.txt', 'r') as f:
    for record in SeqIO.parse(f, 'fasta'):
        _id = record.id
        seq = record.seq
        for first_tokens, first_names, second_tokens, second_names in combinations:
            pair = _first_pair(seq, first_tokens, second_tokens)
            if pair is None:
                continue
            print(_id)
            print(seq)
            # Positions are reported 1-based.
            for token, names in zip(pair, (first_names, second_names)):
                print(names[token], ':', token,
                      'in position', str(1 + seq.index(token)))
            print()
            break  # each record is reported at most once
I didn't have a count here as I didn't know what it is you wanted to count.
I inverted the dictionaries (I think they were built the wrong way round): the values (the fasta substrings) became the keys and the original keys became the values. This makes the name lookups in my solution possible:
print(fw1_rc[token_fw1_rc] etc and print(fw2_rc[token_fw2_rc] etc
(Two of the 4 times this lookup was done).
Also, a note, I said token here to mean the fasta substring.

Related

how can i get rid of quotation marks in python print

I need to print the words in my list separated by spaces, but my print produces the following:
['Salt']
So it prints it as list with quotations :(
sry code is in Finnish :S
def poistaTuote(Tuotteet):
# Ask for the 1-based number of an entry in Tuotteet, delete that entry, print the
# remaining list and return it. All user-facing messages are in Finnish.
print("Ostoslistassasi on " + str(len(Tuotteet)) + " tuotetta.")
Index = int(input("Anna poistettavan tuotteen järjestysnumero: "))
# Convert the 1-based number typed by the user into a 0-based list index.
Index = (Index)-1
for i in range(len(Tuotteet)):
# NOTE(review): this compares Index with the `.index` attribute of each element. If the
# elements are strings (as the sample output suggests) that attribute is the str.index
# method, so the comparison with an int is never true — confirm the intent.
if (Index == Tuotteet[i].index):
Index = i
# Only an input of 0 ends up as -1 here; other out-of-range numbers are not caught.
if (Index == -1):
print("Indeksiä " + str(Index) + " ei löydy.")
print("Tuotteiden järjestysnumerot alkavat numerosta 1.")
else:
del Tuotteet[Index]
print("\nOstoslistasi sisältää seuraavat tuotteet:")
# Printing the list object itself shows its repr, i.e. brackets, quotes and commas.
print(Tuotteet, end = " ")
return Tuotteet
Current output:
Ostoslistasi sisältää seuraavat tuotteet:
['Leipä', ' Leivän päälliset']
Desired output
Leipä Leivän päälliset
Spread the list elements as separate arguments to print:
print(*Tuotteet)
Or use str.join:
print(" ".join(Tuotteet))

Identifying spaces between commas

I need to determine whether there is a space between a number and a comma; if there is, that number is invalid. So if a number has more or fewer than 2 decimal places and/or there is whitespace between it and the commas, it is INVALID, but if it has no whitespace between the commas and has exactly 2 decimal places, it is a VALID number. That's why the first number in Line 1 is VALID
There's two methods, I prefer to work on method 2 but I thought if I put two methods it might help any of you to add on
#-----------Method 1------------------------------------------
# Classifies every comma-separated field of each line in file.txt and writes the
# labels both to stdout and to output2.txt.
res = 0
outfile = "output2.txt"
baconFile = open(outfile,"wt")
index = 0
invalid_string = "INVALID"
valid_string = "VALID"
with open('file.txt') as file:
for line in file:
carrera = ''
# 1-based line number used in the "Line N: " prefix.
index = index + 1
print("Line {}: ".format(index), end='')
baconFile.write("Line {}: ".format(index))
# strip() removes whitespace at both ends of the line, so a trailing space after
# the last number is already gone before the fields are examined.
number_list = line.strip().split(',')
for number in number_list:
# The text after the last '.' must be exactly two characters long. For a field
# without any '.', split('.')[-1] is the whole field.
if len(number.split('.')[-1]) == 2:
#res += 1
## print("VALID")
carrera = valid_string
if len(number.split('.')[-1]) != 2:
#res += 1
carrera = invalid_string
# NOTE(review): len(...) is an int and is compared with the string " ", so this
# condition is never true and whitespace is never detected here.
if len(number.split(',')[-1]) == " ": #checking for whitespace
carrera = invalid_string
print (carrera, end=' ')
baconFile.write(carrera + " ")
print('\n', end='')
baconFile.write('\n')
baconFile.close()
#-----------Method 2------------------------------------------
# Same classification as Method 1, but the labels of a line are accumulated in
# the string `o`. In the lines shown, `o` is never printed or written to baconFile.
res = 0
outfile = "output2.txt"
baconFile = open(outfile,"wt")
index = 0
invalid_string = "INVALID"
valid_string = "VALID"
with open('file.txt') as file:
for line in file:
# 1-based line number used in the "Line N: " prefix.
index = index + 1
o = "Line {}: ".format(index)
# strip() removes whitespace at both ends of the line before splitting.
number_list = line.strip().split(',')
for x in number_list:
# The text after the last '.' must be exactly two characters long.
if len(x.split('.')[-1]) == 2:
o += valid_string + " "
if len(x.split('.')[-1]) != 2:
o += invalid_string + " "
# NOTE(review): an int is compared with the string " ", so this is never true;
# the branch would also append VALID rather than INVALID — confirm the intent.
if len(x.split(',')[-1]) == " ":
o += valid_string + " "
Here's my list of numbers in Text.file:
1,1.02, 123.0005
1.02, 1.02 , 1.02
Expected:
Line 1: INVALID VALID INVALID
Line 2: VALID INVALID INVALID (since there's spaces between the last number that's why it is INVALID)
ACTUAL:
Line 1: INVALID VALID INVALID
Line 2: VALID INVALID VALID
You can split each line on "," and decide whether each item is valid or invalid based on whether it starts with whitespace
def is_valid_number(item):
    """Return True when *item* is a number with exactly two decimal digits.

    The item must not carry any leading or trailing whitespace and must
    contain exactly one '.', followed by two digits.
    """
    # partition splits at the first '.', so a second '.' ends up in
    # `fraction` and fails the length/digit checks below.
    _, dot, fraction = item.partition('.')
    return (
        item == item.strip()
        and dot == '.'
        and len(fraction) == 2
        and fraction.isdigit()
    )


#Open the files
with open('file.txt') as fp:
    #Extract out non-empty lines from file
    # The line terminator is removed here; otherwise it would be counted as
    # part of the last number and make that number look invalid.
    lines = [line.rstrip('\n') for line in fp if line.strip()]

#Iterate over the lines
for idx, line in enumerate(lines):
    res = ['VALID' if is_valid_number(item) else 'INVALID' for item in line.split(',')]
    #Print the result
    print("Line {}: {}".format(idx+1, ' '.join(res)))
The output will be
Line 1: INVALID VALID INVALID
Line 2: VALID INVALID INVALID
try this:
line = "1,1.02, 123.0005"
reslt = line.split(",")
# Every field contributes "INVALID " when it contains a space and "VALID "
# otherwise; the result keeps the single leading space of the original.
Res = " " + "".join("INVALID " if " " in field else "VALID " for field in reslt)
print("line1:" + Res)
READ from file :
with open('file.txt') as f:
    # enumerate supplies the 1-based line number, so no counter has to be
    # initialised and incremented by hand.
    for nbline, line in enumerate(f, start=1):
        print(line)
        reslt = line.split(",")
        # A field is INVALID as soon as it contains a space.
        Res = " " + "".join("INVALID " if " " in i else "VALID " for i in reslt)
        print("line {}:{}".format(nbline, Res))
output:
line1: VALID VALID INVALID
A list comprehension based on splitting on commas, and a little string trickery would be much simpler:
line = "1,1.02, 123.0005"
# Label each comma-separated field: INVALID if it contains a space, else VALID.
result = " ".join(
    "INVALID" if " " in s else "VALID"
    for s in line.split(",")
)
print(result)  # VALID VALID INVALID
With decimal.Decimal object, you can retrieve the exponent, which somehow tells you the number of decimal places (see docs):
import decimal


def _has_two_decimals(text):
    """Return True when *text* is a number with exactly two decimal places.

    Leading or trailing whitespace makes the field invalid, and so does text
    that Decimal cannot parse (an empty field included).
    """
    if text != text.strip():
        return False
    try:
        # For a finite Decimal the exponent is minus the number of decimal
        # places, e.g. Decimal('1.02') has exponent -2.
        return decimal.Decimal(text).as_tuple().exponent == -2
    except decimal.InvalidOperation:
        return False


# The line terminator is dropped first so it is not mistaken for stray
# whitespace around the last number.
o += " ".join('VALID' if _has_two_decimals(x) else 'INVALID' for x in line.rstrip('\n').split(','))
Output
#with line = "1,1.02, 123.0005"
'Line 1: INVALID VALID INVALID'
#with line = "1.02, 1.02 , 1.02"
'Line 2: VALID INVALID INVALID'

dict to list, and compare lists python

I have written a function that counts how many times each word is used in a file, i.e. the word frequency. Right now the function can calculate the total number of words and show me the seven most common words and how many times they are used. Now I want to compare my first file, where I have analyzed the word frequency, with another file that contains the most common words in the English language, to see whether any of the words match.
What I have come up to is to make lists of the two files and then compare them with each other. But the code I wrote for this doesn't give me any output, any idea on how I can solve this?
def CountWords():
# Ask for a file name, count the words of alice-ch1.txt, print the seven highest
# counts and then try to compare the words with those in common-words.txt.
filename = input('What is the name of the textfile you want to open?: ')
# NOTE(review): `"alice-ch1.txt"` and `" "` are non-empty strings and therefore truthy,
# so this condition is true for every input and the final `else` can never run.
if filename == "alice" or "alice-ch1.txt" or " ":
file = open("alice-ch1.txt","r")
print('You want to open alice-ch1.txt')
# word -> number of occurrences, case-sensitive at this point.
wordcount = {}
for word in file.read().split():
if word not in wordcount:
wordcount[word] = 1
else:
wordcount[word] += 1
# Keys are lower-cased after counting: words that differ only in case collapse into
# one key and the later count replaces the earlier one instead of being added.
wordcount = {k.lower(): v for k, v in wordcount.items() }
print (wordcount)
# `sum` shadows the built-in of the same name inside this function.
sum = 0
for val in wordcount.values():
sum += val
print ('The total amount of words in Alice adventures in wonderland: ' + str(sum))
# Only the counts are sorted; the matching words are looked up again below.
sortList = sorted(wordcount.values(), reverse = True)
most_freq_7 = sortList[0:7]
#print (most_freq_7)
print ('Totoro says: The 7 most common words in Alice Adventures in Wonderland:')
# Each line finds the first word whose count equals the given value, so two words
# with the same count would print the same word twice.
print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[0])] + " " + str(most_freq_7[0]))
print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[1])] + " " + str(most_freq_7[1]))
print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[2])] + " " + str(most_freq_7[2]))
print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[3])] + " " + str(most_freq_7[3]))
print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[4])] + " " + str(most_freq_7[4]))
print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[5])] + " " + str(most_freq_7[5]))
print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[6])] + " " + str(most_freq_7[6]))
# Read the reference list, one entry per line with the newline removed.
file_common = open("common-words.txt", "r")
commonwords = []
contents = file_common.readlines()
for i in range(len(contents)):
commonwords.append(contents[i].strip('\n'))
print(commonwords)
#From here's the code were I need to find out how to compare the lists:
alice_keys = wordcount.keys()
# `result` holds the common words that also occur in the analysed text.
result = set(filter(set(alice_keys).__contains__, commonwords))
newlist = list()
# NOTE(review): this collects the words that are NOT in `result`, i.e. the words that
# are not common, although the message below calls them the similar words.
for elm in alice_keys:
if elm not in result:
newlist.append(elm)
# NOTE(review): the nesting level of this print is not visible here — confirm it is
# reached after the loop.
print('Here are the similar words: ' + str(newlist)) #Why doesn't show?
else:
print ('I am sorry, that filename does not exist. Please try again.')
I'm not in front of an interpreter so my code might be slightly off. But try something more like this.
from collections import Counter

# Count whole words: split() turns the text into whitespace-separated words
# before counting (counting the raw string would count single characters).
with open("some_file_with_words") as f_file:
    counter = Counter(f_file.read().split())
top_seven = counter.most_common(7)

# A set gives constant-time membership tests for the lookups below.
with open("commonwords") as f_common:
    common_words = set(f_common.read().split())

# Report every one of the seven most frequent words that is also a common word.
for word, count in top_seven:
    if word in common_words:
        print("your word " + word + " is in the most common words! It appeared " + str(count) + " times!")

Python - how to print amount of numbers, periods, and commas in file

def showCounts(fileName):
# Print the number of lines, words, numbers, commas and dots found in fileName.
lineCount = 0
wordCount = 0
numCount = 0
# comCount and dotCount are never incremented: the code that would do so is
# commented out below, so both are always printed as 0.
comCount = 0
dotCount = 0
with open(fileName, 'r') as f:
for line in f:
# Words are the whitespace-separated pieces of the line.
words = line.split()
lineCount += 1
wordCount += len(words)
for word in words:
# ###text = word.translate(string.punctuation)
# NOTE(review): `string` is not imported in the lines shown — confirm.
exclude = set(string.punctuation)
# NOTE(review): `text` is reset to "" and the join then iterates over that empty
# string instead of `word`, so `text` is always "" when it reaches int().
text = ""
text = ''.join(ch for ch in text if ch not in exclude)
try:
# int("") raises ValueError, which is swallowed below, so numCount stays 0.
# The condition itself is true for every integer.
if int(text) >= 0 or int(text) < 0:
numCount += 1
# elif text == ",":
# comCount += 1
# elif text == ".":
# dotCount += 1
except ValueError:
pass
print("Line count: " + str(lineCount))
print("Word count: " + str(wordCount))
print("Number count: " + str(numCount))
print("Comma count: " + str(comCount))
print("Dot count: " + str(dotCount) + "\n")
Basically it will show the number of lines and the number of words, but I can't get it to show the number of numbers, commas, and dots. I have it read a file that the user enters and then show the amount of lines and words, but for some reason it says 0 for numbers commas and dots. I commented out the part where it gave me trouble. If i remove the comma then i just get an error. thanks guys
This code loops over every character in each line, and adds 1 to its variable:
fileName = 'test.txt'
numCount = dotCount = commaCount = lineCount = wordCount = 0

with open(fileName, 'r') as f:
    # enumerate keeps lineCount equal to the number of lines read so far;
    # it stays 0 for an empty file.
    for lineCount, line in enumerate(f, start=1):
        wordCount += len(line.split())
        # "Number count" is the number of digit characters, not of numbers.
        numCount += sum(1 for char in line if char.isdigit())
        dotCount += line.count('.')
        commaCount += line.count(',')

print("Number count: " + str(numCount))
print("Comma count: " + str(commaCount))
print("Dot count: " + str(dotCount))
print("Line count: " + str(lineCount))
print("Word count: " + str(wordCount))
Testing it out:
test.txt:
Hello, my name is B.o.b. I like biking, swimming, and running.
I am 125 years old, and I was 124 years old 1 year ago.
Regards,
B.o.b
Running:
bash-3.2$ python count.py
Number count: 7
Comma count: 5
Dot count: 7
Line count: 6
Word count: 27
bash-3.2$
Everything makes sense here except the lineCount. The reason it is 6 is the newlines: there are blank lines in the file, and my editor (nano) adds a newline to the end of any file by default. So just imagine the text file to be this:
>>> x = open('test.txt').read()
>>> x
'Hello, my name is B.o.b. I like biking, swimming, and running.\n\nI am 125 years old, and I was 124 years old 1 year ago.\n\nRegards,\nB.o.b \n'
>>> x.count('\n')
6
>>>
Hope this helps!
For the punctuations, why not just do:
def showCounts(fileName):
...
...
with open(fileName, 'r') as fl:
f = fl.read()
comCount = f.count(',')
dotCount = f.count('.')
You could use the Counter class to take care of it for you:
from collections import Counter

# Read the whole file once; surrounding whitespace (including the final
# newline) is stripped before anything is counted.
with open(fileName, 'r') as f:
    data = f.read().strip()

# Per-character frequencies of the stripped text.
counts = Counter(data)
# One more line than there are newline characters left in the text.
lines = data.count('\n') + 1
words = len(data.split())
# Total number of digit characters.
numbers = sum(counts[char] for char in counts if char.isdigit())

print("Line count: {}".format(lines))
print("Word count: {}".format(words))
print("Number count: {}".format(numbers))
print("Comma count: {}".format(counts[',']))
print("Dot count: {}".format(counts['.']))

Working with nested dictionaries and formatting for display

I have a partial answer from here Construct a tree from list os file paths (Python) - Performance dependent
My specific problem requires me to go from
this
dir/file 10
dir/dir2/file2 20
dir/dir2/file3 10
dir/file3 10
dir3/file4 10
dir3/file5 10
To
dir/ **50**
dir2/ **30**
file2
file3
file
file3
dir3/ **20**
file4
file5
Basically the numbers at the end are the file sizes and
I have been trying to figure out how to display the size of all the files to the parent directory
Edit:
# Matches an entry of the form "<text><TAB><digits>"; group 2 is the run of digits.
r = re.compile(r'(.+\t)(\d+)')
def prettify(d, indent=0):
# Print the tree held in dict d, adding up the sizes of the entries stored under
# FILE_MARKER. Python 2 syntax (print statement, dict.iteritems).
# NOTE(review): FILE_MARKER and format_size are not defined in the lines shown.
for key, value in d.iteritems():
# Size total, reset for every key of d.
ss = 0
if key == FILE_MARKER:
if value:
for each in value:
# Add the trailing number of the entry to the running total.
mm = r.match(each)
ss += int(mm.group(2))
print ' ' * indent + each
# The surrounding *** is emphasis markup from the post, not Python syntax.
***print ' ' * indent + format_size(ss)***
else:
print ' ' * indent + str(key)
if isinstance(value, dict):
# NOTE(review): addSizes is not defined in the lines shown; presumably this was meant
# to be the recursive call to prettify — confirm.
addSizes(value, indent+1)
else:
print ' ' * (indent+1) + str(value)
This is mac's answer from the above link which i edited to use regExp
Solutions that occurred to me led me to create a new dict or adding an inner function.
I have lost my whole day and wished i had asked for help earlier in the day.
Please help.
Not the most elegant thing in the world, but this should get you where you need to be. You'll need to change the tree creation function to deal with whatever form of input you are getting. Once the tree is generated it's just using a recursive tree traversal to form the output.
import re
input_dirs = """dir/file 10
dir/dir2/file2 20
dir/dir2/file3 10
dir/file 10
dir3/file4 10
dir3/file5 10
dir/dir2/dir4/file2 10"""
def create_file_tree(input_string):
    """Build a nested dict of directories and file sizes from a path listing.

    Each non-blank line of *input_string* has the form ``<path> <size>``,
    for example ``dir/dir2/file2 20``. Every directory on the path becomes a
    nested dict and the final path component maps to the integer size.
    Blank lines are skipped.

    Args:
        input_string: newline-separated ``<path> <size>`` entries.

    Returns:
        dict: the directory tree; an empty dict for empty input.

    Raises:
        ValueError: if a size is not an integer or a line holds no path.
    """
    dir_dict = {}
    for file_path in input_string.split('\n'):
        # '/' and whitespace both act as field separators.
        path_list = re.sub('/', ' ', file_path).split()
        if not path_list:
            continue  # blank line: nothing to record
        *parts, size_text = path_list
        size = int(size_text)
        if not parts:
            # A size without any path cannot be placed in the tree.
            raise ValueError('missing path in line: %r' % file_path)
        # Walk (and create on demand) the directories leading to the file.
        path_dict = dir_dict
        for directory in parts[:-1]:
            path_dict = path_dict.setdefault(directory, {})
        path_dict[parts[-1]] = size
    return dir_dict
def pretty_file_tree(file_tree):
    """Print *file_tree* as an indented listing with per-directory totals.

    Args:
        file_tree: nested mapping in which a dict (or dict subclass) value is
            a directory and any other value is a file size.

    Entries are listed in sorted key order, one per line, indented by one
    space per nesting level. A directory line also shows the summed size of
    everything below it as ``**total**``. Returns None.
    """
    def traverse(sub_dict, indent=0, total=0):
        # Returns (total size, rendered text) for one directory level.
        indent += 1
        pieces = []
        for key in sorted(sub_dict):
            entry = sub_dict[key]
            # isinstance also accepts dict subclasses such as OrderedDict.
            if isinstance(entry, dict):
                sub_total, sub_text = traverse(entry, indent)
                total += sub_total
                pieces.append(
                    ' ' * indent + key + ' **' + str(sub_total) + '**\n' + sub_text
                )
            else:
                pieces.append(' ' * indent + key + '\n')
                total += entry
        return total, ''.join(pieces)

    output_string = traverse(file_tree)
    print(output_string[1])
pretty_file_tree(create_file_tree(input_dirs))
Sorry it's not following the code you posted, but i'd begun to produce this before the edit...
As you process the input build a string with place holders (%d) for the numbers, then print out the string.

Categories

Resources