remove items from list in python - python

>>> n
['UUU', 'F', 'CUU', 'L', 'AUU', 'I', 'GUU', 'V', 'UUC', 'F', 'CUC', 'L', 'AUC', 'I', 'GUC', 'V', 'UUA', 'L', 'CUA', 'L', 'AUA', 'I', 'GUA', 'V', 'UUG', 'L', 'CUG', 'L', 'AUG', 'M', 'GUG', 'V', 'UCU', 'S', 'CCU', 'P', 'ACU', 'T', 'GCU', 'A', 'UCC', 'S', 'CCC', 'P', 'ACC', 'T', 'GCC', 'A', 'UCA', 'S', 'CCA', 'P', 'ACA', 'T', 'GCA', 'A', 'UCG', 'S', 'CCG', 'P', 'ACG', 'T', 'GCG', 'A', 'UAU', 'Y', 'CAU', 'H', 'AAU', 'N', 'GAU', 'D', 'UAC', 'Y', 'CAC', 'H', 'AAC', 'N', 'GAC', 'D', 'UAA', 'Stop', 'CAA', 'Q', 'AAA', 'K', 'GAA', 'E', 'UAG', 'Stop', 'CAG', 'Q', 'AAG', 'K', 'GAG', 'E', 'UGU', 'C', 'CGU', 'R', 'AGU', 'S', 'GGU', 'G', 'UGC', 'C', 'CGC', 'R', '', '', '', '', '', 'AGC', 'S', '', '', '', '', '', 'GGC', 'G', 'UGA', 'Stop', '', '', 'CGA', 'R', '', '', '', '', '', 'AGA', 'R', '', '', '', '', '', 'GGA', 'G', 'UGG', 'W', '', '', '', '', '', 'CGG', 'R', '', '', '', '', '', 'AGG', 'R', '', '', '', '', '', 'GGG', 'G', '']
>>> for item in n:
... if item=='':
... n.remove(item)
...
>>> n
['UUU', 'F', 'CUU', 'L', 'AUU', 'I', 'GUU', 'V', 'UUC', 'F', 'CUC', 'L', 'AUC', 'I', 'GUC', 'V', 'UUA', 'L', 'CUA', 'L', 'AUA', 'I', 'GUA', 'V', 'UUG', 'L', 'CUG', 'L', 'AUG', 'M', 'GUG', 'V', 'UCU', 'S', 'CCU', 'P', 'ACU', 'T', 'GCU', 'A', 'UCC', 'S', 'CCC', 'P', 'ACC', 'T', 'GCC', 'A', 'UCA', 'S', 'CCA', 'P', 'ACA', 'T', 'GCA', 'A', 'UCG', 'S', 'CCG', 'P', 'ACG', 'T', 'GCG', 'A', 'UAU', 'Y', 'CAU', 'H', 'AAU', 'N', 'GAU', 'D', 'UAC', 'Y', 'CAC', 'H', 'AAC', 'N', 'GAC', 'D', 'UAA', 'Stop', 'CAA', 'Q', 'AAA', 'K', 'GAA', 'E', 'UAG', 'Stop', 'CAG', 'Q', 'AAG', 'K', 'GAG', 'E', 'UGU', 'C', 'CGU', 'R', 'AGU', 'S', 'GGU', 'G', 'UGC', 'C', 'CGC', 'R', 'AGC', 'S', 'GGC', 'G', 'UGA', 'Stop', 'CGA', 'R', 'AGA', 'R', 'GGA', 'G', 'UGG', 'W', '', '', '', '', 'CGG', 'R', '', '', '', '', '', 'AGG', 'R', '', '', '', '', '', 'GGG', 'G', '']
How to explain that iterative "remove" operation can't remove all the '' elements in the list?

You can also use a list comprehension:
>>> n = ['UUU', 'F', 'CUU', 'L', 'AUU', 'I', 'GUU', 'V', 'UUC', 'F', 'CUC', 'L', 'AUC', 'I', 'GUC', 'V', 'UUA', 'L', 'CUA', 'L', 'AUA', 'I', 'GUA', 'V', 'UUG', 'L', 'CUG', 'L', 'AUG', 'M', 'GUG', 'V', 'UCU', 'S', 'CCU', 'P', 'ACU', 'T', 'GCU', 'A', 'UCC', 'S', 'CCC', 'P', 'ACC', 'T', 'GCC', 'A', 'UCA', 'S', 'CCA', 'P', 'ACA', 'T', 'GCA', 'A', 'UCG', 'S', 'CCG', 'P', 'ACG', 'T', 'GCG', 'A', 'UAU', 'Y', 'CAU', 'H', 'AAU', 'N', 'GAU', 'D', 'UAC', 'Y', 'CAC', 'H', 'AAC', 'N', 'GAC', 'D', 'UAA', 'Stop', 'CAA', 'Q', 'AAA', 'K', 'GAA', 'E', 'UAG', 'Stop', 'CAG', 'Q', 'AAG', 'K', 'GAG', 'E', 'UGU', 'C', 'CGU', 'R', 'AGU', 'S', 'GGU', 'G', 'UGC', 'C', 'CGC', 'R', '', '', '', '', '', 'AGC', 'S', '', '', '', '', '', 'GGC', 'G', 'UGA', 'Stop', '', '', 'CGA', 'R', '', '', '', '', '', 'AGA', 'R', '', '', '', '', '', 'GGA', 'G', 'UGG', 'W', '', '', '', '', '', 'CGG', 'R', '', '', '', '', '', 'AGG', 'R', '', '', '', '', '', 'GGG', 'G', '']
>>> n = [x for x in n if x != ''] # or more simply [x for x in n if x]
>>> print n
['UUU', 'F', 'CUU', 'L', 'AUU', 'I', 'GUU', 'V', 'UUC', 'F', 'CUC', 'L', 'AUC', 'I', 'GUC', 'V', 'UUA', 'L', 'CUA', 'L', 'AUA', 'I', 'GUA', 'V', 'UUG', 'L', 'CUG', 'L', 'AUG', 'M', 'GUG', 'V', 'UCU', 'S', 'CCU', 'P', 'ACU', 'T', 'GCU', 'A', 'UCC', 'S', 'CCC', 'P', 'ACC', 'T', 'GCC', 'A', 'UCA', 'S', 'CCA', 'P', 'ACA', 'T', 'GCA', 'A', 'UCG', 'S', 'CCG', 'P', 'ACG', 'T', 'GCG', 'A', 'UAU', 'Y', 'CAU', 'H', 'AAU', 'N', 'GAU', 'D', 'UAC', 'Y', 'CAC', 'H', 'AAC', 'N', 'GAC', 'D', 'UAA', 'Stop', 'CAA', 'Q', 'AAA', 'K', 'GAA', 'E', 'UAG', 'Stop', 'CAG', 'Q', 'AAG', 'K', 'GAG', 'E', 'UGU', 'C', 'CGU', 'R', 'AGU', 'S', 'GGU', 'G', 'UGC', 'C', 'CGC', 'R', 'AGC', 'S', 'GGC', 'G', 'UGA', 'Stop', 'CGA', 'R', 'AGA', 'R', 'GGA', 'G', 'UGG', 'W', 'CGG', 'R', 'AGG', 'R', 'GGG', 'G']

To expand a bit on why you shouldn't change a list this way as you're iterating over it, consider the following simplification of what you're doing:
>>> a = range(10)
>>> a
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> for item in a:
... print item
... a.remove(item)
...
0
2
4
6
8
>>> a
[1, 3, 5, 7, 9]
As this illustrates, by changing the list, you're also changing which elements you act on in successive iterations. That's almost never a good idea and appears not to be what you're trying to accomplish.
Instead, how about a list comprehension:
[item for item in mylist if item != '']

Another option is
while '' in n:
n.remove('')

Instead of trying to remove the blanks, try instead to make a new array which contains only the non-blank items
a = ['UUU', 'F', 'CUU', 'L', 'AUU', 'I', 'GUU', 'V', 'UUC', 'F', 'CUC', 'L', 'AUC', 'I', 'GUC', 'V', 'UUA', 'L', 'CUA', 'L', 'AUA', 'I', 'GUA', 'V', 'UUG', 'L', 'CUG', 'L', 'AUG', 'M', 'GUG', 'V', 'UCU', 'S', 'CCU', 'P', 'ACU', 'T', 'GCU', 'A', 'UCC', 'S', 'CCC', 'P', 'ACC', 'T', 'GCC', 'A', 'UCA', 'S', 'CCA', 'P', 'ACA', 'T', 'GCA', 'A', 'UCG', 'S', 'CCG', 'P', 'ACG', 'T', 'GCG', 'A', 'UAU', 'Y', 'CAU', 'H', 'AAU', 'N', 'GAU', 'D', 'UAC', 'Y', 'CAC', 'H', 'AAC', 'N', 'GAC', 'D', 'UAA', 'Stop', 'CAA', 'Q', 'AAA', 'K', 'GAA', 'E', 'UAG', 'Stop', 'CAG', 'Q', 'AAG', 'K', 'GAG', 'E', 'UGU', 'C', 'CGU', 'R', 'AGU', 'S', 'GGU', 'G', 'UGC', 'C', 'CGC', 'R', '', '', '', '', '', 'AGC', 'S', '', '', '', '', '', 'GGC', 'G', 'UGA', 'Stop', '', '', 'CGA', 'R', '', '', '', '', '', 'AGA', 'R', '', '', '', '', '', 'GGA', 'G', 'UGG', 'W', '', '', '', '', '', 'CGG', 'R', '', '', '', '', '', 'AGG', 'R', '', '', '', '', '', 'GGG', 'G', '']
b = []
for x in a:
if len(x)>0:
b.append(x)

As dylrei mentioned, list comprehension is a pythonic solution, but is has a drawback with (very) huge list since it copies the list. If you have a huge list and the item you want to remove is only a few times in the list, then I suggest the following (less pythonic) solution:
>>> l=[1,2,3,4,5,6,1,2,3,1,2,1,1]
>>> while True:
... try:
... l.remove(1)
... except ValueError:
... break
...
>>> l
[2, 3, 4, 5, 6, 2, 3, 2]

Related

Joining individual elements of an array

I have an array consisting of labels but each label has been broken down by individual characters. For example, this is the first 2 elements of the array:
array([['1', '.', ' ', 'I', 'd', 'e', 'n', 't', 'i', 'f', 'y', 'i', 'n',
'g', ',', ' ', 'A', 's', 's', 'e', 's', 's', 'i', 'n', 'g', ' ',
'a', 'n', 'd', ' ', 'I', 'm', 'p', 'r', 'o', 'v', 'i', 'n', 'g',
' ', 'C', 'a', 'r', 'e', '', ''],
['9', '.', ' ', 'N', 'o', 'n', '-', 'P', 'h', 'a', 'r', 'm', 'a',
'c', 'o', 'l', 'o', 'g', 'i', 'c', 'a', 'l', ' ', 'I', 'n', 't',
'e', 'r', 'v', 'e', 'n', 't', 'i', 'o', 'n', 's', '', '', '',
'', ''], ...
I would like it to be formatted as such:
array(['1. Identifying, Assessing and Improving Care',
'9. Non-Pharmacological Interventions', ...
I want to be able to iterate through a concatenate the label output so it is as shown above.
Any help in achieving this would be much appreciated :) Many thanks!
import numpy as np
k=np.array([['1', '.', ' ', 'I', 'd', 'e', 'n', 't', 'i', 'f', 'y', 'i', 'n',
'g', ',', ' ', 'A', 's', 's', 'e', 's', 's', 'i', 'n', 'g', ' ',
'a', 'n', 'd', ' ', 'I', 'm', 'p', 'r', 'o', 'v', 'i', 'n', 'g',
' ', 'C', 'a', 'r', 'e', '', ''],
['9', '.', ' ', 'N', 'o', 'n', '-', 'P', 'h', 'a', 'r', 'm', 'a',
'c', 'o', 'l', 'o', 'g', 'i', 'c', 'a', 'l', ' ', 'I', 'n', 't',
'e', 'r', 'v', 'e', 'n', 't', 'i', 'o', 'n', 's', '', '', '',
'', '']])
for x in k:
print(''.join(x))
#output
1. Identifying, Assessing and Improving Care
9. Non-Pharmacological Interventions
Using List comprehension:
[''.join(x) for x in k]
#output
['1. Identifying, Assessing and Improving Care',
'9. Non-Pharmacological Interventions']
Considering the array as a list of lists, you could join all characters by looping through the list:
r = [['1', '.', ' ', 'I', 'd', 'e', 'n', 't', 'i', 'f', 'y', 'i', 'n',
'g', ',', ' ', 'A', 's', 's', 'e', 's', 's', 'i', 'n', 'g', ' ',
'a', 'n', 'd', ' ', 'I', 'm', 'p', 'r', 'o', 'v', 'i', 'n', 'g',
' ', 'C', 'a', 'r', 'e', '', ''],
['9', '.', ' ', 'N', 'o', 'n', '-', 'P', 'h', 'a', 'r', 'm', 'a',
'c', 'o', 'l', 'o', 'g', 'i', 'c', 'a', 'l', ' ', 'I', 'n', 't',
'e', 'r', 'v', 'e', 'n', 't', 'i', 'o', 'n', 's', '', '', '',
'', '']]
t = ["".join(i) for i in r]
print(t)
Output:
['1. Identifying, Assessing and Improving Care',
'9. Non-Pharmacological Interventions']
array = [['1', '.', ' ', 'I', 'd', 'e', 'n', 't', 'i', 'f', 'y', 'i', 'n',
'g', ',', ' ', 'A', 's', 's', 'e', 's', 's', 'i', 'n', 'g', ' ',
'a', 'n', 'd', ' ', 'I', 'm', 'p', 'r', 'o', 'v', 'i', 'n', 'g',
' ', 'C', 'a', 'r', 'e', '', ''],
['9', '.', ' ', 'N', 'o', 'n', '-', 'P', 'h', 'a', 'r', 'm', 'a',
'c', 'o', 'l', 'o', 'g', 'i', 'c', 'a', 'l', ' ', 'I', 'n', 't',
'e', 'r', 'v', 'e', 'n', 't', 'i', 'o', 'n', 's', '', '', '',
'', '']]
# array(['1. Identifying, Assessing and Improving Care',
# '9. Non-Pharmacological Interventions', ...
array = [''.join(i) for i in array]
print(array) #['1. Identifying, Assessing and Improving Care', '9. Non-Pharmacological Interventions']
Assuming from array([...]) that you are using numpy, here's a solution
import numpy as np
a = np.array([['1', '.', ' ', 'I', 'd', 'e', 'n', 't', 'i', 'f', 'y', 'i', 'n',
'g', ',', ' ', 'A', 's', 's', 'e', 's', 's', 'i', 'n', 'g', ' ',
'a', 'n', 'd', ' ', 'I', 'm', 'p', 'r', 'o', 'v', 'i', 'n', 'g',
' ', 'C', 'a', 'r', 'e', '', ''],
['9', '.', ' ', 'N', 'o', 'n', '-', 'P', 'h', 'a', 'r', 'm', 'a',
'c', 'o', 'l', 'o', 'g', 'i', 'c', 'a', 'l', ' ', 'I', 'n', 't',
'e', 'r', 'v', 'e', 'n', 't', 'i', 'o', 'n', 's', '', '', '',
'', '']])
b = np.empty(a.shape[0], dtype=object)
for i, x in enumerate(a): b[i] = ''.join(x)
If you make a loop over each element of your array, you can then use list .join to get what you are looking for.
Something like:
arr = [['1', '.', ' ', 'I', ...], ...]
output = list()
for x in arr:
output.append(''.join(x))
output
>>>
['1. Identifying, Assessing and Improving Care', ...]

Remove adjacent items group in list based on length of elements

I have a list of items from PDF text extraction in this way:
['performed three times. Data represent the mean±SEM of threeindependent experiments. *P<0.05, **P<0.005, ***P<0.001.', 'B','O-GlcNAc', 'AMPK', 'pAMPK', 'SREBP-1', 'ACC', 'FAS', 'actin', 'C','T', 'L', 'D', 'O', 'N', 'T', 'M', 'G', 'C', 'T', 'L', 'D', 'O', 'N','T', 'M', 'G', 'HaCaT HeLa', 'O', '-G', 'lN', 'A', 'c', 'le', 'v','e', 'l', '(A', '.U', ')', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L','0.0', '2.5', '1.5', '1.0', '0.5', 'H', 'a', 'C', 'a', 'T', 'D', 'O','N', 'H', 'a', 'C', 'a', 'T', 'T', 'M', 'G', '2.0', '**', '***', 'S','R', 'E', 'B', 'P', '-1', 'le', 'v', 'e', 'l', 'H', 'a', 'C', 'a','T', 'C', 'T', 'L', '0.0', '1.5', '1.0', '0.5', 'H', 'a', 'C', 'a','T', 'D', 'O', 'N', 'H', 'a', 'C', 'a', 'T', 'T', 'M', 'G', '2.0','F', 'A', 'S', '(A', '.U', ')', 'H', 'a', 'C', 'a', 'T', 'C', 'T','L', '0.0', '1.5', '1.0', '0.5', 'H', 'a', 'C', 'a', 'T', 'D', 'O','N', 'H', 'a', 'C', 'a', 'T', 'T', 'M', 'G', '2.0', 'A', 'C', 'C','(A', '.U', ')', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', '0.0', '2.5','1.5', '1.0', '0.5', 'H', 'a', 'C', 'a', 'T', 'D', 'O', 'N', 'H', 'a','C', 'a', 'T', 'T', 'M', 'G', '2.0', 'O', '-G', 'lc', 'N', 'A', 'c','le', 'v', 'e', 'l', '(A', '.U', ')', 'H', 'e', 'L', 'a', 'C', 'T','L', '0.0', '1.5', '1.0', '0.5', 'H', 'e', 'L', 'a', 'D', 'O', 'N','H', 'e', 'L', 'a', 'T', 'M', 'G', '2.0', '***', '***', 'S', 'R', 'E','B', 'P', '-1', 'le', 'v', 'e', 'l', 'H', 'e', 'L', 'a', 'C', 'T','L', '0.0', '1.5', '1.0', '0.5', 'H', 'e', 'L', 'a', 'D', 'O', 'N','H', 'e', 'L', 'a', 'T', 'M', 'G', '2.0', '***', 'F', 'A', 'S', '(A','.U', ')', 'H', 'e', 'L', 'a', 'C', 'T', 'L', '0.0', '1.5', '1.0','0.5', 'H', 'e', 'L', 'a', 'D', 'O', 'N', 'H', 'e', 'L', 'a', 'T','M', 'G', '2.0', '***', 'A', 'C', 'C', '(A', '.U', ')', 'H', 'e', 'L','a', 'C', 'T', 'L', '0.0', '1.5', '1.0', '0.5', 'H', 'e', 'L', 'a','D', 'O', 'N', 'H', 'e', 'L', 'a', 'T', 'M', 'G', '2.0', '***', '***','***', '***', '***', '***', '***', '*** ***', '***', 'O-GlcNAc','AMPK', 'pAMPK', 'SREBP-1', 'ACC', 'FAS', '�-actin', 'O', '-G', 'lN','A', 'c', 'le', 'v', 'e', 'l', '(A', '.U', ')', 'C', 'T', 'L', 'H','a', 'C', 'a', 'T', '0.0', '1.5', '1.0', '0.5', 'Q', 'u', 'e', 'r','H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', 'H', 'e', 'L', 'a', '2.0','Q', 'u', 'e', 'r', 'H', 'e', 'L', 'a', 'p', 'A', 'M', 'P', 'K', '(A','.U', ')', 'C', 'T', 'L', 'H', 'a', 'C', 'a', 'T', '0.0', '1.5','1.0', '0.5', 'Q', 'u', 'e', 'r', 'H', 'a', 'C', 'a', 'T', 'C', 'T','L', 'H', 'e', 'L', 'a', '2.5 ***', 'Q', 'u', 'e', 'r', 'H', 'e', 'L','a', 'S', 'R', 'E', 'B', 'P', '-1', 'le', 'v', 'e', 'l', 'C', 'T','L', 'H', 'a', 'C', 'a', 'T', '0.0', '1.5', '1.0', '0.5', 'Q', 'u','e', 'r', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', 'H', 'e', 'L', 'a','**', 'Q', 'u', 'e', 'r', 'H', 'e', 'L', 'a', 'A', 'C', 'C', '(A','.U', ')', 'C', 'T', 'L', 'H', 'a', 'C', 'a', 'T', '0.0', '1.5','1.0', '0.5', 'Q', 'u', 'e', 'r', 'H', 'a', 'C', 'a', 'T', 'C', 'T','L', 'H', 'e', 'L', 'a', '2.0', '***', 'Q', 'u', 'e', 'r', 'H', 'e','L', 'a', 'F', 'A', 'S', '(A', '.U', ')', 'C', 'T', 'L', 'H', 'a','C', 'a', 'T', '0.0', '1.5', '1.0', '0.5', 'Q', 'u', 'e', 'r', 'H','a', 'C', 'a', 'T', 'C', 'T', 'L', 'H', 'e', 'L', 'a', '***', 'Q','u', 'e', 'r', 'H', 'e', 'L', 'a', '2.0', '***', '*** ***', '*','******* *******', 'HaCaT HeLa', 'CTL Quer CTL Quer', 'A', 'Fig. 4.Quercetin regulates SREBP­1 and its target proteins']
In this list, I would like to remove all groups of adjacent elements (length of group > N) for which no element has length > M.
A pseudo code would be:
for item in list:
if len(item) <= M:
buffer.append(item_index)
active = True
if len(item) > M and active == True:
active = False
if len(buffer) > N:
list.replace_at_index(buffer_by_index,'')
buffer.clear()
Thanks for helping
Here is how you can use the built-in enumerate method to iterate through the elements in a list alongside each element's index:
lst = ['performed three times. Data represent the mean±SEM of threeindependent experiments. *P<0.05, **P<0.005, ***P<0.001.', 'B','O-GlcNAc', 'AMPK', 'pAMPK', 'SREBP-1', 'ACC', 'FAS', 'actin', 'C','T', 'L', 'D', 'O', 'N', 'T', 'M', 'G', 'C', 'T', 'L', 'D', 'O', 'N','T', 'M', 'G', 'HaCaT HeLa', 'O', '-G', 'lN', 'A', 'c', 'le', 'v','e', 'l', '(A', '.U', ')', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L','0.0', '2.5', '1.5', '1.0', '0.5', 'H', 'a', 'C', 'a', 'T', 'D', 'O','N', 'H', 'a', 'C', 'a', 'T', 'T', 'M', 'G', '2.0', '**', '***', 'S','R', 'E', 'B', 'P', '-1', 'le', 'v', 'e', 'l', 'H', 'a', 'C', 'a','T', 'C', 'T', 'L', '0.0', '1.5', '1.0', '0.5', 'H', 'a', 'C', 'a','T', 'D', 'O', 'N', 'H', 'a', 'C', 'a', 'T', 'T', 'M', 'G', '2.0','F', 'A', 'S', '(A', '.U', ')', 'H', 'a', 'C', 'a', 'T', 'C', 'T','L', '0.0', '1.5', '1.0', '0.5', 'H', 'a', 'C', 'a', 'T', 'D', 'O','N', 'H', 'a', 'C', 'a', 'T', 'T', 'M', 'G', '2.0', 'A', 'C', 'C','(A', '.U', ')', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', '0.0', '2.5','1.5', '1.0', '0.5', 'H', 'a', 'C', 'a', 'T', 'D', 'O', 'N', 'H', 'a','C', 'a', 'T', 'T', 'M', 'G', '2.0', 'O', '-G', 'lc', 'N', 'A', 'c','le', 'v', 'e', 'l', '(A', '.U', ')', 'H', 'e', 'L', 'a', 'C', 'T','L', '0.0', '1.5', '1.0', '0.5', 'H', 'e', 'L', 'a', 'D', 'O', 'N','H', 'e', 'L', 'a', 'T', 'M', 'G', '2.0', '***', '***', 'S', 'R', 'E','B', 'P', '-1', 'le', 'v', 'e', 'l', 'H', 'e', 'L', 'a', 'C', 'T','L', '0.0', '1.5', '1.0', '0.5', 'H', 'e', 'L', 'a', 'D', 'O', 'N','H', 'e', 'L', 'a', 'T', 'M', 'G', '2.0', '***', 'F', 'A', 'S', '(A','.U', ')', 'H', 'e', 'L', 'a', 'C', 'T', 'L', '0.0', '1.5', '1.0','0.5', 'H', 'e', 'L', 'a', 'D', 'O', 'N', 'H', 'e', 'L', 'a', 'T','M', 'G', '2.0', '***', 'A', 'C', 'C', '(A', '.U', ')', 'H', 'e', 'L','a', 'C', 'T', 'L', '0.0', '1.5', '1.0', '0.5', 'H', 'e', 'L', 'a','D', 'O', 'N', 'H', 'e', 'L', 'a', 'T', 'M', 'G', '2.0', '***', '***','***', '***', '***', '***', '***', '*** ***', '***', 'O-GlcNAc','AMPK', 'pAMPK', 'SREBP-1', 'ACC', 'FAS', '�-actin', 'O', '-G', 'lN','A', 'c', 'le', 'v', 'e', 'l', '(A', '.U', ')', 'C', 'T', 'L', 'H','a', 'C', 'a', 'T', '0.0', '1.5', '1.0', '0.5', 'Q', 'u', 'e', 'r','H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', 'H', 'e', 'L', 'a', '2.0','Q', 'u', 'e', 'r', 'H', 'e', 'L', 'a', 'p', 'A', 'M', 'P', 'K', '(A','.U', ')', 'C', 'T', 'L', 'H', 'a', 'C', 'a', 'T', '0.0', '1.5','1.0', '0.5', 'Q', 'u', 'e', 'r', 'H', 'a', 'C', 'a', 'T', 'C', 'T','L', 'H', 'e', 'L', 'a', '2.5 ***', 'Q', 'u', 'e', 'r', 'H', 'e', 'L','a', 'S', 'R', 'E', 'B', 'P', '-1', 'le', 'v', 'e', 'l', 'C', 'T','L', 'H', 'a', 'C', 'a', 'T', '0.0', '1.5', '1.0', '0.5', 'Q', 'u','e', 'r', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', 'H', 'e', 'L', 'a','**', 'Q', 'u', 'e', 'r', 'H', 'e', 'L', 'a', 'A', 'C', 'C', '(A','.U', ')', 'C', 'T', 'L', 'H', 'a', 'C', 'a', 'T', '0.0', '1.5','1.0', '0.5', 'Q', 'u', 'e', 'r', 'H', 'a', 'C', 'a', 'T', 'C', 'T','L', 'H', 'e', 'L', 'a', '2.0', '***', 'Q', 'u', 'e', 'r', 'H', 'e','L', 'a', 'F', 'A', 'S', '(A', '.U', ')', 'C', 'T', 'L', 'H', 'a','C', 'a', 'T', '0.0', '1.5', '1.0', '0.5', 'Q', 'u', 'e', 'r', 'H','a', 'C', 'a', 'T', 'C', 'T', 'L', 'H', 'e', 'L', 'a', '***', 'Q','u', 'e', 'r', 'H', 'e', 'L', 'a', '2.0', '***', '*** ***', '*','******* *******', 'HaCaT HeLa', 'CTL Quer CTL Quer', 'A', 'Fig. 4.Quercetin regulates SREBP­1 and its target proteins']
N = 3
M = 5
buffer = []
for i, v in enumerate(lst):
if len(v) <= M:
buffer.append(i)
else:
if len(buffer) > N:
for i in buffer:
lst[i] = None
buffer.clear()
print(list(filter(None, lst)))
Output:
['performed three times. Data represent the mean±SEM of threeindependent experiments. *P<0.05, **P<0.005, ***P<0.001.', 'B', 'O-GlcNAc', 'AMPK', 'pAMPK', 'SREBP-1', 'HaCaT HeLa', '*** ***', '***', 'O-GlcNAc', 'AMPK', 'pAMPK', 'SREBP-1', 'ACC', 'FAS', '�-actin', '2.5 ***', '*** ***', '*', '******* *******', 'HaCaT HeLa', 'CTL Quer CTL Quer', 'A', 'Fig. 4.Quercetin regulates SREBP\xad1 and its target proteins']
It is unclear exactly what is desired. Below is some code of what I believe is being asked for. Hope this helps. If this is off... let me know.
Psuedo algorithm
Given a list of strings.
Identify the len of each string using class cntseq.
Identify sequence of strings having the same len
For each sequence, if the length of sequence
x = ['performed three times. Data represent the mean±SEM of three independent experiments. P<0.05, P<0.005, P<0.001.', 'B', 'O-GlcNAc', 'AMPK', 'pAMPK', 'SREBP-1', 'ACC', 'FAS', 'actin', 'C', 'T', 'L', 'D', 'O', 'N', 'T', 'M', 'G', 'C', 'T', 'L', 'D', 'O', 'N', 'T', 'M', 'G', 'HaCaT HeLa', 'O', '-G', 'lN', 'A', 'c', 'le', 'v', 'e', 'l', '(A', '.U', ')', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', '0.0', '2.5', '1.5', '1.0', '0.5', 'H', 'a', 'C', 'a', 'T', 'D', 'O', 'N', 'H', 'a', 'C', 'a', 'T', 'T', 'M', 'G', '2.0', '', '', 'S', 'R', 'E', 'B', 'P', '-1', 'le', 'v', 'e', 'l', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', '0.0', '1.5', '1.0', '0.5', 'H', 'a', 'C', 'a', 'T', 'D', 'O', 'N', 'H', 'a', 'C', 'a', 'T', 'T', 'M', 'G', '2.0', 'F', 'A', 'S', '(A', '.U', ')', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', '0.0', '1.5', '1.0', '0.5', 'H', 'a', 'C', 'a', 'T', 'D', 'O', 'N', 'H', 'a', 'C', 'a', 'T', 'T', 'M', 'G', '2.0', 'A', 'C', 'C', '(A', '.U', ')', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', '0.0', '2.5', '1.5', '1.0', '0.5', 'H', 'a', 'C', 'a', 'T', 'D', 'O', 'N', 'H', 'a', 'C', 'a', 'T', 'T', 'M', 'G', '2.0', 'O', '-G', 'lc', 'N', 'A', 'c', 'le', 'v', 'e', 'l', '(A', '.U', ')', 'H', 'e', 'L', 'a', 'C', 'T', 'L', '0.0', '1.5', '1.0', '0.5', 'H', 'e', 'L', 'a', 'D', 'O', 'N', 'H', 'e', 'L', 'a', 'T', 'M', 'G', '2.0', '', '', 'S', 'R', 'E', 'B', 'P', '-1', 'le', 'v', 'e', 'l', 'H', 'e', 'L', 'a', 'C', 'T', 'L', '0.0', '1.5', '1.0', '0.5', 'H', 'e', 'L', 'a', 'D', 'O', 'N', 'H', 'e', 'L', 'a', 'T', 'M', 'G', '2.0', '', 'F', 'A', 'S', '(A', '.U', ')', 'H', 'e', 'L', 'a', 'C', 'T', 'L', '0.0', '1.5', '1.0', '0.5', 'H', 'e', 'L', 'a', 'D', 'O', 'N', 'H', 'e', 'L', 'a', 'T', 'M', 'G', '2.0', '', 'A', 'C', 'C', '(A', '.U', ')', 'H', 'e', 'L', 'a', 'C', 'T', 'L', '0.0', '1.5', '1.0', '0.5', 'H', 'e', 'L', 'a', 'D', 'O', 'N', 'H', 'e', 'L', 'a', 'T', 'M', 'G', '2.0', '', '', '', '', '', '', '', '* ', '', 'O-GlcNAc', 'AMPK', 'pAMPK', 'SREBP-1', 'ACC', 'FAS', '�-actin', 'O', '-G', 'lN', 'A', 'c', 'le', 'v', 'e', 'l', '(A', '.U', ')', 'C', 'T', 'L', 'H', 'a', 'C', 'a', 'T', '0.0', '1.5', '1.0', '0.5', 'Q', 'u', 'e', 'r', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', 'H', 'e', 'L', 'a', '2.0', 'Q', 'u', 'e', 'r', 'H', 'e', 'L', 'a', 'p', 'A', 'M', 'P', 'K', '(A', '.U', ')', 'C', 'T', 'L', 'H', 'a', 'C', 'a', 'T', '0.0', '1.5', '1.0', '0.5', 'Q', 'u', 'e', 'r', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', 'H', 'e', 'L', 'a', '2.5 ', 'Q', 'u', 'e', 'r', 'H', 'e', 'L', 'a', 'S', 'R', 'E', 'B', 'P', '-1', 'le', 'v', 'e', 'l', 'C', 'T', 'L', 'H', 'a', 'C', 'a', 'T', '0.0', '1.5', '1.0', '0.5', 'Q', 'u', 'e', 'r', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', 'H', 'e', 'L', 'a', '', 'Q', 'u', 'e', 'r', 'H', 'e', 'L', 'a', 'A', 'C', 'C', '(A', '.U', ')', 'C', 'T', 'L', 'H', 'a', 'C', 'a', 'T', '0.0', '1.5', '1.0', '0.5', 'Q', 'u', 'e', 'r', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', 'H', 'e', 'L', 'a', '2.0', '', 'Q', 'u', 'e', 'r', 'H', 'e', 'L', 'a', 'F', 'A', 'S', '(A', '.U', ')', 'C', 'T', 'L', 'H', 'a', 'C', 'a', 'T', '0.0', '1.5', '1.0', '0.5', 'Q', 'u', 'e', 'r', 'H', 'a', 'C', 'a', 'T', 'C', 'T', 'L', 'H', 'e', 'L', 'a', '', 'Q', 'u', 'e', 'r', 'H', 'e', 'L', 'a', '2.0', '', '* ', '', '***** *******', 'HaCaT HeLa', 'CTL Quer CTL Quer', 'A', 'Fig. 4. Quercetin regulates SREBP\xad1 and its target proteins']
df = pd.DataFrame(x, columns=['v'])
df['len'] = df.v.apply(len)
N = 2
M = 2
class cntseq(object):
'''
Define class to track sequences across the Column
'''
def __init__(self, **kwargs):
self.prevLen = -1
self.cnt = 0
self.start = None
def countN(self, r):
if r.len == self.prevLen:
# if adjcent sequence found, mark it's start with "len.start"
self.cnt += 1
if self.start == None :
self.start = int(r.name)-1
return '%d.%d'%(r.len, self.start)
else:
# non-adjcent sequence found, mark None.None"
self.prevLen = r.len
self.cnt = 0
self.start = None
return 'None.None'
# Identify sequences of adjcent lengths.
cs = cntseq()
df['seq'] = df.apply(lambda r : cs.countN(r),axis=1)
print("\nOriginal DF info")
print(df.describe())
print("\nOriginal DF ")
print(df.head())
# Compute lookup of duplicate information
df2 = pd.DataFrame(df.groupby('seq').seq.count())
df2.columns=['M']
df2 = df2.reset_index()
n = df2[df2.seq == 'None.None'].index[0]
df2 = df2.drop(78, axis=0)
print("\nLookup DF, seq has count and index for start of 'adjcent elements'")
print(df2.head())
# Compute final DF without duplicates
df3 = df[df.seq.isin(list(df2[df2.M > M].seq))].head()
print("\nFinal DF without duplicated")
print(df3)
print("\nOriginal DF info")
print(df3.describe())
output:
Original DF info
len
count 578.000000
mean 1.730104
std 5.284275
min 0.000000
25% 1.000000
50% 1.000000
75% 1.000000
max 110.000000
Original DF
v len seq
0 performed three times. Data represent the mean... 110 None.None
1 B 1 None.None
2 O-GlcNAc 8 None.None
3 AMPK 4 None.None
4 pAMPK 5 None.None
Lookup DF, seq has count and index for start of 'adjcent elements'
seq M
0 0.221 1
1 0.325 6
2 0.70 1
3 1.111 2
4 1.116 8
Final DF without duplicated
v len seq
10 T 1 1.9
11 L 1 1.9
12 D 1 1.9
13 O 1 1.9
14 N 1 1.9
Original DF info
len
count 5.0
mean 1.0
std 0.0
min 1.0
25% 1.0
50% 1.0
75% 1.0
max 1.0

I am writing a program for converting DNA to protein, I am not get idea to creat help me to get through it,

def translat_dna(sequence):
gencode= {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}
proteinseq=''
for n in range(0,len(gencode),3):
if sequence[n:n+3] in gencode:
proteinseq += gencode[sequence[n:n+3]:
sequence=''
return proteinseq in translat_dna()
Change
for n in range(0,len(gencode),3):
Onto:
for n in range(0,len(sequence),3):
def translat_dna(sequence):
gencode = {
'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}
proteinseq = ''
for n in range(0, len(sequence), 3):
proteinseq += gencode.get(sequence[n:n + 3], '')
return proteinseq
You could also shorten it slightly using str.join:
def translat_dna(sequence):
gencode = {
'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}
return ''.join(gencode.get(sequence[n:n + 3], '')
for n in range(0, len(sequence), 3))

Python encoding using ASCII

I am designing a program that will encode. messages imported from a csv. It will do this by converting them their ASCII value, adding 2 to them and then converting them back to characters.
My current problem is that while my code will encode each character in each string the messages are now no longer joined together.
Any help would be appreciated.
My code:
#importing csv file and allowing it to be read from
import csv
ifile = open("messages.csv","rb")
reader= csv.reader(ifile)
#creating lists
plain_text=[]
plain_ascii=[]
encrypted_ascii=[]
encrypted_text=[]
latitude=[]
longitude=[]
#appending csv data to separate lists
for row in reader:
latitude.append(row[0])
longitude.append(row[1])
plain_text.append(row[2])
#encoding messages
encrypted_text=[[chr(ord(ch)+2) for ch in string] for string in plain_text]
print plain_text
print encrypted_text
ifile.close()
The current output:
['A famous Scottish victory in the First War of Scottish Independence - bit.ly/1yIAb8Q', "How high is Scotland's tallest mountain? - bit.ly/1q3Rj6D", 'What is the traditional instrument most often linked with Scotland? - http://#bit.ly/1lNdrk3', "A prickly problem Scotland's national symbol - bit.ly/1q3REpQ", 'Name the largest city in Scotland - bit.ly/T4OEuU']
[['C', '"', 'h', 'c', 'o', 'q', 'w', 'u', '"', 'U', 'e', 'q', 'v', 'v', 'k', 'u', 'j', '"', 'x', 'k', 'e', 'v', 'q', 't', '{', '"', 'k', 'p', '"', 'v', 'j', 'g', '"', 'H', 'k', 't', 'u', 'v', '"', 'Y', 'c', 't', '"', 'q', 'h', '"', 'U', 'e', 'q', 'v', 'v', 'k', 'u', 'j', '"', 'K', 'p', 'f', 'g', 'r', 'g', 'p', 'f', 'g', 'p', 'e', 'g', '"', '/', '"', 'd', 'k', 'v', '0', 'n', '{', '1', '3', '{', 'K', 'C', 'd', ':', 'S'], ['J', 'q', 'y', '"', 'j', 'k', 'i', 'j', '"', 'k', 'u', '"', 'U', 'e', 'q', 'v', 'n', 'c', 'p', 'f', ')', 'u', '"', 'v', 'c', 'n', 'n', 'g', 'u', 'v', '"', 'o', 'q', 'w', 'p', 'v', 'c', 'k', 'p', 'A', '"', '/', '"', 'd', 'k', 'v', '0', 'n', '{', '1', '3', 's', '5', 'T', 'l', '8', 'F'], ['Y', 'j', 'c', 'v', '"', 'k', 'u', '"', 'v', 'j', 'g', '"', 'v', 't', 'c', 'f', 'k', 'v', 'k', 'q', 'p', 'c', 'n', '"', 'k', 'p', 'u', 'v', 't', 'w', 'o', 'g', 'p', 'v', '"', 'o', 'q', 'u', 'v', '"', 'q', 'h', 'v', 'g', 'p', '"', 'n', 'k', 'p', 'm', 'g', 'f', '"', 'y', 'k', 'v', 'j', '"', 'U', 'e', 'q', 'v', 'n', 'c', 'p', 'f', 'A', '"', '/', '"', 'j', 'v', 'v', 'r', '<', '1', '1', 'd', 'k', 'v', '0', 'n', '{', '1', '3', 'n', 'P', 'f', 't', 'm', '5'], ['C', '"', 'r', 't', 'k', 'e', 'm', 'n', '{', '"', 'r', 't', 'q', 'd', 'n', 'g', 'o', '"', 'U', 'e', 'q', 'v', 'n', 'c', 'p', 'f', ')', 'u', '"', 'p', 'c', 'v', 'k', 'q', 'p', 'c', 'n', '"', 'u', '{', 'o', 'd', 'q', 'n', '"', '/', '"', 'd', 'k', 'v', '0', 'n', '{', '1', '3', 's', '5', 'T', 'G', 'r', 'S'], ['P', 'c', 'o', 'g', '"', 'v', 'j', 'g', '"', 'n', 'c', 't', 'i', 'g', 'u', 'v', '"', 'e', 'k', 'v', '{', '"', 'k', 'p', '"', 'U', 'e', 'q', 'v', 'n', 'c', 'p', 'f', '"', '/', '"', 'd', 'k', 'v', '0', 'n', '{', '1', 'V', '6', 'Q', 'G', 'w', 'W']]
You need to join the inner lists.
[''.join(chr(ord(ch)+2) for ch in string) for string in plain]

DNA To Protein Sequence

I am trying to create a program which translates a DNA sequence entered by the user to 3 alternative protein sequence using the following dictionary (codons are keys, amino acids are values):
{'TGA': '*', 'GCG': 'A', 'CGA': 'R', 'ATA': 'I', 'AGA': 'R', 'TAA': '*', 'TTT': 'F', 'GAG': 'E', 'CTT': 'L', 'CGT': 'R', 'CTC': 'L', 'CTG': 'L', 'TGT': 'C', 'CCA': 'P', 'AAT': 'N', 'GTC': 'V', 'GAC': 'D', 'GAT': 'D', 'TAT': 'Y', 'AAA': 'K', 'GTA': 'V', 'TAG': '*', 'CGC': 'R', 'GCA': 'A', 'TCG': 'S', 'GCT': 'A', 'GCC': 'A', 'TGG': 'W', 'TTC': 'F', 'CCC': 'P', 'TTG': 'L', 'CGG': 'R', 'GGC': 'G', 'AGG': 'R', 'TCC': 'S', 'CCT': 'P', 'GGT': 'G', 'GGG': 'G', 'TCA': 'S', 'AGC': 'S', 'CAG': 'Q', 'CAC': 'H', 'ATC': 'I', 'GAA': 'E', 'GTG': 'V', 'CCG': 'P', 'CAT': 'H', 'AAG': 'K', 'ATG': 'M', 'AAC': 'N', 'TAC': 'Y', 'TGC': 'C', 'CTA': 'L', 'TCT': 'S', 'ATT': 'I', 'ACG': 'T', 'AGT': 'S', 'GTT': 'V', 'TTA': 'L', 'CAA': 'Q', 'GGA': 'G', 'ACC': 'T', 'ACA': 'T', 'ACT': 'T'}
I want to just use mapping and not biopython.
So, when the program is run it should look like this:
Please enter a DNA sequence: GCTgttaagactatgaaaagaataagcaacaccatcaat
Frame 1 is AVKTMKRISNTIN
Frame 2 is LLRL*KE*ATPS
Frame 3 is C*DYEKNKQHHQ
I have created this dictionary from a file, but I'm unsure how to get started from here. Any help would be greatly appreciated. Thanks!
Declare the dictionary:
prot_match = {'TGA': '*', 'GCG': 'A', 'CGA': 'R', 'ATA': 'I', 'AGA': 'R', 'TAA': '*', 'TTT': 'F', 'GAG': 'E', 'CTT': 'L', 'CGT': 'R', 'CTC': 'L', 'CTG': 'L', 'TGT': 'C', 'CCA': 'P', 'AAT': 'N', 'GTC': 'V', 'GAC': 'D', 'GAT': 'D', 'TAT': 'Y', 'AAA': 'K', 'GTA': 'V', 'TAG': '*', 'CGC': 'R', 'GCA': 'A', 'TCG': 'S', 'GCT': 'A', 'GCC': 'A', 'TGG': 'W', 'TTC': 'F', 'CCC': 'P', 'TTG': 'L', 'CGG': 'R', 'GGC': 'G', 'AGG': 'R', 'TCC': 'S', 'CCT': 'P', 'GGT': 'G', 'GGG': 'G', 'TCA': 'S', 'AGC': 'S', 'CAG': 'Q', 'CAC': 'H', 'ATC': 'I', 'GAA': 'E', 'GTG': 'V', 'CCG': 'P', 'CAT': 'H', 'AAG': 'K', 'ATG': 'M', 'AAC': 'N', 'TAC': 'Y', 'TGC': 'C', 'CTA': 'L', 'TCT': 'S', 'ATT': 'I', 'ACG': 'T', 'AGT': 'S', 'GTT': 'V', 'TTA': 'L', 'CAA': 'Q', 'GGA': 'G', 'ACC': 'T', 'ACA': 'T', 'ACT': 'T'}
Slice the string to chunks of 3 characters:
def chunks (arr, size = 1, frame = 1):
return [arr[i: i+size] for i in range(frame - 1, len(arr), size)]
dna = input('Enter DNA: ').upper()
Then, map for the prot_match value of the current sequence (use get for safety in case you dont have that sequence for sure):
frames = 3
for frame in range(frames):
chunked_dna = chunks(dna, 3, frame + 1)
prot_seq = map(lambda seq: prot_match.get(seq, '0'), chunked_dna )
Finally, use join to get the protein sequence as a string from the map:
prot_seq = ''.join(prot_seq)
print('Protein sequence number', frame + 1, ':', prot_seq)
Example (these snippets together in a file):
Enter DNA: GCTgttaagactatgaaaagaataagcaacaccatcaat
Protein sequence number 1 : AVKTMKRISNTIN
Protein sequence number 2 : LLRL*KE*ATPS0
Protein sequence number 3 : C*DYEKNKQHHQ0

Categories

Resources