How to group genes regarding their id and position , python

How to group genes regarding their id and position , python - python

I have a file containing genes of different genomes. Gene is denoted by NZ_CP019047.1_2993 and Genome by NZ_CP019047
They look like this :
NZ_CP019047.1_2993
NZ_CP019047.1_2994
NZ_CP019047.1_2995
NZ_CP019047.1_2999
NZ_CP019047.1_3000
NZ_CP019047.1_3001
NZ_CP019047.1_3003
KE699235.1_379
KE699235.1_1000
KE699235.1_1001
what I want to do is group the genes of a genome (if a genome has more than 1 gene) regarding their distance meaning, if I have genes nearer than 4 positions I want to group them together.The position can be understood as the number after '_'. I want something like these:
[NZ_CP019047.1_2993,NZ_CP019047.1_2994,NZ_CP019047.1_2995]
[NZ_CP019047.1_2999,NZ_CP019047.1_3000,NZ_CP019047.1_3001,NZ_CP019047.1_3003]
[KE699235.1_1000,KE699235.1_1001]
What I have tried so far is creating a dictionary holding for each genome, in my case NZ_CP019047 and KE699235, all the number after '_'. Then I calculate their differences, if it is less than 4 I try to group them. The problem is that I am having duplication and I am having problem in the case when 1 genome has more than 1 group of genes like this case :
[NZ_CP019047.1_2993,NZ_CP019047.1_2994,NZ_CP019047.1_2995]
[NZ_CP019047.1_2999,NZ_CP019047.1_3000,NZ_CP019047.1_3001,NZ_CP019047.1_3003]
This is my code:
for key in sortedDict1:
cassette = ''
differences = []
numbers = sortedDict1[key]
differences = [x - numbers[i - 1] for i, x in enumerate(numbers)][1:]
print(differences)
for i in range(0,len(differences)):
if differences[i] <= 3:
pos = i
el1 = key + str(numbers[i])
el2 = key + str(numbers[i+1])
cas = el1 + ' '
cassette += cas
cas = el2 + ' '
cassette += cas
else:
cassette + '/n'
i+=1
I am referring to groups with variable cassette.
Can someone please help?

Please see below. You can modify the labels and distances to your requirements.
def get_genome_groups(genome_info):
genome_info.sort(key = lambda x: (x.split('.')[0], int(x.split('_')[-1])))
#print(genome_info)
genome_groups = []
close_genome_group = []
last_genome = ''
position = 0
last_position = 0
#'NZ_CP019047.1_2995',
for genomes in genome_info:
genome, position = genomes.split('.')
position = int(position.split('_')[1])
if last_genome and (genome != last_genome):
genome_groups.append(close_genome_group)
close_genome_group = []
elif close_genome_group and position and (position > last_position+3):
genome_groups.append(close_genome_group)
close_genome_group = []
if genomes:
close_genome_group.append(genomes)
last_position = position
last_genome = genome
if close_genome_group:
genome_groups.append(close_genome_group)
return genome_groups
if __name__ == '__main__':
genome_group = get_genome_groups(genome_info)
print(genome_group)
user#Inspiron:~/code/general$ python group_genes.py
[['KE699235.1_379'], ['KE699235.1_1000', 'KE699235.1_1001'], ['NZ_CP019047.1_2993', 'NZ_CP019047.1_2994', 'NZ_CP019047.1_2995'], ['NZ_CP019047.1_2999', 'NZ_CP019047.1_3000', 'NZ_CP019047.1_3001', 'NZ_CP019047.1_3003']]
user#Inspiron:~/code/general$

Input:
NZ_CP019047.1_2993
NZ_CP019047.1_2994
NZ_CP019047.1_2995
NZ_CP019047.1_2999
NZ_CP019047.1_3000
NZ_CP019047.1_3001
NZ_CP019047.1_3003
KE699235.1_379
KE699235.1_1000
KE699235.1_1001
KE6992351.2_379
KE6992352.2_1000
KE6992353.2_1001
Code:
from operator import itemgetter, attrgetter
with open("genes.dat", "r") as msg:
data = msg.read().splitlines()
for i, gene in enumerate(data):
gene_name = gene.split(".")[0]
chr_pos = gene.split(".")[1]
data[i] = (gene_name,int(chr_pos.split("_")[0]),int(chr_pos.split("_")[1]))
data = sorted(data, key=itemgetter(1,2))
output = []
j = 0
for i in range(1,len(data)):
if i == 1:
output.append([data[i]])
elif data[i][1] == output[j][0][1]:
if data[i][2] - output[j][0][2] < 5:
output[j].append(data[i])
else:
output.append([data[i]])
j += 1
else:
output.append([data[i]])
j += 1
print (output)
Output:
[[('KE699235', 1, 1000), ('KE699235', 1, 1001)], [('NZ_CP019047', 1, 2993), ('NZ_CP019047', 1, 2994), ('NZ_CP019047', 1, 2995)], [('NZ_CP019047', 1, 2999), ('NZ_CP019047', 1, 3000), ('NZ_CP019047', 1, 3001), ('NZ_CP019047', 1, 3003)], [('KE6992351', 2, 379)], [('KE6992352', 2, 1000), ('KE6992353', 2, 1001)]]
This should make groups based on max 5 difference in position between the most backward element and the most forward in the same group.
It should work if you get a list of mixed genes considering chr location.

Related

How to optimize the search algorithm for large 2d-array?

After adding a line of code
pathResult.append(find_max_path(arr, a + 1, b + 1, path))
began to run slowly, but without this code it does not work correctly. How can i optimize the code? The function looks for the path with the maximum number of points in a two-dimensional array where values equal to 100 lie predominantly on the main diagonal. Rows can have the same value equal to 100, but in any column the value 100 is one or none. Full code:
arr = [
[000,000,000,000,000,100,000],
[000,000,000,000,000,000,000],
[000,000,100,000,000,000,000],
[000,100,000,000,000,000,000],
[100,000,000,000,000,100,000],
[000,000,000,000,100,000,000],
[000,000,000,000,000,000,000],
[000,000,000,000,000,000,000]]
def find_max_path(arr, a=0, b=0, path=None):
if path is None:
path = []
while (a < len(arr)) and (b < len(arr[a])):
if arr[a][b] == 100:
path.append({"a": a, "b": b})
b += 1
else:
try:
if arr[a + 1][b + 1] == 100:
a += 1
b += 1
continue
except IndexError:
pass
check = []
for j in range(b + 1, len(arr[a])):
if arr[a][j] == 100:
check.append({"a": a, "b": j})
break
if not check:
a += 1
continue
i = a + 1
while i < len(arr):
if arr[i][b] == 100:
check.append({"a": i, "b": b})
break
i += 1
pathResult = []
for c in check:
pathNew = path[:]
pathNew.append({"a": c["a"], "b": c["b"]})
pathResult.append(find_max_path(arr, c["a"] + 1, c["b"] + 1, pathNew))
pathResult.append(find_max_path(arr, a + 1, b + 1, path))
maximum = 0
maxpath = []
for p in pathResult:
if len(p) > maximum:
maximum = len(p)
maxpath = p[:]
if maxpath:
return maxpath
a += 1
return path
print(find_max_path(arr))
UPDATE1: add two break in inner loops (execution time is halved)
Output:
[{'a': 4, 'b': 0}, {'a': 5, 'b': 4}]
UPDATE2
Usage.
I use this algorithm to synchronize two streams of information. I have words from the text along the lines, about which it is known where they are in the text of the book L_word. By columns, I have recognized words from the audiobook, about which the recognized word itself is known and when it was spoken in the audio stream R_word.
It turns out two arrays of words. To synchronize these two lists, I use something like this
from rapidfuzz import process, fuzz
import numpy as np
window = 50
# L_word = ... # words from text book
# R_word = ... # recognize words from audiobook
L = 0
R = 0
L_chunk = L_word[L:L+window]
R_chunk = R_word[R:R+window]
scores = process.cdist(L_chunk,
R_chunk,
scorer=fuzz.ratio,
type=np.uint8,
score_cutoff=100)
p = find_max_path(scores)
# ... path processing ...
...
as a result of all the work, we get something like this video book with pagination and subtitles synchronized with audio download 3GB
UPDATE3: adding this code reduces the execution time by almost ten times!
try:
if arr[a + 1][b + 1] == 100:
a += 1
b += 1
continue
except IndexError:
pass

Python shows how to do debugging and profiling. Go around the algorithm and time functions to see where the bottleneck is

combinations with python

I am trying to generate combination of ID's
Input: cid = SPARK
oupout: list of all the comibnations as below, position of each element should be constant. I am a beginner in python any help here is much appreciated.
'S****'
'S***K'
'S**R*'
'S**RK'
'S*A**'
'S*A*K'
'S*AR*'
'S*ARK'
'SP***'
'SP**K'
'SP*R*'
'SP*RK'
'SPA**'
'SPA*K'
'SPAR*'
'SPARK'
I tried below, I need a dynamic code:
cid = 'SPARK'
# print(cid.replace(cid[1],'*'))
# cu_len = lenth of cid [SPARK] here which is 5
# com_stars = how many stars i.e '*' or '**'
def cubiod_combo_gen(cu_len, com_stars, j_ite, i_ite):
cubiodList = []
crange = cu_len
i = i_ite #2 #3
j = j_ite #1
# com_stars = ['*','**','***','****']
while( i <= crange):
# print(j,i)
if len(com_stars) == 1:
x = len(com_stars)
n_cid = cid.replace(cid[j:i],com_stars)
i += x
j += x
cubiodList.append(n_cid)
elif len(com_stars) == 2:
x = len(com_stars)
n_cid = cid.replace(cid[j:i],com_stars)
i += x
j += x
cubiodList.append(n_cid)
elif len(com_stars) == 3:
x = len(com_stars)
n_cid = cid.replace(cid[j:i],com_stars)
i += x
j += x
cubiodList.append(n_cid)
return cubiodList
#print(i)
#print(n_cid)
# for item in cubiodList:
# print(item)
print(cubiod_combo_gen(5,'*',1,2))
print(cubiod_combo_gen(5,'**',1,3))

For every character in your given string, you can represent it as a binary string, using a 1 for a character that stays the same and a 0 for a character to replace with an asterisk.
def cubiod_combo_gen(string, count_star):
str_list = [char0 for char0 in string] # a list with the characters of the string
itercount = 2 ** (len(str_list)) # 2 to the power of the length of the input string
results = []
for config in range(itercount):
# return a string of i in binary representation
binary_repr = bin(config)[2:]
while len(binary_repr) < len(str_list):
binary_repr = '0' + binary_repr # add padding
# construct a list with asterisks
i = -1
result_list = str_list.copy() # soft copy, this made me spend like 10 minutes debugging lol
for char in binary_repr:
i += 1
if char == '0':
result_list[i] = '*'
if char == '1':
result_list[i] = str_list[i]
# now we have a possible string value
if result_list.count('*') == count_star:
# convert back to string and add to list of accepted strings
result = ''
for i in result_list:
result = result + i
results.append(result)
return results
# this function returns the value, so you have to use `print(cubiod_combo_gen(args))`
# comment this stuff out if you don't want an interactive user prompt
string = input('Enter a string : ')
count_star = input('Enter number of stars : ')
print(cubiod_combo_gen(string, int(count_star)))
It iterates through 16 characters in about 4 seconds and 18 characters in about 17 seconds. Also you made a typo on "cuboid" but I left the original spelling
Enter a string : DPSCT
Enter number of stars : 2
['**SCT', '*P*CT', '*PS*T', '*PSC*', 'D**CT', 'D*S*T', 'D*SC*', 'DP**T', 'DP*C*', 'DPS**']
As a side effect of this binary counting, the list is ordered by the asterisks, where the earliest asterisk takes precedence, with next earliest asterisks breaking ties.
If you want a cumulative count like 1, 4, 5, and 6 asterisks from for example "ABCDEFG", you can use something like
star_counts = (1, 4, 5, 6)
string = 'ABCDEFG'
for i in star_counts:
print(cubiod_combo_gen(string, star_counts))
If you want the nice formatting you have in your answer, try adding this block at the end of your code:
def formatted_cuboid(string, count_star):
values = cubiod_combo_gen(string, count_star)
for i in values:
print(values[i])
I honestly do not know what your j_ite and i_ite are, but it seems like they have no use so this should work. If you still want to pass these arguments, change the first line to def cubiod_combo_gen(string, count_star, *args, **kwargs):

I am not sure what com_stars does, but to produce your sample output, the following code does.
def cuboid_combo(cid):
fill_len = len(cid)-1
items = []
for i in range(2 ** fill_len):
binary = f'{i:0{fill_len}b}'
#print(binary, 'binary', 'num', i)
s = cid[0]
for idx, bit in enumerate(binary,start=1):
if bit == '0':
s += '*'
else: # 'bit' == 1
s += cid[idx]
items.append(s)
return items
#cid = 'ABCDEFGHI'
cid = 'DPSCT'
result = cuboid_combo(cid)
for item in result:
print(item)
Prints:
D****
D***T
D**C*
D**CT
D*S**
D*S*T
D*SC*
D*SCT
DP***
DP**T
DP*C*
DP*CT
DPS**
DPS*T
DPSC*
DPSCT

Concatenate several seq within a file according to their percentage of similarities

Hel lo I need your help in a complicated task.
Here is a file1.txt :
>Name1.1_1-40_-__Sp1
AAAAAACC-------------
>Name1.1_67-90_-__Sp1
------CCCCCCCCC------
>Name1.1_90-32_-__Sp1
--------------CCDDDDD
>Name2.1_20-89_-__Sp2
AAAAAACCCCCCCCCCC----
>Name2.1_78-200_-__Sp2
-------CCCCCCCCCCDDDD
and the idea is to create a new file called file1.txt_Hsp such as:
>Name1.1-3HSPs-__Sp1
AAAAAACCCCCCCCCCDDDDD
>Name3.1_-__Sp2
AAAAAACCCCCCCCCCC----
>Name4.1_-__Sp2
-------CCCCCCCCCCCCCC
So basically the idea is to:
Compare each sequence from the same SpN <-- (here it is very important only with the same SpN name) with each other in file1.txt.
For instance I will have to compare :
Name1.1_1-40_-__Sp1 vs Name1.1_67-90_-__Sp1
Name1.1_1-40_-__Sp1 vs Name1.1_90-32_-__Sp1
Name1.1_67-90_-__Sp1 vs Name1.1_90-32_-__Sp1
Name2.1_20-89_-__Sp2 vs Name2.1_78-200_-__Sp2
So for exemple when I compare:
Name1.1_1-40_-__Sp1 vs Name1.1_67-90_-__Sp1 I get :
>Name1.1_1-40_-__Sp1
AAAAAACC-------------
>Name1.1_67-90_-__Sp1
------CCCCCCCCC------
here I want to concatenate the two sequences if ratio between number of letter matching with another letter / nb letter matching with a (-) is < 0.20`.
Here for example there are 21 characters, and the number of letter matching with another letter = 2 (C and C).
And the number of letter that match with a - , is 13 (AAAAAA+CCCCCCC)
so
ratio = 2/15 : 0.1538462
and if this ratio < 0.20 then I want to concatenate this 2 sequences such as :
>Name1.1-2HSPs_-__Sp1
AAAAAACCCCCCCCC------
(As you can se the name of the new seq is now : Name.1-2HSPs_-__Sp1 with the 2 meaning that there are 2 sequences concatenated) So we remove the number-number part for XHSPS with X being the number of sequence concatenated.
and get the file1.txt_Hsp :
>Name1.1-2HSPs_-__Sp1
AAAAAACCCCCCCCC------
>Name1.1_90-32_-__Sp1
--------------CCDDDDD
>Name2.1_20-89_-__Sp2
AAAAAACCCCCCCCCCC----
>Name2.1_78-200_-__Sp2
-------CCCCCCCCCCDDDD
Then I do it again with Name1.1-2HSPs_-__Sp1 vs Name1.1_90-32_-__Sp1
>Name1.1-2HSPs_-__Sp1
AAAAAACCCCCCCCC------
>Name1.1_90-32-__Sp1
--------------CCDDDDD
Where ratio = 1/20 = 0.05
Then because the ratio is < 0.20 I want to concatenate this 2 sequences such as :
>Name1.1-3HSPs_-__Sp1
AAAAAACCCCCCCCCCDDDDD
(As you can see the name of the new seq is now : Name.1-3HSPs_-__Sp1 with the 3 meaning that there are 3 sequences concatenated)
file1.txt_Hsp:
>Name1.1-3HSPs_-__Sp1
AAAAAACCCCCCCCCCDDDDD
>Name2.1_20-89_-__Sp2
AAAAAACCCCCCCCCCC----
>Name2.1_78-200_-__Sp2
-------CCCCCCCCCCDDDD
Then I do it again with Name2.1_20-89_-__Sp2 vs Name2.1_78-200_-__Sp2
>Name2.1_20-89_-__Sp2
AAAAAACCCCCCCCCCC----
>Name2.1_78-200_-__Sp2
-------CCCCCCCCCCDDDD
Where ratio = 10/11 = 0.9090909
Then because the ratio is > 0.20 I do nothing and get the final file1.txt_Hsp:
>Name1.1-3HSPs_-__Sp1
AAAAAACCCCCCCCCCDDDDD
>Name2.1_20-89_-__Sp2
AAAAAACCCCCCCCCCC----
>Name2.1_78-200_-__Sp2
-------CCCCCCCCCCDDDD
Which is the final result I needed.
A simplest exemple would be :
>Name1.1_10-60_-__Seq1
AAA------
>Name1.1_70-120_-__Seq1
--AAAAAAA
>Name2.1_12-78_-__Seq2
--AAAAAAA
The ratio is 1/8 = 0.125 because only 1 letter is matching and 8 because 8 letters are matching with a (-)
Because the ratio < 0.20 I concatenate the two sequences Seq1 to:
>Name1.1_2HSPs_-__Seq1
AAAAAAAAA
and the new file should be :
>Name1.1_2HSPs_-__Seq1
AAAAAAAAA
>Name2.1_-__Seq2
--AAAAAAA
** Here is an exemple from my real data **
>YP_009186705
MMSCQSWMMKYFTKVCNRSNLALPFDQSVNPVSFSMISSHDVMLKLDDEIFYKSLNQSNL
ALPFDQSVNPVSFSMISSHDLIA
>XO009980.1_26784332-20639090_-__Agapornis_vilveti
------------------------------------------------------LNQSNL
ALPFDQSVNPVSFSMISSHDLIA
>CM009917.1_20634332-20634508_-__Neodiprion_lecontei
---CDSWMIKFFARISQMC---IKIHSKYEEVSFFLFQSK--KKKIADSHFFRSLNQDTA
-------LNTVSY----------
>XO009980.1_20634508-20634890_-__Agapornis_vilveti
MMSCQSWMMKYFTKVCNRSNLALPFDQSVNPVSFSMISSHDVMLKL--------------
-----------------------
>YUUBBOX12
MMSCQSWMMKYFTKVCNRSNLALPFDQSVNPVSFSMISSHDVMLKLDDEIFYKSLNQSNL
ALPFDQSVNPVSFSMISSHDLIA
and I should get :
>YP_009186705
MMSCQSWMMKYFTKVCNRSNLALPFDQSVNPVSFSMISSHDVMLKLDDEIFYKSLNQSNL
ALPFDQSVNPVSFSMISSHDLIA
>XO009980.1_2HSPs_-__Agapornis_vilveti
MMSCQSWMMKYFTKVCNRSNLALPFDQSVNPVSFSMISSHDVMLKLLNQSNL
ALPFDQSVNPVSFSMISSHDLIA
>CM009917.1_20634332-20634508_-__Neodiprion_lecontei
---CDSWMIKFFARISQMC---IKIHSKYEEVSFFLFQSK--KKKIADSHFFRSLNQDTA
-------LNTVSY----------
>YUUBBOX12
MMSCQSWMMKYFTKVCNRSNLALPFDQSVNPVSFSMISSHDVMLKLDDEIFYKSLNQSNL
ALPFDQSVNPVSFSMISSHDLIA
the ratio between XO009980.1_26784332-20639090_-__Agapornis_vilveti and XO009980.1_20634508-20634890_-__Agapornis_vilveti was : 0/75 = 0
Here as you can see, some sequence does not have the [\d]+[-]+[\d] patterns such as YP_009186705 or YUUBBOX12, these one does not have to be concatenate, they juste have to be added in the outputfile.
Thanks a lot for your help.

First, let's read the text files into tuples of (name, seq):
with open('seq.txt', 'r+') as f:
lines = f.readlines()
seq_map = []
for i in range(0, len(lines), 2):
seq_map.append((lines[i].strip('\n'), lines[i+1].strip('\n')))
#[('>Name1.1_10-60_-__Seq1', 'AAA------'),
# ('>Name1.1_70-120_-__Seq1', '--AAAAAAA'),
# ('>Name2.1_12-78_-__Seq2', '--AAAAAAA')]
#
# or
#
# [('>Name1.1_1-40_-__Sp1', 'AAAAAACC-------------'),
# ('>Name1.1_67-90_-__Sp1', '------CCCCCCCCC------'),
# ('>Name1.1_90-32_-__Sp1', '--------------CCDDDDD'),
# ('>Name2.1_20-89_-__Sp2', 'AAAAAACCCCCCCCCCC----'),
# ('>Name2.1_78-200_-__Sp2', '-------CCCCCCCCCCDDDD')]
Then we define helper functions, one each for checking for a concat, then concat for seq, and merge for name (with a nest helper for getting HSPs counts):
import re
def count_num(x):
num = re.findall(r'[\d]+?(?=HSPs)', x)
count = int(num[0]) if num and 'HSPs' in x else 1
return count
def concat_name(nx, ny):
count, new_name = 0, []
count += count_num(nx)
count += count_num(ny)
for ind, x in enumerate(nx.split('_')):
if ind == 1:
new_name.append('{}HSPs'.format(count))
else:
new_name.append(x)
new_name = '_'.join([x for x in new_name])
return new_name
def concat_seq(x, y):
mash, new_seq = zip(x, y), ''
for i in mash:
if i.count('-') > 1:
new_seq += '-'
else:
new_seq += i[0] if i[1] == '-' else i[1]
return new_seq
def check_concat(x, y):
mash, sim, dissim = zip(x, y), 0 ,0
for i in mash:
if i[0] == i[1] and '-' not in i:
sim += 1
if '-' in i and i.count('-') == 1:
dissim += 1
return False if not dissim or float(sim)/float(dissim) >= 0.2 else True
Then we will write a script to run over the tuples in sequence, checking for spn matches, then concat_checks, and taking forward the new pairing for the next comparison, adding to the final list where necessary:
tmp_seq_map = seq_map[:]
final_seq = []
for ind in range(1, len(seq_map)):
end = True if ind == len(seq_map)-1 else False
pair_a = tmp_seq_map[ind-1]
pair_b = tmp_seq_map[ind]
name_a = pair_a[0][:]
name_b = pair_b[0][:]
if name_a.split('__')[1] == name_b.split('__')[1]:
if check_concat(pair_a[1], pair_b[1]):
new_name = concat_name(pair_a[0], pair_b[0])
new_seq = concat_seq(pair_a[1], pair_b[1])
tmp_seq_map[ind] = (((new_name, new_seq)))
if end:
final_seq.append(tmp_seq_map[ind])
end = False
else:
final_seq.append(pair_a)
else:
final_seq.append(pair_a)
if end:
final_seq.append(pair_b)
print(final_seq)
#[('>Name1.1_2HSPs_-__Seq1', 'AAAAAAAAA'),
# ('>Name2.1_12-78_-__Seq2', '--AAAAAAA')]
#
# or
#
#[('>Name1.1_3HSPs_-__Sp1', 'AAAAAACCCCCCCCCCDDDDD'),
# ('>Name2.1_20-89_-__Sp2', 'AAAAAACCCCCCCCCCC----'),
# ('>Name2.1_78-200_-__Sp2', '-------CCCCCCCCCCDDDD')]
Please note that I have checked for concatenation of only consecutive sequences from the text files, and that you would have to re-use the methods I've written in a different script for accounting for combinations. I leave that to your discretion.
Hope this helps. :)

You can do this as follows.
from collections import defaultdict
with open('lines.txt','r') as fp:
lines=fp.readlines()
dnalist = defaultdict(list)
for i,line in enumerate(lines):
line = line.replace('\n','')
if i%2: #'Name' in line:
dnalist[n].append(line)
else:
n = line.split('-')[-1]
That gives you a dictionary with keys being the file numbers and values being the dna sequences in a list.
def calc_ratio(str1,str2):
n_skipped,n_matched,n_notmatched=0,0,0
print(len(str1),len(str2))
for i,ch in enumerate(str1):
if ch=='-' or str2[i]=='-':
n_skipped +1
elif ch == str2[i]:
n_matched += 1
else:
n_notmatched+=1
retval = float(n_matched)/float(n_matched+n_notmatched+n_skipped)
print(n_matched,n_notmatched,n_skipped)
return retval
That gets you the ratio; you might want to consider the case where characters in the sequences dont match (and neither is '-'), here I assumed that's not a different case than one being '-'.
A helper function to concatenate the strings: here I took the case of non-matching chars and put in an 'X' to mark it (if it ever happens) .
def dna_concat(str1,str2):
outstr=[]
for i,ch in enumerate(str1):
if ch!=str2[i]:
if ch == '-':
outchar = str2[i]
elif str2[i] == '-':
outchar = ch
else:
outchar = 'X'
else:
outchar = ch
outstr.append(outchar)
outstr = ''.join(outstr)
return outstr
And finally a loop thru the dictionary lists to get the concatenated answers, in another dictionary with filenumbers as keys and lists of concatenations as values.
for filenum,dnalist in dnalist.items():
print(dnalist)
answers = defaultdict(list)
for i,seq in enumerate(dnalist):
for seq2 in dnalist[i+1:len(dnalist)]:
ratio = calc_ratio(seq,seq2)
print('i {} {} ration {}'.format(seq,seq2,ratio))
if ratio<0.2:
answers[filenum].append(dna_concat(seq,seq2))
print(dna_concat(seq,seq2))

Longest substring without repeating characters in python

This is a pretty standard interview question. Find the longest substring without repeating characters. Here are the test cases,
abcabcbb -> 3
bbbbb -> 1
pwwkew -> 3
bpfbhmipx -> 7
tmmzuxt -> 5
Here's my code which uses a pretty simple approach with 2 pointers.
def lengthOfLongestSubstring(s):
checklist = {}
starting_index_of_current_substring = 0
length_of_longest_substring = 0
for i, v in enumerate(s):
if v in checklist:
starting_index_of_current_substring = checklist[v] + 1
else:
length_of_current_substring = i - starting_index_of_current_substring + 1
length_of_longest_substring = max(length_of_current_substring, length_of_longest_substring)
checklist[v] = i
return length_of_longest_substring
My code passes all the test cases except the last one (actual 4, expected 5). Can someone help me modify the code to take care of the last test case. I don't wish to reinvent the algorithm.

Here is a simple tweak in your code with 2 pointers to find the longest sub-string without repeating characters.
Change in your code is instead of calculating the length of longest substring when v is not present in checklist, I am calculating length of longest substring for all cases.
def lengthOfLongestSubstring(s):
checklist = {}
starting_index_of_current_substring = 0
length_of_longest_substring = 0
for i, v in enumerate(s):
if v in checklist:
starting_index_of_current_substring = max(starting_index_of_current_substring, checklist[v] + 1)
checklist[v] = i
length_of_longest_substring = max(length_of_longest_substring, i - starting_index_of_current_substring + 1)
return length_of_longest_substring
## Main
result = {}
for string in ['abcabcbb', 'bbbbb', 'ppwwkew', 'wcabcdeghi', 'bpfbhmipx', 'tmmzuxt', 'AABGAKGIMN', 'stackoverflow']:
result[string] = lengthOfLongestSubstring(string)
print(result)
Sample run:
{'abcabcbb': 3, 'bbbbb': 1, 'ppwwkew': 3, 'wcabcdeghi': 8, 'bpfbhmipx': 7, 'tmmzuxt': 5, 'AABGAKGIMN': 6, 'stackoverflow': 11}

This post is pretty old, but I think my solution fixes the bug in the original code.
def lengthOfLongestSubstring(s):
checklist = {}
starting_index_of_current_substring = 0
length_of_longest_substring = 0
for i, v in enumerate(s):
if v in checklist:
if checklist[v] >= starting_index_of_current_substring:
starting_index_of_current_substring = checklist[v] + 1
length_of_current_substring = i - starting_index_of_current_substring + 1
length_of_longest_substring = max(length_of_current_substring, length_of_longest_substring)
checklist[v] = i
return length_of_longest_substring

This doesnt really iterate upon your solution, but it's a bit simpler approach, just to give you an idea how it could be also solved.
def longest_substr(s):
longest = 0
for start_index in range(len(s)):
contains = set()
for letter in s[start_index:]:
if letter in contains:
break
contains.add(letter)
longest = max(longest, len(contains))
return longest

0
I would prefer this solution>>
Time and Space Management Optimised:
def lengthOfLongestSubstring(self, s: str) -> int:
curlen = maxlen = 0 # curlen saves the max len of substrings ending with current num
for i, num in enumerate(s):
curlen -= s[i-curlen:i].find(num)
maxlen = max(maxlen, curlen)
return maxlen

Find Longest Substring in the string without repeating characters.
def find_non_repeating_substring(input_str):
output_length = 0
longest_sub_str = ''
len_str = len(input_str)
index = 0
while len_str != 1:
l_str = ''
for i in range(index, len(input_str)):
if input_str[i] not in l_str:
l_str = l_str + input_str[i]
else:
break
sub_str_length = len(l_str)
if sub_str_length > output_length:
output_length = sub_str_length
longest_sub_str = l_str
len_str = len_str -1
index = index + 1
return output_length,longest_sub_str
if __name__ == '__main__':
input_str = raw_input("Please enter the string: ")
sub_str_length, sub_str = find_non_repeating_substring(input_str)
print ('Longest Substing lenght is "{0}" and the sub string is "{1}"'.format(sub_str_length, sub_str))```

Count consecutive characters

How would I count consecutive characters in Python to see the number of times each unique digit repeats before the next unique digit?
At first, I thought I could do something like:
word = '1000'
counter = 0
print range(len(word))
for i in range(len(word) - 1):
while word[i] == word[i + 1]:
counter += 1
print counter * "0"
else:
counter = 1
print counter * "1"
So that in this manner I could see the number of times each unique digit repeats. But this, of course, falls out of range when i reaches the last value.
In the example above, I would want Python to tell me that 1 repeats 1, and that 0 repeats 3 times. The code above fails, however, because of my while statement.
How could I do this with just built-in functions?

Consecutive counts:
You can use itertools.groupby:
s = "111000222334455555"
from itertools import groupby
groups = groupby(s)
result = [(label, sum(1 for _ in group)) for label, group in groups]
After which, result looks like:
[("1": 3), ("0", 3), ("2", 3), ("3", 2), ("4", 2), ("5", 5)]
And you could format with something like:
", ".join("{}x{}".format(label, count) for label, count in result)
# "1x3, 0x3, 2x3, 3x2, 4x2, 5x5"
Total counts:
Someone in the comments is concerned that you want a total count of numbers so "11100111" -> {"1":6, "0":2}. In that case you want to use a collections.Counter:
from collections import Counter
s = "11100111"
result = Counter(s)
# {"1":6, "0":2}
Your method:
As many have pointed out, your method fails because you're looping through range(len(s)) but addressing s[i+1]. This leads to an off-by-one error when i is pointing at the last index of s, so i+1 raises an IndexError. One way to fix this would be to loop through range(len(s)-1), but it's more pythonic to generate something to iterate over.
For string that's not absolutely huge, zip(s, s[1:]) isn't a a performance issue, so you could do:
counts = []
count = 1
for a, b in zip(s, s[1:]):
if a==b:
count += 1
else:
counts.append((a, count))
count = 1
The only problem being that you'll have to special-case the last character if it's unique. That can be fixed with itertools.zip_longest
import itertools
counts = []
count = 1
for a, b in itertools.zip_longest(s, s[1:], fillvalue=None):
if a==b:
count += 1
else:
counts.append((a, count))
count = 1
If you do have a truly huge string and can't stand to hold two of them in memory at a time, you can use the itertools recipe pairwise.
def pairwise(iterable):
"""iterates pairwise without holding an extra copy of iterable in memory"""
a, b = itertools.tee(iterable)
next(b, None)
return itertools.zip_longest(a, b, fillvalue=None)
counts = []
count = 1
for a, b in pairwise(s):
...

A solution "that way", with only basic statements:
word="100011010" #word = "1"
count=1
length=""
if len(word)>1:
for i in range(1,len(word)):
if word[i-1]==word[i]:
count+=1
else :
length += word[i-1]+" repeats "+str(count)+", "
count=1
length += ("and "+word[i]+" repeats "+str(count))
else:
i=0
length += ("and "+word[i]+" repeats "+str(count))
print (length)
Output :
'1 repeats 1, 0 repeats 3, 1 repeats 2, 0 repeats 1, 1 repeats 1, and 0 repeats 1'
#'1 repeats 1'

Totals (without sub-groupings)
#!/usr/bin/python3 -B
charseq = 'abbcccdddd'
distros = { c:1 for c in charseq }
for c in range(len(charseq)-1):
if charseq[c] == charseq[c+1]:
distros[charseq[c]] += 1
print(distros)
I'll provide a brief explanation for the interesting lines.
distros = { c:1 for c in charseq }
The line above is a dictionary comprehension, and it basically iterates over the characters in charseq and creates a key/value pair for a dictionary where the key is the character and the value is the number of times it has been encountered so far.
Then comes the loop:
for c in range(len(charseq)-1):
We go from 0 to length - 1 to avoid going out of bounds with the c+1 indexing in the loop's body.
if charseq[c] == charseq[c+1]:
distros[charseq[c]] += 1
At this point, every match we encounter we know is consecutive, so we simply add 1 to the character key. For example, if we take a snapshot of one iteration, the code could look like this (using direct values instead of variables, for illustrative purposes):
# replacing vars for their values
if charseq[1] == charseq[1+1]:
distros[charseq[1]] += 1
# this is a snapshot of a single comparison here and what happens later
if 'b' == 'b':
distros['b'] += 1
You can see the program output below with the correct counts:
➜ /tmp ./counter.py
{'b': 2, 'a': 1, 'c': 3, 'd': 4}

You only need to change len(word) to len(word) - 1. That said, you could also use the fact that False's value is 0 and True's value is 1 with sum:
sum(word[i] == word[i+1] for i in range(len(word)-1))
This produces the sum of (False, True, True, False) where False is 0 and True is 1 - which is what you're after.
If you want this to be safe you need to guard empty words (index -1 access):
sum(word[i] == word[i+1] for i in range(max(0, len(word)-1)))
And this can be improved with zip:
sum(c1 == c2 for c1, c2 in zip(word[:-1], word[1:]))

If we want to count consecutive characters without looping, we can make use of pandas:
In [1]: import pandas as pd
In [2]: sample = 'abbcccddddaaaaffaaa'
In [3]: d = pd.Series(list(sample))
In [4]: [(cat[1], grp.shape[0]) for cat, grp in d.groupby([d.ne(d.shift()).cumsum(), d])]
Out[4]: [('a', 1), ('b', 2), ('c', 3), ('d', 4), ('a', 4), ('f', 2), ('a', 3)]
The key is to find the first elements that are different from their previous values and then make proper groupings in pandas:
In [5]: sample = 'abba'
In [6]: d = pd.Series(list(sample))
In [7]: d.ne(d.shift())
Out[7]:
0 True
1 True
2 False
3 True
dtype: bool
In [8]: d.ne(d.shift()).cumsum()
Out[8]:
0 1
1 2
2 2
3 3
dtype: int32

This is my simple code for finding maximum number of consecutive 1's in binaray string in python 3:
count= 0
maxcount = 0
for i in str(bin(13)):
if i == '1':
count +=1
elif count > maxcount:
maxcount = count;
count = 0
else:
count = 0
if count > maxcount: maxcount = count
maxcount

There is no need to count or groupby. Just note the indices where a change occurs and subtract consecutive indicies.
w = "111000222334455555"
iw = [0] + [i+1 for i in range(len(w)-1) if w[i] != w[i+1]] + [len(w)]
dw = [w[i] for i in range(len(w)-1) if w[i] != w[i+1]] + [w[-1]]
cw = [ iw[j] - iw[j-1] for j in range(1, len(iw) ) ]
print(dw) # digits
['1', '0', '2', '3', '4']
print(cw) # counts
[3, 3, 3, 2, 2, 5]
w = 'XXYXYYYXYXXzzzzzYYY'
iw = [0] + [i+1 for i in range(len(w)-1) if w[i] != w[i+1]] + [len(w)]
dw = [w[i] for i in range(len(w)-1) if w[i] != w[i+1]] + [w[-1]]
cw = [ iw[j] - iw[j-1] for j in range(1, len(iw) ) ]
print(dw) # characters
print(cw) # digits
['X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'z', 'Y']
[2, 1, 1, 3, 1, 1, 2, 5, 3]

A one liner that returns the amount of consecutive characters with no imports:
def f(x):s=x+" ";t=[x[1] for x in zip(s[0:],s[1:],s[2:]) if (x[1]==x[0])or(x[1]==x[2])];return {h: t.count(h) for h in set(t)}
That returns the amount of times any repeated character in a list is in a consecutive run of characters.
alternatively, this accomplishes the same thing, albeit much slower:
def A(m):t=[thing for x,thing in enumerate(m) if thing in [(m[x+1] if x+1<len(m) else None),(m[x-1] if x-1>0 else None)]];return {h: t.count(h) for h in set(t)}
In terms of performance, I ran them with
site = 'https://web.njit.edu/~cm395/theBeeMovieScript/'
s = urllib.request.urlopen(site).read(100_000)
s = str(copy.deepcopy(s))
print(timeit.timeit('A(s)',globals=locals(),number=100))
print(timeit.timeit('f(s)',globals=locals(),number=100))
which resulted in:
12.528256356999918
5.351301653001428
This method can definitely be improved, but without using any external libraries, this was the best I could come up with.

In python
your_string = "wwwwweaaaawwbbbbn"
current = ''
count = 0
for index, loop in enumerate(your_string):
current = loop
count = count + 1
if index == len(your_string)-1:
print(f"{count}{current}", end ='')
break
if your_string[index+1] != current:
print(f"{count}{current}",end ='')
count = 0
continue
This will output
5w1e4a2w4b1n

#I wrote the code using simple loops and if statement
s='feeekksssh' #len(s) =11
count=1 #f:0, e:3, j:2, s:3 h:1
l=[]
for i in range(1,len(s)): #range(1,10)
if s[i-1]==s[i]:
count = count+1
else:
l.append(count)
count=1
if i == len(s)-1: #To check the last character sequence we need loop reverse order
reverse_count=1
for i in range(-1,-(len(s)),-1): #Lopping only for last character
if s[i] == s[i-1]:
reverse_count = reverse_count+1
else:
l.append(reverse_count)
break
print(l)

Today I had an interview and was asked the same question. I was struggling with the original solution in mind:
s = 'abbcccda'
old = ''
cnt = 0
res = ''
for c in s:
cnt += 1
if old != c:
res += f'{old}{cnt}'
old = c
cnt = 0 # default 0 or 1 neither work
print(res)
# 1a1b2c3d1
Sadly this solution always got unexpected edge cases result(is there anyone to fix the code? maybe i need post another question), and finally timeout the interview.
After the interview I calmed down and soon got a stable solution I think(though I like the groupby best).
s = 'abbcccda'
olds = []
for c in s:
if olds and c in olds[-1]:
olds[-1].append(c)
else:
olds.append([c])
print(olds)
res = ''.join([f'{lst[0]}{len(lst)}' for lst in olds])
print(res)
# [['a'], ['b', 'b'], ['c', 'c', 'c'], ['d'], ['a']]
# a1b2c3d1a1

Here is my simple solution:
def count_chars(s):
size = len(s)
count = 1
op = ''
for i in range(1, size):
if s[i] == s[i-1]:
count += 1
else:
op += "{}{}".format(count, s[i-1])
count = 1
if size:
op += "{}{}".format(count, s[size-1])
return op

data_input = 'aabaaaabbaaaaax'
start = 0
end = 0
temp_dict = dict()
while start < len(data_input):
if data_input[start] == data_input[end]:
end = end + 1
if end == len(data_input):
value = data_input[start:end]
temp_dict[value] = len(value)
break
if data_input[start] != data_input[end]:
value = data_input[start:end]
temp_dict[value] = len(value)
start = end
print(temp_dict)

PROBLEM: we need to count consecutive characters and return characters with their count.
def countWithString(input_string:str)-> str:
count = 1
output = ''
for i in range(1,len(input_string)):
if input_string[i]==input_string[i-1]:
count +=1
else:
output += f"{count}{input_string[i-1]}"
count = 1
# Used to add last string count (at last else condition will not run and data will not be inserted to ouput string)
output += f"{count}{input_string[-1]}"
return output
countWithString(input)
input:'aaabbbaabbcc'
output:'3a3b2a2b2c'
Time Complexity: O(n)
Space Complexity: O(1)

temp_str = "aaaajjbbbeeeeewwjjj"
def consecutive_charcounter(input_str):
counter = 0
temp_list = []
for i in range(len(input_str)):
if i==0:
counter+=1
elif input_str[i]== input_str[i-1]:
counter+=1
if i == len(input_str)-1:
temp_list.extend([input_str[i - 1], str(counter)])
else:
temp_list.extend([input_str[i-1],str(counter)])
counter = 1
print("".join(temp_list))
consecutive_charcounter(temp_str)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to group genes regarding their id and position , python - python

Related

How to optimize the search algorithm for large 2d-array?

combinations with python

Concatenate several seq within a file according to their percentage of similarities

Longest substring without repeating characters in python

Count consecutive characters

Categories

Resources