I'm working on an OCR use case and have identified common misclassification from the confusion matrix which is for example: '1' being confused for 'J' and '2' being confused with 'Z' and 'J'.
For a given word, I am trying to create a python script which would create all the permutations which account for all the misclassification.
Example:
Common Misclassifications: {'1':['J'],'2':['Z','J']}
Input: "AB1CD2"
Output: AB1CD2, AB1CDZ, ABJCD2, ABJCDZ, AB1CDJ, ABJCDJ
How do I go about solving this?
You get a neat solution by using a dictionary of all possible classifications, not just all mis-classifications. That is, you first "enrich" your misclassification dictionary with all possible correct classifications.
from itertools import product
# Alphabet of characters the OCR model can emit.
all_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
# Observed confusions: true character -> characters it is commonly misread as.
common_misclass = {'1': ['J'], '2': ['Z', 'J']}
input_string = "AB1CD2"

# "Enrich" the confusion table so every character maps to all of its possible
# readings, with the correct reading first.
common_class = {
    char: [char] + common_misclass.get(char, [])
    for char in all_characters
}

# One candidate list per input position; their Cartesian product is every
# possible reading of the word.  A character outside all_characters can only
# be itself, so fall back to [letter] instead of raising KeyError.
possible_outputs = [
    "".join(tup)
    for tup in product(
        *(common_class.get(letter, [letter]) for letter in input_string)
    )
]
print(possible_outputs)
itertools product should help
from itertools import product
# Confusion table: true character -> characters it is commonly misread as.
misclass = {'1': ['J'], '2': ['Z', 'J']}
# One tuple of candidate readings per confusable character, true reading first.
misclass_items = [
    (true_char, *confusions) for true_char, confusions in misclass.items()
]
# Splice every combination of readings into the fixed "AB?CD?" template.
print([f"AB{first}CD{second}" for first, second in product(*misclass_items)])
# ['AB1CD2', 'AB1CDZ', 'AB1CDJ', 'ABJCD2', 'ABJCDZ', 'ABJCDJ']
Related
I just started using functions and I'm trying to build one that finds the longest repeated substring whose length is at least k and returns the result as a tuple that contains a dict.
the keys needs to be the substring and the value is how many times it was repeated, and then add to the tuple the length of the substring.
I made a start but didn't really know how to continue; this is what I tried to do:
def longest_repeat(string, K):
    """Find the longest substring(s) of at least K characters that repeat.

    A substring "repeats" when it occurs more than once in ``string``;
    overlapping occurrences are counted.

    Args:
        string: Text to search.
        K: Minimum length of a substring to consider.

    Returns:
        A tuple ``(repeats, length)`` where ``repeats`` maps each repeated
        substring of the greatest qualifying length to its occurrence count
        and ``length`` is that length.  ``({}, 0)`` is returned when nothing
        repeats or when the arguments are not a ``str`` and a positive ``int``.
    """
    from collections import Counter

    if not (isinstance(K, int) and isinstance(string, str)) or K < 1:
        return {}, 0

    # Try the longest window first so the first hit is the answer.  A window
    # as long as the whole string can never occur twice, hence len - 1.
    for length in range(len(string) - 1, K - 1, -1):
        windows = Counter(
            string[start:start + length]
            for start in range(len(string) - length + 1)
        )
        repeated = {sub: count for sub, count in windows.items() if count > 1}
        if repeated:
            return repeated, length
    return {}, 0
I'm sorry if it's a little bit messed up; I didn't really have a direction, and I tried to reuse another function I had built to solve this, but it didn't really work. I can clarify more if needed.
the output should be something like this:
longest_repeated("ATAATACATAATA", 5)
output: longest = {ATAATA: 2} , 6
Really appreciate any kind of help! Thanks!
You can try re module:
import re
def longest_repeated(s, k):
    """Return the longest substring of at least ``k`` characters that recurs.

    A candidate must occur again later in ``s`` without overlapping its first
    occurrence.

    Args:
        s: Text to search ('.' is used, so candidates never span a newline).
        k: Minimum candidate length.

    Returns:
        ``({substring: s.count(substring)}, len(substring))`` for the longest
        candidate (the leftmost one on ties), or ``None`` when nothing of
        length >= k repeats.
    """
    # The whole pattern is wrapped in a lookahead so matches are zero-width:
    # findall then tests *every* start position.  A consuming match could
    # swallow the first characters of the true longest repeat and report a
    # shorter one instead.
    pattern = re.compile(rf"(?=(.{{{k},}}).*\1)")
    candidates = pattern.findall(s)
    if not candidates:
        return None
    mx = max(candidates, key=len)
    return {mx: s.count(mx)}, len(mx)
Some tests:
print(longest_repeated("ATAATACATAATA", 5))
({'ATAATA': 2}, 6)
print(longest_repeated("XXXXXATAATACATAATAXXXXX", 5))
({'ATAATA': 2}, 6)
I'm a newbie in Python, and I need to find the most frequent element among the pdInput values collected in the list mostFreqenNum, and how many times that element occurs in the list.
# Reads a user-chosen number of integers from stdin into mostFreqenNum, then
# loops over them calling count() for each value.
# NOTE(review): the original indentation was lost in this paste; the loop
# bodies below are shown exactly as found.
mostFreqenNum = []
contMostnum = [0]
# How many values to read.
ContTraining = int(input('How many time You like to Train you input: '))
for i in range(ContTraining):
pdInput = int(
input('Please input your number whatever you want: '))
mostFreqenNum.append(pdInput)
for x in mostFreqenNum:
# NOTE(review): this counts x inside contMostnum, which only ever holds [0],
# not inside mostFreqenNum - presumably mostFreqenNum.count(x) was intended.
# The result is also overwritten on every pass and never reported.
coutFreqenNum = contMostnum.count(x)
given a list of values inp, you can find the most common like this:
using collections.Counter
from collections import Counter
most_common = Counter(inp).most_common(1)
output is a tuple with (value, count) inside
using sorted
sorted(inp, key=lambda x: inp.count(x), reverse=True)[0]
output is the most common value in the list
using numpy: # note only works with numeric values
np.argmax(np.bincount(inp))
output is the most common value in the list
one more using builtins:
max(set(inp), key=inp.count)
output is the most common value in the list
another using pandas:
import pandas as pd
pd.value_counts(inp).index[0]
output is the most common value in the list
Why don't you use Python's built-in statistics module?
you can use the module like these :
import statistics
### your input code
mode = statistics.mode(mostFreqenNum)
print(mode)
mode() receive parameter list type.
Then you can use the count().
Another example, maybe like these:
>>> import statistics
>>> lists = [2,3,2,2,3,4,5]
>>> mode = statistics.mode(lists)
>>> print(mode)
2
>>> lists.count(2)
3
>>>
I am not sure what you are trying to do exactly, but maybe this could work:
# Tally how often each value occurs, then pick the value with the highest tally.
mostFreqenNum = {}   # value -> number of occurrences
contMostnum = 0      # highest occurrence count seen so far
mostFreqKey = None   # value holding that count; stays None for an empty list
myList = [1, 2, 3, 2, 4, 3, 2, 3, 5, 3]

for i in myList:
    mostFreqenNum[i] = mostFreqenNum.get(i, 0) + 1

# Strict '>' keeps the first value that reached the maximum when there is a tie.
for x in mostFreqenNum:
    if mostFreqenNum[x] > contMostnum:
        contMostnum = mostFreqenNum[x]
        mostFreqKey = x

print(f'Most frequent key, {mostFreqKey}, seen {contMostnum} times.')
# Reads a user-chosen number of integers from stdin, tracks their statistical
# mode, prints a summary and returns a percentage as a string.
# NOTE(review): indentation was lost in this paste, so it cannot be told from
# here whether the mode/if lines run once per input or once after the loop.
# NOTE(review): relies on a module-level `import statistics` that is not
# visible next to this function.
def Prediction_Model_v3():
# alnv3[0] collects the raw inputs; alnv3[1] grows by one entry each time the
# latest input equals the current mode.
alnv3 = [[],[]]
inpv3 = int(input('How many time You like to Train you input V3: '))
for i in range(inpv3):
pdInpv3 = int(
input('V3 input number whatever you want: '))
alnv3[0].append(pdInpv3)
mdv3 = statistics.mode(alnv3[0])
if(pdInpv3 == mdv3):
alnv3[1].append(str(len(alnv3[1])))
print('numberInput V3: ', alnv3[0])
print('Most Frequent number V3 is ', str(mdv3), ':', str(len(alnv3[1])))
# Share (in percent) of inputs that did NOT match the mode when entered.
# NOTE(review): divides by inpv3, so an answer of 0 raises ZeroDivisionError.
pdtISv3 = (((inpv3-int(len(alnv3[1])))*100)/inpv3)
print('Result of prediction V3 is: ', str(
mdv3), '=', str(pdtISv3), '%')
alnv3.clear()
return str(pdtISv3)
import collections
from typing import Counter
numbers = [1, 3, 7, 4, 3, 0, 3, 6, 3]
# Build the tally with collections.Counter itself; typing.Counter is only a
# (deprecated) annotation alias and should not be used to construct objects.
# most_common() returns (value, count) pairs, most frequent first.
c = collections.Counter(numbers).most_common()
print(f"The most frequent number {c[0][0]} was {c[0][1]} times repeated")
I am looking to be able to recursively remove adjacent letters in a string that differ only in their case e.g. if s = AaBbccDd i would want to be able to remove Aa Bb Dd but leave cc.
I can do this recursively using lists:
I think it ought to be possible using a regex, but I am struggling:
with test string 'fffAaaABbe' the answer should be 'fffe' but the regex I am using gives 'fe'
def test(line):
res = re.compile(r'(.)\1{1}', re.IGNORECASE)
#print(res.search(line))
while res.search(line):
line = res.sub('', line, 1)
print(line)
The way that works is:
def test(line):
result =''
chr = list(line)
cnt = 0
i = len(chr) - 1
while i > 0:
if ord(chr[i]) == ord(chr[i - 1]) + 32 or ord(chr[i]) == ord(chr[i - 1]) - 32:
cnt += 1
chr.pop(i)
chr.pop(i - 1)
i -= 2
else:
i -= 1
if cnt > 0: # until we can't find any duplicates.
return test(''.join(chr))
result = ''.join(chr)
print(result)
Is it possible to do this using a regex?
re.IGNORECASE is not the way to solve this problem, as it treats aa, Aa, aA and AA the same way. Technically it is possible using re.sub, in the following way.
import re
# Alternation of every opposite-case ASCII pair: Aa|aA|Bb|bB|...|Zz|zZ.
# Upper- and lowercase ASCII letters are 32 code points apart.
_CASE_PAIRS = re.compile('|'.join(
    f'{chr(code)}{chr(code + 32)}|{chr(code + 32)}{chr(code)}'
    for code in range(ord('A'), ord('Z') + 1)
))


def remove_case_pairs(text):
    """Return ``text`` with adjacent opposite-case letter pairs removed.

    A single substitution pass misses pairs that only become adjacent after
    an inner pair is deleted (``abBA`` -> ``aA``), so substitute until a pass
    removes nothing.
    """
    while True:
        text, removed = _CASE_PAIRS.subn('', text)
        if not removed:
            return text


txt = 'fffAaaABbe'
after_sub = remove_case_pairs(txt)
print(after_sub) # fffe
Note that I explicitly defined all possible letters pairs, because so far I know there is no way to say "inverted case letter" using just re pattern. Maybe other user will be able to provide more concise re-based solution.
I suggest a different approach which uses groupby to group adjacent similar letters:
from itertools import groupby
def test(line):
res = []
for k, g in groupby(line, key=lambda x: x.lower()):
g = list(g)
if all(x == x.lower() for x in g):
res.append(''.join(g))
print(''.join(res))
Sample run:
>>> test('AaBbccDd')
cc
>>> test('fffAaaABbe')
fffe
r'(.)\1{1}' is wrong because it will match any character that is repeated twice, including non-letter characters. If you want to stick to letters, you can't use this.
However, even if we just do r'[A-z]\1{1}', this would still be bad because you would match any sequence of the same letter twice, but it would catch xx and XX -- you don't want to match consecutive same characters with matching case, as you said in the original question.
It just so happens that there is no short-hand to do this conveniently, but it is still possible. You could also just write a small function to turn it into a short-hand.
Building on @Daweo's answer, you can generate the regex pattern needed to match pairs of the same letter in non-matching case, which gives the final pattern aA|Aa|bB|Bb|cC|Cc|dD|Dd|eE|Ee|fF|Ff|gG|Gg|hH|Hh|iI|Ii|jJ|Jj|kK|Kk|lL|Ll|mM|Mm|nN|Nn|oO|Oo|pP|Pp|qQ|Qq|rR|Rr|sS|Ss|tT|Tt|uU|Uu|vV|Vv|wW|Ww|xX|Xx|yY|Yy|zZ|Zz:
import re
import string
def consecutiveLettersNonMatchingCase():
    """Return a regex alternation matching a letter next to its opposite case.

    Returns:
        The string ``aA|Aa|bB|Bb|...|zZ|Zz`` covering the 26 ASCII letters.
    """
    # Walk the lower- and uppercase alphabets in lock-step and emit both
    # orderings ('xX' and 'Xx') for every letter.
    return '|'.join(
        f'{lower}{upper}|{upper}{lower}'
        for lower, upper in zip(string.ascii_lowercase, string.ascii_uppercase)
    )
def test(line):
    """Print the first opposite-case pair found in ``line``, then the reduced line.

    The first print shows the ``re.Match`` for the first pair (or ``None``);
    the second shows ``line`` after all such pairs have been removed one at a
    time.

    Args:
        line: Text to reduce.
    """
    res = re.compile(consecutiveLettersNonMatchingCase())
    print(res.search(line))
    # subn reports whether it replaced anything, so the string is scanned
    # once per removal instead of once for search() and again for sub().
    while True:
        line, removed = res.subn('', line, count=1)
        if not removed:
            break
    print(line)
print(consecutiveLettersNonMatchingCase())
I'm setting up a simple sentence generator in python, to create as many word combinations as possible to describe a generic set of images involving robots. (Its a long story :D)
It outputs something like this: 'Cyborg Concept Downloadable Illustration'
Surprisingly, the random generator I wrote only produces up to 255 unique combinations. Here is the script:
import numpy
from numpy import matrix
from numpy import linalg
import itertools
from pprint import pprint
import random
m = matrix( [
['Robot','Cyborg','Andoid', 'Bot', 'Droid'],
['Character','Concept','Mechanical Person', 'Artificial Intelligence', 'Mascot'],
['Downloadable','Stock','3d', 'Digital', 'Robotics'],
['Clipart','Illustration','Render', 'Image', 'Graphic'],
])
used = []
i = 0
def make_sentence(m, used):
    """Build one random sentence by picking one word from every row of ``m``.

    Args:
        m: 2-D word matrix exposing ``.shape`` and ``m[row, col]`` indexing
            (e.g. a ``numpy.matrix``).
        used: Unused; kept so existing callers keep working.

    Returns:
        The chosen words joined by single spaces.
    """
    rows, cols = m.shape
    # randrange(cols) covers every column.  The previous randrange(0, 4)
    # could never select the fifth word of a row, which capped the number of
    # distinct sentences at 4**4 instead of 5**4.
    return ' '.join(m[row, random.randrange(cols)] for row in range(rows))
def is_used(sentence, used):
    """Return True when ``sentence`` is already present in ``used``."""
    return sentence in used
# Collect up to 1001 distinct sentences, printing each new one with its index.
sentences = []
i = 0
# Only finitely many sentences exist, so once they are all used the generator
# can only return duplicates.  Give up after this many duplicates in a row
# instead of spinning forever.
max_consecutive_duplicates = 10000
consecutive_duplicates = 0
while i <= 1000 and consecutive_duplicates < max_consecutive_duplicates:
    sentence = make_sentence(m, used)
    if is_used(sentence, used):
        consecutive_duplicates += 1
        continue
    consecutive_duplicates = 0
    sentences.append(sentence)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(str(i) + ' ' + sentence)
    used.append(sentence)
    i = i + 1
Using randint instead of randrange, I get up to 624 combinations (instantly) then it hangs in an infinite loop, unable to create more combos.
I guess the question is, is there a more appropriate way of determining all possible combinations of a matrix?
You can use itertools to get all possible combinations of the matrix. Here is one example showing how itertools works.
import itertools
# One row per sentence position; each sentence takes exactly one word per row.
mx = [
    ['Robot','Cyborg','Andoid', 'Bot', 'Droid'],
    ['Character','Concept','Mechanical Person', 'Artificial Intelligence', 'Mascot'],
    ['Downloadable','Stock','3d', 'Digital', 'Robotics'],
    ['Clipart','Illustration','Render', 'Image', 'Graphic'],
]
# product(*mx) yields every way of choosing one word from each row (5**4 = 625
# tuples).  print(...) with a single argument works on Python 2 and 3 alike.
for combination in itertools.product(*mx):
    print(combination)
Your code can make use of recursion. Without itertools, here is one strategy:
def make_sentences(m, choices=None):
    """Recursively build every sentence that takes one word from each row.

    Args:
        m: Sequence of rows, each a sequence of words (e.g. a list of lists).
        choices: Column indices already chosen for the first
            ``len(choices)`` rows.  Callers normally omit it.

    Returns:
        A list of sentences.  Each word is preceded by a single space, so
        every sentence starts with a space (kept from the original output
        format).
    """
    # A fresh list per call instead of a shared mutable default.
    choices = [] if choices is None else choices

    # Base case: a column has been chosen for every row.
    if len(choices) == len(m):
        sentence = "".join(" " + row[col] for row, col in zip(m, choices))
        return [sentence]  # must be returned as a list

    # Recursive case: try every column of the next undecided row.  Using the
    # real row length (rather than a fixed 4) includes the last word of
    # five-word rows and supports any number of rows.
    output = []
    for col in range(len(m[len(choices)])):
        output += make_sentences(m, choices + [col])
    return output
This is quite different from your original function.
The choices list keeps track of the index of the column that has been selected for each ROW in m. When the recursive method finds that a choice has been made for all four rows, it outputs a list with just ONE sentence.
Where the method finds that the choices list doesn't have four elements, it recursively calls itself for FOUR new choices lists. The results of these recursive calls are added to the output list.
For example,
The function could be something like def RandABCD(n, .25, .34, .25, .25):
Where n is the length of the string to be generated and the following numbers are the desired probabilities of A, B, C, D.
I would imagine this is quite simple, however i am having trouble creating a working program. Any help would be greatly appreciated.
Here's the code to select a single weighted value. You should be able to take it from here. It uses bisect and random to accomplish the work.
from bisect import bisect
from random import random
def WeightedABCD(*weights):
    """Return one of 'A', 'B', 'C', 'D' chosen with the given probabilities.

    Args:
        *weights: Probabilities of A, B, C and D, in that order; expected to
            sum to 1.  Missing trailing weights count as 0.

    Returns:
        A single-character string.
    """
    chars = 'ABCD'
    # Running totals: letter i owns the interval [breakpoints[i-1], breakpoints[i]).
    breakpoints = []
    running = 0
    for weight in weights[:len(chars)]:
        running += weight
        breakpoints.append(running)
    breakpoints += [running] * (len(chars) - len(breakpoints))

    index = bisect(breakpoints, random())
    # If the weights sum to less than 1 (by intent or float rounding) the
    # draw can land past the last breakpoint; clamp instead of raising
    # IndexError.
    return chars[min(index, len(chars) - 1)]
Call it like this: WeightedABCD(.25, .34, .25, .25).
EDIT: Here is a version that works even if the weights don't add up to 1.0:
from bisect import bisect_left
from random import uniform
def WeightedABCD(*weights):
    """Return one of 'A', 'B', 'C', 'D' with probability proportional to its weight.

    The weights do not need to sum to 1.

    Args:
        *weights: Relative weights of A, B, C and D, in that order.  Missing
            trailing weights count as 0; extra weights are ignored.

    Returns:
        A single-character string.

    Raises:
        ValueError: On current Python versions, when all weights are zero.
    """
    from random import choices

    chars = 'ABCD'
    # Pad/truncate to exactly one weight per letter, as choices() requires.
    padded = (list(weights) + [0] * len(chars))[:len(chars)]
    # random.choices does the cumulative-sum + bisect selection itself and
    # never returns a zero-weight letter.
    return choices(chars, weights=padded)[0]
The random class is quite powerful in python. You can generate a list with the characters desired at the appropriate weights and then use random.choice to obtain a selection.
First, make sure you do an import random.
For example, let's say you wanted a truly random string from A,B,C, or D.
1. Generate a list with the characters
li = ['A','B','C','D']
Then obtain values from it using random.choice
output = "".join([random.choice(li) for i in range(0, n)])
You could easily make that a function with n as a parameter.
In the above case, you have an equal chance of getting A,B,C, or D.
You can use duplicate entries in the list to give characters higher probabilities. So, for example, let's say you wanted a 50% chance of A and 25% chances of B and C respectively. You could have an array like this:
li = ['A','A','B','C']
And so on.
It would not be hard to parameterize the characters coming in with desired weights, to model that I'd use a dictionary.
characterbasis = {'A':25, 'B':25, 'C':25, 'D':25}
Make that the first parameter, and the second being the length of the string and use the above code to generate your string.
For four letters, here's something quick off the top of my head:
from random import random
def randABCD(n, pA, pB, pC, pD):
    """Return a random string of ``n`` letters drawn from A, B, C and D.

    Args:
        n: Length of the string to generate.
        pA, pB, pC, pD: Probabilities of each letter; assumed to sum to 1.
            ``pD`` is implied by the other three and is not read.

    Returns:
        A string of length ``n`` (empty when ``n`` is 0).
    """
    # Cumulative thresholds: a draw below cA is 'A', below cB is 'B', and so on.
    cA = pA
    cB = cA + pB
    cC = cB + pC

    def choose():
        r = random()
        if r < cA:
            return 'A'
        if r < cB:
            return 'B'
        if r < cC:
            return 'C'
        return 'D'

    # range() rather than the Python-2-only xrange().
    return ''.join(choose() for _ in range(n))
I have no doubt that this can be made much cleaner/shorter, I'm just in a bit of a hurry right now.
The reason I wouldn't be content with David in Dakota's answer of using a list of duplicate characters is that depending on your probabilities, it may not be possible to create a list with duplicates in the right numbers to simulate the probabilities you want. (Well, I guess it might always be possible, but you might wind up needing a huge list - what if your probabilities were 0.11235442079, 0.4072777384, 0.2297927874, 0.25057505341?)
EDIT: here's a much cleaner generic version that works with any number of letters with any weights:
from bisect import bisect
from random import uniform
def rand_string(n, content):
    """Create a string of letters (or substrings) chosen independently
    with specified probabilities.

    Args:
        n: Desired number of elements in the string.
        content: Dictionary mapping each substring to its "weight", which is
            proportional to its probability.  The weights need not sum to 1.

    Returns:
        The ``n`` chosen substrings concatenated.
    """
    from random import choices

    # random.choices performs the cumulative-weight lookup itself, works on
    # Python 3 (no iteritems/xrange) and cannot index past the end when the
    # draw hits the upper bound.
    return ''.join(
        choices(list(content), weights=list(content.values()), k=n)
    )
Here is a rough idea of what might suit you
import random as r
def distributed_choice(probs):
    """Pick one object at random according to its probability.

    Args:
        probs: Iterable of ``(object, probability)`` pairs whose
            probabilities are assumed to sum to 1.

    Returns:
        The chosen object.  If rounding leaves the draw just past the final
        cumulative total, the last object is returned; ``None`` when
        ``probs`` is empty.
    """
    # Must not be named ``r``: assigning to ``r`` here would make it a local
    # and hide the ``random`` module alias, raising UnboundLocalError.
    draw = r.random()
    cum = 0.0
    last = None
    for pair in probs:
        last = pair[0]
        cum += pair[1]
        if draw < cum:
            return pair[0]
    return last
The parameter probs takes a list of pairs of the form (object, probability). It is assumed that the sum of probabilities is 1 (otherwise, its trivial to normalize).
To use it just execute:
''.join([distributed_choice(probs)]*4)
Hmm, something like:
import random
class RandomDistribution:
    """Weighted random selection over the keys of a ``{key: weight}`` mapping."""

    def __init__(self, kv):
        """Precompute the cumulative weight at which each entry starts.

        Args:
            kv: Mapping from entry to its (non-negative) weight.
        """
        # list(...) so entries can be indexed; dict.keys() is only a view on
        # Python 3.
        self.entries = list(kv)
        # where[i] is the start of entry i's interval; the final element is
        # the total weight.
        self.where = []
        cnt = 0
        for x in self.entries:
            self.where.append(cnt)
            cnt += kv[x]
        self.where.append(cnt)

    def find(self, key):
        """Return the entry whose interval contains ``key``.

        Binary-searches ``where`` for the last start offset that is <= key.
        """
        lo, hi = 0, len(self.where) - 1
        while lo + 1 < hi:
            # Floor division keeps the midpoint an int on Python 3.
            mid = (lo + hi) // 2
            if self.where[mid] <= key:
                lo = mid
            else:
                hi = mid
        return self.entries[lo]

    def randomselect(self):
        """Return a random entry, with probability proportional to its weight."""
        return self.find(random.random() * self.where[-1])
# Demo: draw 1000 weighted samples and print each one.
rd = RandomDistribution( {"foo": 5.5, "bar": 3.14, "baz": 2.8 } )
for x in range(1000):
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(rd.randomselect())
should get you most of the way...
Thank you all for your help, I was able to figure something out, mostly with this info.
For my particular need, I did something like this:
import random
#Create a function to randomize a given string
def makerandom(seq):
    """Return the characters of ``seq`` joined together in a random order."""
    # Sampling len(seq) items without replacement is a full shuffle.
    shuffled = random.sample(seq, len(seq))
    return ''.join(shuffled)
def randomDNA(n, probA=0.25, probC=0.25, probG=0.25, probT=0.25):
    """Return a random DNA string of length ``n`` with fixed base proportions.

    Each base appears ``int(n * prob)`` times; positions left over by that
    truncation are filled with uniformly random bases, and the whole string
    is then shuffled.

    Args:
        n: Length of the sequence.
        probA, probC, probG, probT: Target fraction of each base.

    Returns:
        The shuffled sequence.
    """
    A = int(n * probA)
    C = int(n * probC)
    T = int(n * probT)
    G = int(n * probG)
    # The remainder makes sure all n positions are used, as one cannot have
    # half an A, for example.  (Building it previously failed with a
    # NameError because of a misspelt variable.)
    remainder = ''.join(
        random.choice("ATGC") for _ in range(n - (A + G + C + T))
    )
    notrandom = 'A' * A + 'C' * C + 'G' * G + 'T' * T + remainder
    return makerandom(notrandom)