Markov Model - Random word/gibberish generator - python

My code works fine until the random word generating. Sometimes it creates words/gibberish and sometimes it doesn't (probably going through an infinite loop). However, whenever it does create words/gibberish it doesn't seem so "random". The words would either repeat themselves or most of the words will be generating near the same character length.
The problem lies in the def genRandomWord:
import random
def getTransitions(astring):
d = {}
for i in range(len(astring)):
if astring[i:i+2] in d:
d[astring[i:i+2]] += 1
else:
d[astring[i:i+2]] = 1
#h = tuple(d.items()) #gets the indexes of the dictionary
#print(h[2][1])
if ' ' in d:
del d[' ']
return d
def getFirstLetters(astring):
d = []
for i in astring:
if i not in d:
d.append(i)
d.remove(' ')
return d
def letterCount(astring):
d = {}
for i in astring:
if i not in d.keys():
d[i] = 1
else:
d[i] +=1
d[' ']-= 1
return d
def getProb(astring):
d = {}
h = tuple(getTransitions(astring).items())
j = tuple(letterCount(astring).items())
#print("h", h)
#print()
#print()
#print("j", j)
for i in h:
for n in j:
if i[0][0] == n[0]:
d[i[0]] = i[1]/n[1]
return d
def genFletter(astring):
d = {}
r = random.random()
fl ='*'
#print("r",r)
a = getProb(astring)
suma = 0
count = -1
for i in a:
if i[0][0] == ' ':
d[i[1]] = a[i]
d = sorted(tuple(d.items()))
#print(d)
while suma < r:
count += 1
suma += d[count][1]
fl = d[count][0]
#print(suma)
return fl
def genRandomWord(astring):
h = getProb(The_List)
htrans = tuple(getProb(The_List).keys())
hprob = tuple(getProb(The_List).values())
#print(hprob)
z = genFletter(The_List)
word = z
#print(word)
fletterprob = h[' '+z]
r = random.random()
while word[-1]!= ' ':
index = 0
suma = 0
for i in range(len(htrans)):
if htrans[i][0] == word[-1]:
index = i
suma += hprob[index]
for j in range(len(hprob)):
if suma >= r:
word += htrans[index][1]
break
else:
suma += hprob[index]
return word
The_List = ' steam teams meets teems eat ate state tease test mast mates '
trans = getTransitions(The_List)
lcount = letterCount(The_List)
fletter = getFirstLetters(The_List)
transProb = getProb(The_List)
#Sorting
#print('LETTER TRANSITIONS'+'\n'+str(sorted(trans.items()))+'\n')
#print('LETTER COUNT'+'\n'+str(sorted(lcount.items()))+'\n')
#print('FIRST LETTERS'+'\n'+str(sorted(fletter))+'\n')
#print('TRANSITION PROBABILITIES'+'\n'+str(sorted(transProb.items()))+'\n')
print('LETTER TRANSITIONS'+'\n'+str(trans)+'\n')
print('LETTER COUNT'+'\n'+str(lcount)+'\n')
print('FIRST LETTERS'+'\n'+str(fletter)+'\n')
print('TRANSITION PROBABILITIES'+'\n'+str(transProb)+'\n')
#print(genFletter(The_List))
for i in range(10):
print("'"+genRandomWord(The_List)+"'")

Related

How to replace an old value in a string with a new one?

So the problem states that if the given word in a sentence begins and ends with the same character I remove that character and keep on doing that until they are not the same anymore or their length is less than 3.
Example: aabaa -> aba -> b
And that is not the problem, so now I should replace the word 'aabaa' with 'b' in original string.
The only problem i have is when the sentence is given without spaces. For example:
Trust,is,all. -> rus,is,all.
Also to note characters such as (. , ! ? ; :) are to be ignored but have to be there in the final output.
So far I've wrote this but it doesn't satisfy the example above:
s1 = str(input())
sentence = s1.split()
news = []
ignorabelCharacters = ['.',',','!','?',';',':']
helpList = []
for i in range(len(s1)):
if s1[i] in ignorabelCharacters:
helpList.append([s1[i],i])
for i in sentence:
i = str(i)
j = 0
while j < len(i):
if i[j] in ignorabelCharacters:
i = i.replace(i[j],' ').strip()
j+=1
else:j+=1
news.append(i)
s2 =' '.join(news)
newSentence = s2.split()
def checkAgain(newSentence):
newNewSentance = []
count = []
x=0
for i in newSentence:
j = len(i)
while j > 2:
if i[0].lower() == i[-1] or i[0].upper() == i[-1]:
i = i[1:-1]
j-=2
x+=2
else:
break
count.append(x)
newNewSentance.append(i)
x=0
return [newNewSentance,count]
newNewSentence = checkAgain(newSentence)[0]
count = checkAgain(newSentence)[1]
def finalProcessing(newNewSentence,sentence,newSentence):
finalSentence = []
for i in range(len(sentence)):
if len(newNewSentence[i]) == len(sentence[i]):
finalSentence.append(newNewSentence[i])
else:
x = len(sentence[i]) - len(newSentence[i])
if x ==0:
finalSentence.append(newNewSentence[i])
else:
value = newNewSentence[i] + sentence[i][-x:]
finalSentence.append(value)
return finalSentence
finalSentence = finalProcessing(newNewSentence,sentence,newSentence)
def finalPrint(finalSentece):
for i in range(len(finalSentece)):
if i == len(finalSentece) -1:
print(finalSentece[i],end='')
else:
print(finalSentece[i],end= ' ')
finalPrint(finalSentence)
This approach is different from yours but here is how I would do it.
def condenseWord(w):
ln = len(w)
while ln >= 3:
if w[0].lower() == w[-1].lower():
w = w[1:-1]
ln = len(w)
else:
break
return w
def collapseString(s):
sep_chars = {' ', ',', '.', ';', ':', '!', '"', "'"}
out = r''
wdstrt = 0
lstltr = 0
for i in range(len(s)):
if s[i] in sep_chars:
ck = condenseWord(s[wdstrt:i])
out += ck
out += s[i]
wdstrt = i+1
lstltr = i+1
else:
lstltr = i
out += s[wdstrt:lstltr]
return out
Then
collapseString('aabaa') -> 'b', and
collapseString('Trust,is,all.' ) -> 'rus,is,all.'

Hash table problem for education perposes

The problem is this:
It is required to create and use the hash table structure in a problem
large number of keys.
Here are the steps:
Creating one million (1,000,000) visits to a department store
and Credit Card Payment.
From the very large number of different cards, a relatively small subset is created
as follows. Credit cards for visits will have
sixteen (16) specific fixed digits eg 1234567890123456,
but in four (4) of the sixteen (16) random positions they will also have
four characters: X, Y, Z, W in random order.
eg 12Y45W789012Z4X6
In the other places the prices are the initial ones.
I have written the codes. Is is supposed to run super fast but it runs super slowly and I don't know why... Currently, I am running my code for 10,000 cards. Could you help me? Please excuse my poor english...
The code is bellow:
import string
import random
import time
random.seed(1059442)
global max_load_factor
max_load_factor = 0.6
def printGreaterThan2(num):
while True:
if num % 2 == 1:
isPrime = True
for x in range(3,int(num**0.5),2):
if num % x == 0:
isPrime = False
break
if isPrime:
return num
num += 1
N = printGreaterThan2(1000)
arr = [ [] for _ in range(N)]
arr = [ None for _ in range(N)]
def CreatNewItem():
letters = "WXZY"
days = ["Mon", "Tue", "Wed" , "Thu" , "Fri", "Sat"]
s = ''
count = 0
num = ['1','2','3','4','5','6','7','8','9','0','1','2','3','4','5','6']
list_a = []
while(count!=4):
a = random.randint(0,15)
b = random.choice(letters)
if b not in num and a not in list_a:
num[a] = b
count = count + 1
list_a.append(a)
s = ''.join(num)
d = random.randint(0,5)
day = days[d]
money = random.randint(10,100)
a = [s,day,money]
return a
def hash(key, tablesize):
sum = 0
for pos in range(len(key)):
sum = sum + ord(key[pos])
hash = sum % tablesize
return hash
#--------------------------------------
def rehash(oldhash , tablesize):
rehash = ( oldhash + 1 ) % tablesize
return rehash
#--------------------------------------
def put2 (arr,a,N,lenght,collitions):
if float(lenght)/float(N) >= max_load_factor:
(arr,N,collitions) = Resize(arr,N,lenght,collitions)
key = a[0]
i = hash(key,N)
j =0
while (True):
if arr[i] is None:
arr[i] = a
lenght = lenght + 1
break
elif arr[i][0] == key:
arr[i][2] = arr[i][2] + a[2]
arr[i][1] = arr[i][1] + a[1]
break
else:
if j == 0 :
collitions = collitions +1
j = 1
i = rehash(i,N)
return (lenght,N,arr,collitions )
#----------------------------------------
def Resize(arr,N,lenght,collitions):
print("resize")
N = printGreaterThan2(2*N)
collitions = 0
arr2 = [ [] for _ in range(N)]
arr2 = [ None for _ in range(N)]
for p in arr:
if p is not None:
(lenght,N,arr2,collitions)=put2(arr2,p,N,lenght,collitions)
return (arr2,N,collitions)
#-----------------------------------------
l = 0
cards = []
collitions = 0
t0 = time.time()
i=0
while i!=10000:
b = CreatNewItem()
(l,N,arr,collitions) = put2(arr,b,N,l,collitions)
i=i+1
t1 = time.time() - t0
print('\ntime is {:0.20f}'.format(t1))

Implementing Knuth-Morris-Pratt (KMP) algorithm for string matching with Python

I am following Cormen Leiserson Rivest Stein (clrs) book and came across "kmp algorithm" for string matching. I implemented it using Python (as-is).
However, it doesn't seem to work for some reason. where is my fault?
The code is given below:
def kmp_matcher(t,p):
n=len(t)
m=len(p)
# pi=[0]*n;
pi = compute_prefix_function(p)
q=-1
for i in range(n):
while(q>0 and p[q]!=t[i]):
q=pi[q]
if(p[q]==t[i]):
q=q+1
if(q==m):
print "pattern occurs with shift "+str(i-m)
q=pi[q]
def compute_prefix_function(p):
m=len(p)
pi =range(m)
pi[1]=0
k=0
for q in range(2,m):
while(k>0 and p[k]!=p[q]):
k=pi[k]
if(p[k]==p[q]):
k=k+1
pi[q]=k
return pi
t = 'brownfoxlazydog'
p = 'lazy'
kmp_matcher(t,p)
This is a class I wrote based on CLRs KMP algorithm, which contains what you are after. Note that only DNA "characters" are accepted here.
class KmpMatcher(object):
def __init__(self, pattern, string, stringName):
self.motif = pattern.upper()
self.seq = string.upper()
self.header = stringName
self.prefix = []
self.validBases = ['A', 'T', 'G', 'C', 'N']
#Matches the motif pattern against itself.
def computePrefix(self):
#Initialize prefix array
self.fillPrefixList()
k = 0
for pos in range(1, len(self.motif)):
#Check valid nt
if(self.motif[pos] not in self.validBases):
self.invalidMotif()
#Unique base in motif
while(k > 0 and self.motif[k] != self.motif[pos]):
k = self.prefix[k]
#repeat in motif
if(self.motif[k] == self.motif[pos]):
k += 1
self.prefix[pos] = k
#Initialize the prefix list and set first element to 0
def fillPrefixList(self):
self.prefix = [None] * len(self.motif)
self.prefix[0] = 0
#An implementation of the Knuth-Morris-Pratt algorithm for linear time string matching
def kmpSearch(self):
#Compute prefix array
self.computePrefix()
#Number of characters matched
match = 0
found = False
for pos in range(0, len(self.seq)):
#Check valid nt
if(self.seq[pos] not in self.validBases):
self.invalidSequence()
#Next character is not a match
while(match > 0 and self.motif[match] != self.seq[pos]):
match = self.prefix[match-1]
#A character match has been found
if(self.motif[match] == self.seq[pos]):
match += 1
#Motif found
if(match == len(self.motif)):
print(self.header)
print("Match found at position: " + str(pos-match+2) + ':' + str(pos+1))
found = True
match = self.prefix[match-1]
if(found == False):
print("Sorry '" + self.motif + "'" + " was not found in " + str(self.header))
#An invalid character in the motif message to the user
def invalidMotif(self):
print("Error: motif contains invalid DNA nucleotides")
exit()
#An invalid character in the sequence message to the user
def invalidSequence(self):
print("Error: " + str(self.header) + "sequence contains invalid DNA nucleotides")
exit()
You might want to try out my code:
def recursive_find_match(i, j, pattern, pattern_track):
if pattern[i] == pattern[j]:
pattern_track.append(i+1)
return {"append":pattern_track, "i": i+1, "j": j+1}
elif pattern[i] != pattern[j] and i == 0:
pattern_track.append(i)
return {"append":pattern_track, "i": i, "j": j+1}
else:
i = pattern_track[i-1]
return recursive_find_match(i, j, pattern, pattern_track)
def kmp(str_, pattern):
len_str = len(str_)
len_pattern = len(pattern)
pattern_track = []
if len_pattern == 0:
return
elif len_pattern == 1:
pattern_track = [0]
else:
pattern_track = [0]
i = 0
j = 1
while j < len_pattern:
data = recursive_find_match(i, j, pattern, pattern_track)
i = data["i"]
j = data["j"]
pattern_track = data["append"]
index_str = 0
index_pattern = 0
match_from = -1
while index_str < len_str:
if index_pattern == len_pattern:
break
if str_[index_str] == pattern[index_pattern]:
if index_pattern == 0:
match_from = index_str
index_pattern += 1
index_str += 1
else:
if index_pattern == 0:
index_str += 1
else:
index_pattern = pattern_track[index_pattern-1]
match_from = index_str - index_pattern
Try this:
def kmp_matcher(t, d):
n=len(t)
m=len(d)
pi = compute_prefix_function(d)
q = 0
i = 0
while i < n:
if d[q]==t[i]:
q=q+1
i = i + 1
else:
if q != 0:
q = pi[q-1]
else:
i = i + 1
if q == m:
print "pattern occurs with shift "+str(i-q)
q = pi[q-1]
def compute_prefix_function(p):
m=len(p)
pi =range(m)
k=1
l = 0
while k < m:
if p[k] <= p[l]:
l = l + 1
pi[k] = l
k = k + 1
else:
if l != 0:
l = pi[l-1]
else:
pi[k] = 0
k = k + 1
return pi
t = 'brownfoxlazydog'
p = 'lazy'
kmp_matcher(t, p)
KMP stands for Knuth-Morris-Pratt it is a linear time string-matching algorithm.
Note that in python, the string is ZERO BASED, (while in the book the string starts with index 1).
So we can workaround this by inserting an empty space at the beginning of both strings.
This causes four facts:
The len of both text and pattern is augmented by 1, so in the loop range, we do NOT have to insert the +1 to the right interval. (note that in python the last step is excluded);
To avoid accesses out of range, you have to check the values of k+1 and q+1 BEFORE to give them as index to arrays;
Since the length of m is augmented by 1, in kmp_matcher, before to print the response, you have to check this instead: q==m-1;
For the same reason, to calculate the correct shift you have to compute this instead: i-(m-1)
so the correct code, based on your original question, and considering the starting code from Cormen, as you have requested, would be the following:
(note : I have inserted a matching pattern inside, and some debug text that helped me to find logical errors):
def compute_prefix_function(P):
m = len(P)
pi = [None] * m
pi[1] = 0
k = 0
for q in range(2, m):
print ("q=", q, "\n")
print ("k=", k, "\n")
if ((k+1) < m):
while (k > 0 and P[k+1] != P[q]):
print ("entered while: \n")
print ("k: ", k, "\tP[k+1]: ", P[k+1], "\tq: ", q, "\tP[q]: ", P[q])
k = pi[k]
if P[k+1] == P[q]:
k = k+1
print ("Entered if: \n")
print ("k: ", k, "\tP[k]: ", P[k], "\tq: ", q, "\tP[q]: ", P[q])
pi[q] = k
print ("Outside while or if: \n")
print ("pi[", q, "] = ", k, "\n")
print ("---next---")
print ("---end for---")
return pi
def kmp_matcher(T, P):
n = len(T)
m = len(P)
pi = compute_prefix_function(P)
q = 0
for i in range(1, n):
print ("i=", i, "\n")
print ("q=", q, "\n")
print ("m=", m, "\n")
if ((q+1) < m):
while (q > 0 and P[q+1] != T[i]):
q = pi[q]
if P[q+1] == T[i]:
q = q+1
if q == m-1:
print ("Pattern occurs with shift", i-(m-1))
q = pi[q]
print("---next---")
print("---end for---")
txt = " bacbababaabcbab"
ptn = " ababaab"
kmp_matcher(txt, ptn)
(so this would be the correct accepted answer...)
hope that it helps.

Converting string using list in python

I've been trying to rearrange the string by reversing a particular strings consecutively from the given string input and the limit is given as input.
for example
limit is 3
if input is Hellothegamestarts
output must be Heltolhegemastastr
and it is saved in separate array
The code is:
while True:
t = int(input())
if t == 0:
break
string = raw_input()
string = string.encode('utf-8')
leng = len(string)
r = t/leng
m = []
leng = 0
for i in range(r):
if r % 2 == 0:
l = 0
l = leng + t
for i in range(t):
temp = string[l]
m.append(temp)
l = l - 1
r = r + 1
leng = leng + t
else:
l = 0
l = leng
for i in range(t):
temp = string[l]
m.append(temp)
l = l + 1
r = r + 1
leng = leng + t
print m
the output i got is [] and asks for next input for t.
Any help is appreciated.
Take the blocks in chunks of 3s, and reverse the odd ones, eg:
import re
s = 'Hellothegamestarts'
r = ''.join(
el if idx % 2 == 0 else el[::-1]
for idx, el in enumerate(re.findall('.{,3}', s))
)
# Heltolhegemastastr
Maybe you can try -
t = int(input())
if t == 0:
break;
string = raw_input()
m = ''
leng = len(string)
i = 0
while i < leng:
if (i/t) % 2 != 0:
m = m + string[i+t-1:i-1:-1]
else:
m = m + string[i:i+t]
i = i + t
print(m)
Alternatively you can try this
def myfunc(s, count):
return [''.join(x) for x in zip(*[list(s[z::count]) for z in range(count)])]
a='Hellothegamestarts'
lst=myfunc(a,3)
print "".join([i if lst.index(i) in range(0,len(lst),2) else i[::-1] for i in lst])
myfun i didn't write it.It's from here

Creating a list from dictionary

dictionary:
{'airport': [YearCount( year=2007, count=175702 ), YearCount( year=2008, count=173294 )], 'wandered': [YearCount( year=2005, count=83769 ), YearCount( year=2006, count=87688 ), YearCount( year=2007, count=108634 ), YearCount( year=2008, count=171015 )], 'request': [YearCount( year=2005, count=646179 ), YearCount( year=2006, count=677820 ), YearCount( year=2007, count=697645 ), YearCount( year=2008, count=795265 )]}
This counts up the total letters in the dictionary keys:
def letterlength(words):
length = 0
for word in words.keys():
length += len(word)
return length
and I'm trying to create a list with this function, but I'm not getting a list. It should return the letter frequency of the letters in the words. I know it's lengthy, but I couldn't figure out a simpler method:
def letterFreq(words):
lst = []
a = 0
b = 0
c = 0
d=0
e=0
f=0
g=0
h=0
i=0
j=0
k=0
l=0
m=0
n=0
o=0
p=0
q=0
r=0
s=0
t=0
u=0
v=0
w=0
x=0
y=0
z=0
for word in words.keys():
a += word.count('a')
b += word.count('b')
c += word.count('c')
d += word.count('d')
e += word.count('e')
f += word.count('f')
g += word.count('g')
h += word.count('h')
i += word.count('i')
j += word.count('j')
k += word.count('k')
l += word.count('l')
m += word.count('m')
n += word.count('n')
o += word.count('o')
p += word.count('p')
q += word.count('q')
r += word.count('r')
s += word.count('s')
t += word.count('t')
u += word.count('u')
v += word.count('v')
w += word.count('w')
x += word.count('x')
y += word.count('y')
z += word.count('z')
return (a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z)
lst.append(a/letterlength(words))
lst.append(b/letterlength(words))
lst.append(c/letterlength(words))
lst.append(d/letterlength(words))
lst.append(e/letterlength(words))
lst.append(f/letterlength(words))
lst.append(g/letterlength(words))
lst.append(h/letterlength(words))
lst.append(i/letterlength(words))
lst.append(j/letterlength(words))
lst.append(k/letterlength(words))
lst.append(l/letterlength(words))
lst.append(m/letterlength(words))
lst.append(n/letterlength(words))
lst.append(o/letterlength(words))
lst.append(p/letterlength(words))
lst.append(q/letterlength(words))
lst.append(r/letterlength(words))
lst.append(s/letterlength(words))
lst.append(t/letterlength(words))
lst.append(u/letterlength(words))
lst.append(v/letterlength(words))
lst.append(w/letterlength(words))
lst.append(x/letterlength(words))
lst.append(y/letterlength(words))
lst.append(z/letterlength(words))
return lst
collections.Counter(itertools.chain(*d))
This is shorthand for some code like the following:
count = {}
for word in d:
for letter in word:
count[letter] = count.get(letter, 0) + 1
Try collections.Counter:
import collections
counter = collections.Counter()
for word in words:
counter.update(word)
You can then obtain letter frequencies with
total = sum(counter.values())
lst = [counter[letter] / total for letter in 'abcdefghijklmnopqrstuvwxyz']
You could iterate over the ASCII values of each character. Assuming you already have a 26-entry list set up:
letlen = letterlength(words)
for i in range(26):
for word in words.keys():
lst[i]+=word.count(chr(i+ord('a'))/letlen

Categories

Resources