Creating a list from dictionary - python

dictionary:
{'airport': [YearCount( year=2007, count=175702 ), YearCount( year=2008, count=173294 )], 'wandered': [YearCount( year=2005, count=83769 ), YearCount( year=2006, count=87688 ), YearCount( year=2007, count=108634 ), YearCount( year=2008, count=171015 )], 'request': [YearCount( year=2005, count=646179 ), YearCount( year=2006, count=677820 ), YearCount( year=2007, count=697645 ), YearCount( year=2008, count=795265 )]}
This counts up the total letters in the dictionary keys:
def letterlength(words):
length = 0
for word in words.keys():
length += len(word)
return length
and I'm trying to create a list with this function, but I'm not getting a list. It should return the letter frequency of the letters in the words. I know it's lengthy, but I couldn't figure out a simpler method:
def letterFreq(words):
lst = []
a = 0
b = 0
c = 0
d=0
e=0
f=0
g=0
h=0
i=0
j=0
k=0
l=0
m=0
n=0
o=0
p=0
q=0
r=0
s=0
t=0
u=0
v=0
w=0
x=0
y=0
z=0
for word in words.keys():
a += word.count('a')
b += word.count('b')
c += word.count('c')
d += word.count('d')
e += word.count('e')
f += word.count('f')
g += word.count('g')
h += word.count('h')
i += word.count('i')
j += word.count('j')
k += word.count('k')
l += word.count('l')
m += word.count('m')
n += word.count('n')
o += word.count('o')
p += word.count('p')
q += word.count('q')
r += word.count('r')
s += word.count('s')
t += word.count('t')
u += word.count('u')
v += word.count('v')
w += word.count('w')
x += word.count('x')
y += word.count('y')
z += word.count('z')
return (a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z)
lst.append(a/letterlength(words))
lst.append(b/letterlength(words))
lst.append(c/letterlength(words))
lst.append(d/letterlength(words))
lst.append(e/letterlength(words))
lst.append(f/letterlength(words))
lst.append(g/letterlength(words))
lst.append(h/letterlength(words))
lst.append(i/letterlength(words))
lst.append(j/letterlength(words))
lst.append(k/letterlength(words))
lst.append(l/letterlength(words))
lst.append(m/letterlength(words))
lst.append(n/letterlength(words))
lst.append(o/letterlength(words))
lst.append(p/letterlength(words))
lst.append(q/letterlength(words))
lst.append(r/letterlength(words))
lst.append(s/letterlength(words))
lst.append(t/letterlength(words))
lst.append(u/letterlength(words))
lst.append(v/letterlength(words))
lst.append(w/letterlength(words))
lst.append(x/letterlength(words))
lst.append(y/letterlength(words))
lst.append(z/letterlength(words))
return lst

collections.Counter(itertools.chain(*d))
This is shorthand for some code like the following:
count = {}
for word in d:
for letter in word:
count[letter] = count.get(letter, 0) + 1

Try collections.Counter:
import collections
counter = collections.Counter()
for word in words:
counter.update(word)
You can then obtain letter frequencies with
total = sum(counter.values())
lst = [counter[letter] / total for letter in 'abcdefghijklmnopqrstuvwxyz']

You could iterate over the ASCII values of each character. Assuming you already have a 26-entry list set up:
letlen = letterlength(words)
for i in range(26):
for word in words.keys():
lst[i]+=word.count(chr(i+ord('a'))/letlen

Related

How to count the amount of vowels and consonants in a text file?

I am trying to correctly count the number of vowels and consonants in a text file but I am lost currently. I have the other parts that need to be found done.
# Home work 4
from string import punctuation
fname = raw_input("Enter name of the file: ")
fvar = open(fname, "r")
punctuationList = "!#$%&'(),.:;?"
numLines = 0
numWords = 0
numChars = 0
numPunc = 0
numVowl = 0
numCons = 0
if line in "aeiou":
numVowl = + 1
else:
numCons += 1
for line in fvar:
wordsList = line.split()
numLines += 1
numWords += len(wordsList)
numChars += len(line)
for punctuation in punctuationList:
numPunc += 1
print "Lines %d" % numLines
print "Words %d" % numWords
print "The amount of charcters is %d" % numChars
print "The amount of punctuation is %d" % numPunc
print "The amount of vowls is %d" % numVowl
print "The amount of consonants is %d" % numCons
You need to loop over all the characters in the line, testing whether they're vowels, consonants, or punctuation.
for line in fvar:
wordsList = line.split()
numLines += 1
numWords += len(wordsList)
numChars += len(line)
for char in line:
if char in 'aeiou':
numVowl += 1
elif char in 'bcdfghjklmnpqrstvwxyz'
numCons += 1
else:
numPunc += 1
You can try this:
f = [i.strip('\n').split() for i in open('file.txt')]
new_lines = [[sum(b in 'bcdfghjklmnpqrstvwxyz' for b in i), sum(b in "aeiou" for b in i)] for i in f]
total_consonants = sum(a for a, b in new_lines)
total_vowels = sum(b for a, b in new_lines)
I would write a function that returns a 3-tuple of the counts you care about when given a string.
import string
def count_helper(s) -> ("vowel count", "consonant count", "punctuation count"):
vowels = set('aeiou')
consonants = set(string.ascii_lowercase).difference(vowels)
# you could also do set('bcdfghjklmnpqrstvwxyz'), but I recommend this approach
# because it's more obviously correct (you can't possibly typo and miss a letter)
c_vowel = c_consonant = c_punctuation = 0
for ch in s:
if ch in vowels: c_vowel += 1
elif ch in consonants: c_consonant += 1
else: c_punctuation += 1
return (c_vowel, c_consonant, c_punctuation)
Then as you iterate through the file, pass each line to count_helper.
counts = {'vowels': 0, 'consonants': 0, 'punctuation': 0}
for line in f:
v, c, p = count_helper(line)
counts['vowels'] += v
counts['consonants'] += c
counts['punctuation'] += p

Implementing Knuth-Morris-Pratt (KMP) algorithm for string matching with Python

I am following Cormen Leiserson Rivest Stein (clrs) book and came across "kmp algorithm" for string matching. I implemented it using Python (as-is).
However, it doesn't seem to work for some reason. where is my fault?
The code is given below:
def kmp_matcher(t,p):
n=len(t)
m=len(p)
# pi=[0]*n;
pi = compute_prefix_function(p)
q=-1
for i in range(n):
while(q>0 and p[q]!=t[i]):
q=pi[q]
if(p[q]==t[i]):
q=q+1
if(q==m):
print "pattern occurs with shift "+str(i-m)
q=pi[q]
def compute_prefix_function(p):
m=len(p)
pi =range(m)
pi[1]=0
k=0
for q in range(2,m):
while(k>0 and p[k]!=p[q]):
k=pi[k]
if(p[k]==p[q]):
k=k+1
pi[q]=k
return pi
t = 'brownfoxlazydog'
p = 'lazy'
kmp_matcher(t,p)
This is a class I wrote based on CLRs KMP algorithm, which contains what you are after. Note that only DNA "characters" are accepted here.
class KmpMatcher(object):
def __init__(self, pattern, string, stringName):
self.motif = pattern.upper()
self.seq = string.upper()
self.header = stringName
self.prefix = []
self.validBases = ['A', 'T', 'G', 'C', 'N']
#Matches the motif pattern against itself.
def computePrefix(self):
#Initialize prefix array
self.fillPrefixList()
k = 0
for pos in range(1, len(self.motif)):
#Check valid nt
if(self.motif[pos] not in self.validBases):
self.invalidMotif()
#Unique base in motif
while(k > 0 and self.motif[k] != self.motif[pos]):
k = self.prefix[k]
#repeat in motif
if(self.motif[k] == self.motif[pos]):
k += 1
self.prefix[pos] = k
#Initialize the prefix list and set first element to 0
def fillPrefixList(self):
self.prefix = [None] * len(self.motif)
self.prefix[0] = 0
#An implementation of the Knuth-Morris-Pratt algorithm for linear time string matching
def kmpSearch(self):
#Compute prefix array
self.computePrefix()
#Number of characters matched
match = 0
found = False
for pos in range(0, len(self.seq)):
#Check valid nt
if(self.seq[pos] not in self.validBases):
self.invalidSequence()
#Next character is not a match
while(match > 0 and self.motif[match] != self.seq[pos]):
match = self.prefix[match-1]
#A character match has been found
if(self.motif[match] == self.seq[pos]):
match += 1
#Motif found
if(match == len(self.motif)):
print(self.header)
print("Match found at position: " + str(pos-match+2) + ':' + str(pos+1))
found = True
match = self.prefix[match-1]
if(found == False):
print("Sorry '" + self.motif + "'" + " was not found in " + str(self.header))
#An invalid character in the motif message to the user
def invalidMotif(self):
print("Error: motif contains invalid DNA nucleotides")
exit()
#An invalid character in the sequence message to the user
def invalidSequence(self):
print("Error: " + str(self.header) + "sequence contains invalid DNA nucleotides")
exit()
You might want to try out my code:
def recursive_find_match(i, j, pattern, pattern_track):
if pattern[i] == pattern[j]:
pattern_track.append(i+1)
return {"append":pattern_track, "i": i+1, "j": j+1}
elif pattern[i] != pattern[j] and i == 0:
pattern_track.append(i)
return {"append":pattern_track, "i": i, "j": j+1}
else:
i = pattern_track[i-1]
return recursive_find_match(i, j, pattern, pattern_track)
def kmp(str_, pattern):
len_str = len(str_)
len_pattern = len(pattern)
pattern_track = []
if len_pattern == 0:
return
elif len_pattern == 1:
pattern_track = [0]
else:
pattern_track = [0]
i = 0
j = 1
while j < len_pattern:
data = recursive_find_match(i, j, pattern, pattern_track)
i = data["i"]
j = data["j"]
pattern_track = data["append"]
index_str = 0
index_pattern = 0
match_from = -1
while index_str < len_str:
if index_pattern == len_pattern:
break
if str_[index_str] == pattern[index_pattern]:
if index_pattern == 0:
match_from = index_str
index_pattern += 1
index_str += 1
else:
if index_pattern == 0:
index_str += 1
else:
index_pattern = pattern_track[index_pattern-1]
match_from = index_str - index_pattern
Try this:
def kmp_matcher(t, d):
n=len(t)
m=len(d)
pi = compute_prefix_function(d)
q = 0
i = 0
while i < n:
if d[q]==t[i]:
q=q+1
i = i + 1
else:
if q != 0:
q = pi[q-1]
else:
i = i + 1
if q == m:
print "pattern occurs with shift "+str(i-q)
q = pi[q-1]
def compute_prefix_function(p):
m=len(p)
pi =range(m)
k=1
l = 0
while k < m:
if p[k] <= p[l]:
l = l + 1
pi[k] = l
k = k + 1
else:
if l != 0:
l = pi[l-1]
else:
pi[k] = 0
k = k + 1
return pi
t = 'brownfoxlazydog'
p = 'lazy'
kmp_matcher(t, p)
KMP stands for Knuth-Morris-Pratt it is a linear time string-matching algorithm.
Note that in python, the string is ZERO BASED, (while in the book the string starts with index 1).
So we can workaround this by inserting an empty space at the beginning of both strings.
This causes four facts:
The len of both text and pattern is augmented by 1, so in the loop range, we do NOT have to insert the +1 to the right interval. (note that in python the last step is excluded);
To avoid accesses out of range, you have to check the values of k+1 and q+1 BEFORE to give them as index to arrays;
Since the length of m is augmented by 1, in kmp_matcher, before to print the response, you have to check this instead: q==m-1;
For the same reason, to calculate the correct shift you have to compute this instead: i-(m-1)
so the correct code, based on your original question, and considering the starting code from Cormen, as you have requested, would be the following:
(note : I have inserted a matching pattern inside, and some debug text that helped me to find logical errors):
def compute_prefix_function(P):
m = len(P)
pi = [None] * m
pi[1] = 0
k = 0
for q in range(2, m):
print ("q=", q, "\n")
print ("k=", k, "\n")
if ((k+1) < m):
while (k > 0 and P[k+1] != P[q]):
print ("entered while: \n")
print ("k: ", k, "\tP[k+1]: ", P[k+1], "\tq: ", q, "\tP[q]: ", P[q])
k = pi[k]
if P[k+1] == P[q]:
k = k+1
print ("Entered if: \n")
print ("k: ", k, "\tP[k]: ", P[k], "\tq: ", q, "\tP[q]: ", P[q])
pi[q] = k
print ("Outside while or if: \n")
print ("pi[", q, "] = ", k, "\n")
print ("---next---")
print ("---end for---")
return pi
def kmp_matcher(T, P):
n = len(T)
m = len(P)
pi = compute_prefix_function(P)
q = 0
for i in range(1, n):
print ("i=", i, "\n")
print ("q=", q, "\n")
print ("m=", m, "\n")
if ((q+1) < m):
while (q > 0 and P[q+1] != T[i]):
q = pi[q]
if P[q+1] == T[i]:
q = q+1
if q == m-1:
print ("Pattern occurs with shift", i-(m-1))
q = pi[q]
print("---next---")
print("---end for---")
txt = " bacbababaabcbab"
ptn = " ababaab"
kmp_matcher(txt, ptn)
(so this would be the correct accepted answer...)
hope that it helps.

On python, how to check the amount of times a letter is in a word (list)

So I have a stored word. And the user is invited to check if a letter of their choice is in this word. My code for this is the following
storedword = "abcdeef"
word = list(germ)
print (word)
merge = input("letter please")
print ("your letter is", merge)
counter = int(0)
letterchecker = int(0)
listlength = len(word)
while counter < listlength and merge != word[counter]:
counter +=1
if counter <listlength:
print ("found")
else:
print ("not found")
How can I alter this code to check how many times the user letter is in this word? I can only use if's and while loops and not using .count
Can you use a Counter
from collections import Counter
storedword = "abcdeef"
wordcounter = Counter(list(storedword))
merge = input("letter please ")
print("your letter is %s" % merge)
print('It occurs %d times' % wordcounter[merge])
len([w for w in word if w == merge])
is short for
x = []
for w in word:
if w == merge:
x.append(w)
len(x)
Similar approach with while loop:
i = x = 0
while i < len(word):
if word[i] == merge:
x += 1
i += 1
counter = 0
letter_count = 0
while counter < len(word);
if word[counter] == merge:
letter_count +=1
counter +=1
Try this:
counter = 0
for c in word:
if c == merge:
counter += 1
If you can't use for, use:
counter = 0
ind = 0
while ind < len(word):
if word[ind] == merge:
counter += 1
ind +=1

Vigenere Cipher Python 2.0

Im having trouble with encoding / decoding programming for a vigenere cipher. Im only supposed to use lists, dictionaries and loops.
EDIT: I added in the decrypt i have. GetCharList() just gets a list containing the alphabet. I dont know what is wrong that its making the output of the decrpyt not the original message.
def encryptVig(msg, keyword):
alphabet = getCharList() #Get char list is another function which creates a list containing a - z
key = keyword.upper()
keyIndex = 0
dicList = []
for symbol in msg:
num = alphabet.find(key[keyIndex])
if num != -1:
num += alphabet.find(key[keyIndex])
alphabet.find(key[keyIndex])
num%= len(alphabet)
if symbol.isupper():
dicList.append(alphabet[num])
elif symbol.islower():
dicList. append(alphabet[num].lower())
keyIndex += 1
if keyIndex == len(key):
keyIndex = 0
else:
dicList.append(symbol)
return " " .join(dicList)
def decryptVig(msg, keyword):
getCharList()
key = keyword.upper()
keyIndex = 0
dicList = []
for symbol in msg:
num = alphabet.find(key[keyIndex])
if num != -1:
num -= alphabet.find(key[keyIndex])
alphabet.find(key[keyIndex])
num%= len(alphabet)
if symbol.isupper():
dicList.append(alphabet[num])
elif symbol.islower():
dicList. append(alphabet[num].lower())
keyIndex -= 1
if keyIndex == len(key):
keyIndex = 0
else:
dicList.append(symbol)
return " " .join(dicList)
Rather than hacking through the alphabet yourself, another approach would be to use ord and chr to remove some of the complexity of working with letters. At the very least consider using itertools.cycle and itertools.izip to construct a list of the encryption/decryption pairs. Here's how I would solve it:
def letters_to_numbers(str):
return (ord(c) - ord('A') for c in str)
def numbers_to_letters(num_list):
return (chr(x + ord('A')) for x in num_list)
def gen_pairs(msg, keyword):
msg = msg.upper().strip().replace(' ', '')
msg_sequence = letters_to_numbers(msg)
keyword_sequence = itertools.cycle(letters_to_numbers(keyword))
return itertools.izip(msg_sequence, keyword_sequence)
def encrypt_vig(msg, keyword):
out = []
for letter_num, shift_num in gen_pairs(msg, keyword):
shifted = (letter_num + shift_num) % 26
out.append(shifted)
return ' '.join(numbers_to_letters(out))
def decrypt_vig(msg, keyword):
out = []
for letter_num, shift_num in gen_pairs(msg, keyword):
shifted = (letter_num - shift_num) % 26
out.append(shifted)
return ' '.join(numbers_to_letters(out))
msg = 'ATTACK AT DAWN'
keyword = 'LEMON'
print(encrypt_vig(msg, keyword))
print(decrypt_vig(encrypt_vig(msg, keyword), keyword))
>>> L X F O P V E F R N H R
A T T A C K A T D A W N
I don't know how Vigenere is supposed to work. However I am quite sure that after
num = alphabet.find(key[keyIndex])
if num != -1:
num -= alphabet.find(key[keyIndex])
num is zero.

Markov Model - Random word/gibberish generator

My code works fine until the random word generating. Sometimes it creates words/gibberish and sometimes it doesn't (probably going through an infinite loop). However, whenever it does create words/gibberish it doesn't seem so "random". The words would either repeat themselves or most of the words will be generating near the same character length.
The problem lies in the def genRandomWord:
import random
def getTransitions(astring):
d = {}
for i in range(len(astring)):
if astring[i:i+2] in d:
d[astring[i:i+2]] += 1
else:
d[astring[i:i+2]] = 1
#h = tuple(d.items()) #gets the indexes of the dictionary
#print(h[2][1])
if ' ' in d:
del d[' ']
return d
def getFirstLetters(astring):
d = []
for i in astring:
if i not in d:
d.append(i)
d.remove(' ')
return d
def letterCount(astring):
d = {}
for i in astring:
if i not in d.keys():
d[i] = 1
else:
d[i] +=1
d[' ']-= 1
return d
def getProb(astring):
d = {}
h = tuple(getTransitions(astring).items())
j = tuple(letterCount(astring).items())
#print("h", h)
#print()
#print()
#print("j", j)
for i in h:
for n in j:
if i[0][0] == n[0]:
d[i[0]] = i[1]/n[1]
return d
def genFletter(astring):
d = {}
r = random.random()
fl ='*'
#print("r",r)
a = getProb(astring)
suma = 0
count = -1
for i in a:
if i[0][0] == ' ':
d[i[1]] = a[i]
d = sorted(tuple(d.items()))
#print(d)
while suma < r:
count += 1
suma += d[count][1]
fl = d[count][0]
#print(suma)
return fl
def genRandomWord(astring):
h = getProb(The_List)
htrans = tuple(getProb(The_List).keys())
hprob = tuple(getProb(The_List).values())
#print(hprob)
z = genFletter(The_List)
word = z
#print(word)
fletterprob = h[' '+z]
r = random.random()
while word[-1]!= ' ':
index = 0
suma = 0
for i in range(len(htrans)):
if htrans[i][0] == word[-1]:
index = i
suma += hprob[index]
for j in range(len(hprob)):
if suma >= r:
word += htrans[index][1]
break
else:
suma += hprob[index]
return word
The_List = ' steam teams meets teems eat ate state tease test mast mates '
trans = getTransitions(The_List)
lcount = letterCount(The_List)
fletter = getFirstLetters(The_List)
transProb = getProb(The_List)
#Sorting
#print('LETTER TRANSITIONS'+'\n'+str(sorted(trans.items()))+'\n')
#print('LETTER COUNT'+'\n'+str(sorted(lcount.items()))+'\n')
#print('FIRST LETTERS'+'\n'+str(sorted(fletter))+'\n')
#print('TRANSITION PROBABILITIES'+'\n'+str(sorted(transProb.items()))+'\n')
print('LETTER TRANSITIONS'+'\n'+str(trans)+'\n')
print('LETTER COUNT'+'\n'+str(lcount)+'\n')
print('FIRST LETTERS'+'\n'+str(fletter)+'\n')
print('TRANSITION PROBABILITIES'+'\n'+str(transProb)+'\n')
#print(genFletter(The_List))
for i in range(10):
print("'"+genRandomWord(The_List)+"'")

Categories

Resources