I am trying to generate random text using letter frequencies that I have obtained. First, I succeeded with the following code:
# Draw 450 random numbers and print one letter per draw, chosen by which
# cumulative-frequency interval the number falls into.
# NOTE(review): indentation was lost in this listing; the loop body and the
# body of each `if` are presumably indented in the real script.
for i in range(450):
outcome=random.random()
# Both comparisons are strict, so an outcome exactly equal to a threshold
# matches no branch and prints nothing for that draw.
if 0<outcome<0.06775:
sys.stdout.write('a')
if 0.06775<outcome<0.07920:
sys.stdout.write('b')
if 0.07920<outcome<0.098:
sys.stdout.write('c')
....
This continues through the letter z and the space character. It gives me more than 50 lines of code, and I want to get the same result using an array.
So far I have :
# Cumulative frequency thresholds: 28 values bounding the 27 intervals that
# belong to the 27 entries of `alphabet`.
f_list = [0, 0.06775, 0.08242, 0.10199, 0.13522, 0.23703, 0.25514, 0.27324, 0.32793, 0.38483, 0.38577, 0.39278, 0.42999, 0.45023, 0.50728, 0.56756, 0.58256, 0.58391, 0.62924, 0.68509, 0.7616, 0.78481, 0.79229, 0.81161, 0.81251, 0.82718, 0.82773, 0.99998]
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ']
import random
import sys
# NOTE(review): `i` serves as both the iteration counter and the index into
# f_list, so each of the 25 passes draws a fresh number and tests it against
# one interval only (the i-th); the last two intervals are never tested.
for i in range(25):
outcome=random.random()
if f_list[i]<outcome<f_list[i+1]:
# NOTE(review): the quotes make this the literal text 'alphabet[i]', not the
# i-th entry of the list.
sys.stdout.write('alphabet[i]')
But it isn't working properly, as the range seems now to relate to the array and not the number of iterations I want. The output is blank.
import random
import sys
import bisect

# Cumulative letter frequencies: f_list[i] is the lower bound of the interval
# owned by alphabet[i]; the final entry closes the last interval.
f_list = [0, 0.06775, 0.08242, 0.10199, 0.13522, 0.23703, 0.25514, 0.27324, 0.32793, 0.38483, 0.38577, 0.39278, 0.42999, 0.45023, 0.50728, 0.56756, 0.58256, 0.58391, 0.62924, 0.68509, 0.7616, 0.78481, 0.79229, 0.81161, 0.81251, 0.82718, 0.82773, 0.99998]
alphabet = 'abcdefghijklmnopqrstuvwxyz '


def generate_text(length, rng=random.random):
    """Return `length` characters drawn according to the thresholds in f_list.

    `rng` is a zero-argument callable returning a float in [0, 1); it defaults
    to random.random and exists so the draw can be controlled by the caller.
    """
    # Scale every draw into [0, f_list[-1]). An unscaled draw above the last
    # threshold (0.99998) makes bisect return len(f_list), which would index
    # one position past the end of `alphabet`.
    top = f_list[-1]
    return ''.join(
        alphabet[bisect.bisect(f_list, rng() * top) - 1]
        for _ in range(length)
    )


sys.stdout.write(generate_text(450))
does the trick and returns (example):
l wefboethol gsplotfoh ua onpedefh dnolnairnioeiegehhecaworonnfmeuej dsiauhpbfttwcknal ateof ap cgbr sunnee leseaeeecltaiaur u oen vxntgsoio kdeniei ot df htr dcencrsrrfp bwelsuoaslrnr heh ee tpt oeejaldeatcto fi a u idimiadmgglral o m iaielbtnt es oe shlspudwdfrrsvol oo i tlwh d r i swhsnloai p swlooi wbe nn sshth nsawtnrqsud mtw diit pner r nitmah todf zcsehma hl e ros ctee toiouinn i hl hlonphioe nh gan ho heein itrgeylftn epaacrmanhe
alphabet can be defined as a simple string too (accessing its elements - single characters - works like for lists)
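bisect.bisect(list, value) takes a sorted list and a value and returns the index at which the value would have to be inserted to keep the list sorted; subtracting 1 gives the index of the interval the value falls into. More about bisect.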
Eumiro's answer is perfect, and much simpler than mine, but because I made the effort to modify an older solution to a similar problem, I don't want it to go to waste.
I even had the link still around for the discussion about weighted random generators
from which i borrowed the "King of the hill" algorithm.
from string import ascii_lowercase
from random import random


class TextGenerator(object):
    """Generate random text whose characters follow a weighted distribution.

    `flist` is a list of cumulative thresholds with one more entry than there
    are characters in `charmap`; the weight of the i-th character is
    flist[i + 1] - flist[i]. `textlength` is the number of characters each
    call to create_text() produces.
    """

    def __init__(self, flist, textlength, charmap=ascii_lowercase + ' '):
        self.text_length = textlength
        self.chars = charmap
        self.weights = self._get_weight_list(flist)

    def create_new_weights(self, flist):
        """Replace the stored weights with those derived from `flist`."""
        self.weights = self._get_weight_list(flist)

    def get_weight(self, char):
        """Return the weight currently assigned to `char`."""
        return self.weights[self.chars.index(char)]

    def change_weight(self, char, weight):
        """Set the weight of `char`; weights need not sum to 1."""
        self.weights[self.chars.index(char)] = weight

    def _get_weight_list(self, flist):
        """Turn cumulative thresholds into per-character weights."""
        # Differences of consecutive thresholds. A list is built explicitly
        # because the result is later indexed and mutated by change_weight.
        return [upper - lower for lower, upper in zip(flist, flist[1:])]

    def windex(self, weights=None):
        """Return a random index into self.chars, chosen with probability
        proportional to its weight.

        `weights` defaults to the stored weights. Raises ValueError when the
        number of weights differs from the number of characters or when the
        weights do not have a positive sum.
        """
        if weights is None:
            weights = self.weights
        if len(weights) != len(self.chars):
            raise ValueError("need exactly one weight per character")
        total = sum(weights)
        if total <= 0:
            raise ValueError("weights must have a positive sum")
        rnd = random() * total
        last_positive = None
        for i, w in enumerate(weights):
            if w > 0:
                last_positive = i
            rnd -= w
            if rnd < 0:
                return i
        # Floating-point rounding can leave rnd at exactly 0 after the last
        # subtraction; fall back to the last character that can be drawn
        # instead of returning None.
        return last_positive

    def create_text(self, flist=None):
        """Return self.text_length random characters as one string.

        When `flist` is given, its weights are used for this call only; the
        stored weights are left untouched.
        """
        weights = self._get_weight_list(flist) if flist else self.weights
        return u''.join(
            self.chars[self.windex(weights)] for _ in range(self.text_length)
        )
# Cumulative thresholds for 'a'..'z' followed by the space character.
flist = [0, 0.067750000000000005, 0.082419999999999993, 0.10199, 0.13522000000000001, 0.23702999999999999, 0.25513999999999998, 0.27323999999999998, 0.32793, 0.38483000000000001, 0.38577, 0.39278000000000002, 0.42998999999999998, 0.45023000000000002, 0.50727999999999995, 0.56755999999999995, 0.58255999999999997, 0.58391000000000004, 0.62924000000000002, 0.68508999999999998, 0.76160000000000005, 0.78481000000000001, 0.79229000000000005, 0.81161000000000005, 0.81250999999999995, 0.82718000000000003, 0.82772999999999997, 0.99997999999999998]
texter = TextGenerator(flist, 1000)
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(texter.create_text())
# Make 'i' twice as likely as 'e' and generate a second sample.
texter.change_weight('i', texter.get_weight('e') * 2)
print(texter.create_text())
Related
I am creating a function that returns a histogram with each letter of the alphabet and asterisks that map out how many times each character appears in a string. So far I have:
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
def character_frequency_string(text):
#remove_extraneous function removes anything that is not a letter in the alphabet from the text string
new_text = remove_extraneous(text)
# NOTE(review): the loop walks the characters of the text, not `alphabet`, so
# rows come out in text order and a repeated letter prints a row every time
# it occurs.
for char in new_text:
if char in new_text:
print(char +' ' + '*'*new_text.count(char))
# `char` is always taken from new_text, so this branch cannot run.
if char not in new_text:
print(char)
My docstring is the following (with the outputs as they are right now, incorrect):
'''
Examples:
>>> character_frequency_string('hello world!')
h *
e *
l ***
l ***
o **
w *
o **
r *
l ***
d *
>>> character_frequency_string('testing!')
t **
e *
s *
t **
i *
n *
g *
'''
The correct output for 'hello world!' would be:
How could I change my code so that the histogram works as intended (with all of the alphabet in order, displaying an asterisk beside each letter for its character frequency, still displaying letters when they aren't in the text, just with no asterisk.)
Iterate over alphabet:
alphabet = list('abcdefghijklmnopqrstuvwxyz')


def character_frequency_string(text):
    """Print a 26-row histogram of `text`.

    Each row is a letter of `alphabet`, a space, and one asterisk for every
    occurrence of that letter in `text` (matching is case-sensitive).
    """
    rows = [letter + ' ' + '*' * text.count(letter) for letter in alphabet]
    print('\n'.join(rows))


character_frequency_string('hello world!')
Output
a
b
c
d *
e *
f
g
h *
i
j
k
l ***
m
n
o **
p
q
r *
s
t
u
v
w *
x
y
z
The above solution scans the whole text once per letter (26 passes over the text); a more performant alternative is to count every character in a single pass with collections.Counter.
You could do the following, using a collections.Counter and f-strings:
from collections import Counter
from string import ascii_lowercase as alphabet
def character_frequency_string(text):
    """Print a histogram row for every lower-case ASCII letter.

    `text` is lower-cased before counting, so upper- and lower-case
    occurrences are tallied together. Each row is the letter, a space, and
    one asterisk per occurrence.
    """
    tally = Counter(text.lower())
    histogram = (f"{letter} {'*' * tally[letter]}" for letter in alphabet)
    print("\n".join(histogram))
>>> character_frequency_string("hello world!")
a
b
c
d *
e *
f
g
h *
i
j
k
l ***
m
n
o **
p
q
r *
s
t
u
v
w *
x
y
z
Some docs:
Counter
string.ascii_lowercase
A dictionary approach. Notice that the text may contain characters which do not belong to the alphabet, such as punctuation and white space. If needed, such excluded characters are also available as string.punctuation and string.whitespace.
import string

alphabet = string.ascii_lowercase


def character_frequency_string(text, sep='*'):
    """Print one histogram row per lower-case ASCII letter.

    `text` is lower-cased first; characters outside `alphabet` are ignored.
    Each row is the letter, a space, and `sep` repeated once per occurrence.
    """
    tally = {letter: 0 for letter in alphabet}
    for ch in text.lower():
        if ch in tally:
            tally[ch] = tally[ch] + 1
    for letter, occurrences in tally.items():
        print(f'{letter} {sep * occurrences}')


t = 'hello world!'
character_frequency_string(t)
You could use set for getting the same result
import re

strString = input("Please give input value of the string :")
# Keep ASCII letters only, then fold to lower case so 'A' and 'a' are
# counted together.
strString = re.sub(r"[^a-zA-Z]", "", strString)
strString = strString.lower()
print(strString)
# Distinct letters in alphabetical order; strString is already lower-case,
# so no second .lower() is needed here.
a = sorted(set(strString))
for letter in a:
    # str.count replaces the hand-written inner index loop. The original's
    # stray `str(a[i])+"is"+str(count)` expression had no effect and is gone.
    count = strString.count(letter)
    print(letter + " " + "*" * count)
So basically what I want to do is if the random string of characters generated is over 6 chars long it adds a space in a random place in that string and then the remaining ones are added on after the space so for example: raw output: "aoneicse", what I want : "aoneic se" or "aone icse".
Here is my code:
import random
# NOTE(review): the letter 'd' is absent from all three lists below.
alist = [' ', 'a', 'b', 'c', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
blist = [' ', 'a', 'b', 'c', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
clist = [' ', 'a', 'b', 'c', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# define the 3 words
a = ''.join(random.choices(alist, k = random.randint(2,8)))
b = ''.join(random.choices(blist, k = random.randint(2,8)))
c = ''.join(random.choices(clist, k = random.randint(2,8)))
# each letter is a certain word, if this word exceeds 6 characters add a space in a random place
# NOTE(review): `if a length = > 6:` is not valid Python; it is presumably
# pseudo-code for `len(a) > 6`. The assignments below would wrap the word plus
# a trailing space in a one-element list rather than insert a space inside it.
if a length = > 6:
asp = [a + " "]
if b length = > 6:
bsp = [b + " "]
if c length = > 6:
csp = [c + " "]
# or something along the lines of this i guess
This code does not currently work, BTW.
You don't need to create a list of strings with every letter of the alphabet, there's already a string module that does that for you:
import string
print(' ' + string.ascii_lowercase)
# Outputs " abcdefghijklmnopqrstuvwxyz" (a leading space followed by the 26 lower-case letters)
You also don't need to create three different variables, you can just use a single one and use that:
import string
# One shared pool of candidate characters: a space plus the 26 lower-case ASCII letters.
base_str = ' ' + string.ascii_lowercase
Then, you can use a list to generate your words based on this string:
import random
import string
base_str = ' ' + string.ascii_lowercase
# Three words, each made of 2 to 8 characters drawn with replacement from base_str.
words = [''.join(random.choices(base_str, k=random.randint(2,8))) for _ in range(3)]
Now, just apply your space requirement to each word:
import random
import string

base_str = ' ' + string.ascii_lowercase
words = [''.join(random.choices(base_str, k=random.randint(2, 8))) for _ in range(3)]


def insert_random_space(word, limit=6):
    """Return `word` with one space inserted at a random index.

    Words of `limit` characters or fewer are returned unchanged: the stated
    requirement is to split only words that are *over* 6 characters long, so
    a word of exactly 6 characters must not receive a space.
    """
    if len(word) <= limit:
        return word
    # randrange(len(word)) can return 0, in which case the space leads the
    # word; use randrange(1, len(word)) to force an interior position.
    pos = random.randrange(len(word))
    return word[:pos] + ' ' + word[pos:]


new_words = [insert_random_space(word) for word in words]
Using random.seed(0), new_words is equal to ['j xazmxvz', 'oxmg', 'buzns pcb'].
Don't join the choices immediately. Generate the list, and if it is long enough, pick an index in the middle (sufficiently far from either end, depending on your needs), and perform a slice assignment to the empty list at that position. Then join the list into a single string.
import random
import string

# Keep the characters in a list so a space can be spliced in before joining.
a = random.choices(string.ascii_lowercase, k=random.randint(2, 8))
if len(a) > 6:
    # Adjust the arguments to randint as desired
    x = random.randint(2, len(a) - 2)
    # Assigning to the empty slice a[x:x] inserts without replacing anything.
    a[x:x] = ' '
a = ''.join(a)
import random
# The 26 lower-case ASCII letters.
character = list(map(chr, range(ord('a'), ord('z') + 1)))


def get_char():
    """Return one letter chosen uniformly at random from `character`."""
    return random.choice(character)


def gen_word(min_len, max_len):
    """Return a random word of min_len..max_len letters.

    A word longer than 6 letters gets exactly one space inserted at a random
    interior position, so the result is one character longer than the word.
    """
    word = [get_char() for _ in range(random.randint(min_len, max_len))]
    if len(word) > 6:
        # Insert rather than overwrite so that no letter is lost; indices
        # 1..len-1 keep the space strictly inside the word.
        word.insert(random.randint(1, len(word) - 1), ' ')
    return ''.join(word)


for i in range(10):
    print(gen_word(4, 10))
I’d use slices to return a space, sliced-in at the right spot; Something along the lines of:
def split(input_str):
    """Return `input_str` with one space inserted at a random interior index.

    Strings of 6 characters or fewer are returned unchanged.
    """
    import random  # local import: this snippet has no import section of its own

    if len(input_str) > 6:
        # 1..len-1 keeps the space strictly inside the string, so it never
        # becomes a leading or trailing blank.
        space_at = random.randint(1, len(input_str) - 1)
        return input_str[:space_at] + ' ' + input_str[space_at:]
    return input_str
import random
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
iterations = 1
running = True
def random_picker():
# random.choice picks independently on every call, so letters can repeat.
random_choice = random.choice(alphabet)
# NOTE(review): `=+ 1` parses as `iterations = (+1)`; being an assignment
# inside the function it binds a local name, so the module-level counter
# checked by the loop below is not changed.
iterations =+ 1
print(random_choice)
while running == True:
if iterations <=26:
random_picker()
else:
# NOTE(review): `==` compares and discards the result; `running` is never
# set to False here.
running == False
I'm trying to get a different random letter through each iteration, through all 26. Each letter picked needs to update the random_choice variable.
Your program will continue looping until 26 letters are returned. Since you want a different letter on each iteration, it's probably easier to shuffle the alphabet array and loop over it instead of trying to choose a random letter in each iteration:
# Reorders the list in place; iterate over `alphabet` afterwards to visit every letter exactly once.
random.shuffle(alphabet)
Paul M. EDIT - Here is an example of how you might use it to achieve the desired effect:
from string import ascii_lowercase
from random import shuffle
# Shuffle a fresh list of the 26 lower-case letters, then emit them one per
# line in their new order.
alphabet = [*ascii_lowercase]
shuffle(alphabet)
print('\n'.join(alphabet))
Output:
j
m
z
w
k
y
d
f
l
c
u
b
t
s
e
p
x
g
a
r
n
h
i
q
o
v
>>>
I found this code here (many thanks to Naren for developing it). It is a Boggle solver. I now live in Brazil and would like to adapt it to Portuguese, or maybe Spanish, so I want to use my own dictionary (entries separated by line breaks, not commas). How can I replace the nltk dictionary with my own txt file in this code? I already tried converting it to CSV and opening it with open(), but nothing works u.u
Note: Dictionary with which I am testing: here
from collections import defaultdict
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Word list the trie is built from (the nltk `words` corpus).
english_words = words.words()
# To drop stop words, uncomment the two lines below
# stop_words = set(stopwords.words('english'))
# english_words = [w for w in english_words if w not in stop_words]
# The board to search, one inner list per row.
boggle = [
['c', 'n', 't', 's', 's'],
['d', 'a', 't', 'i', 'n'],
['o', 'o', 'm', 'e', 'l'],
['s', 'i', 'k', 'n', 'd'],
['p', 'i', 'c', 'l', 'e']
]
# Cells are addressed by row and column rather than by X and Y
# co-ordinates; lenc is the column count (taken from the first row) and
# lenr the row count.
lenc = len(boggle[0])
lenr = len(boggle)
# Root node of the trie data structure.
# NOTE(review): gen_trie and dfs store and look up children directly under
# their letter key; the 'next' entry is not read or written by the code shown.
trie_node = {'valid': False, 'next': {}}
# (row delta, column delta, arrow label) for each of the eight neighbours
neighbors_delta = [
(-1,-1, "↖"),
(-1, 0, "↑"),
(-1, 1, "↗"),
(0, -1, "←"),
(0, 1, "→"),
(1, -1, "↙"),
(1, 0, "↓"),
(1, 1, "↘"),
]
def gen_trie(word, node):
    """Insert `word` into the trie rooted at `node`, creating nodes as needed.

    Children are stored directly under their letter key, next to the node's
    own 'valid' and 'next' entries. The node reached by the last letter is
    marked valid even when it already exists, so a word that is a prefix of
    an earlier word ('cat' added after 'cats') is still recognised.
    An empty `word` leaves the trie unchanged.
    """
    # Walk iteratively: no per-letter slicing and no recursion-depth limit.
    for letter in word:
        if letter not in node:
            node[letter] = {'valid': False, 'next': {}}
        node = node[letter]
    if word:
        node['valid'] = True
def build_trie(words, trie):
    """Insert every entry of `words` into `trie` via gen_trie.

    The trie is modified in place and also returned.
    """
    for entry in words:
        gen_trie(entry, trie)
    return trie
def get_neighbors(r, c):
    """Return the on-board neighbours of cell (r, c).

    Each result is a (row, column, arrow) tuple, in the order of
    neighbors_delta; positions outside the lenr x lenc board are left out.
    """
    candidates = ((r + dr, c + dc, arrow) for dr, dc, arrow in neighbors_delta)
    return [
        (row, col, arrow)
        for row, col, arrow in candidates
        if 0 <= row < lenr and 0 <= col < lenc
    ]
def dfs(r, c, visited, trie, now_word, direction):
    """Depth-first search of the board from cell (r, c).

    `visited` lists the cells already on the current path, `trie` is the trie
    node reached so far, `now_word` the letters collected so far and
    `direction` the textual trail of arrows. Every valid word reached is
    printed together with its trail.
    """
    cell = (r, c)
    if cell in visited:
        return
    letter = boggle[r][c]
    # Recorded before the trie check, exactly as the caller's list expects.
    visited.append(cell)
    if letter not in trie:
        return
    subtrie = trie[letter]
    prefix = now_word + letter
    if subtrie['valid']:
        print('Found "{}" {}'.format(prefix, direction))
    for next_r, next_c, arrow in get_neighbors(r, c):
        # Each branch gets its own copy of the path so siblings do not see
        # one another's visited cells.
        dfs(next_r, next_c, list(visited), subtrie, prefix, direction + " " + arrow)
def main(trie_node):
    """Build the trie from english_words, print the board, then start a
    depth-first search from every cell of the board."""
    trie_node = build_trie(english_words, trie_node)
    # print the board
    print("Given board")
    for row_index in range(lenr):
        print(boggle[row_index])
    print('\n')
    for r in range(lenr):
        for c in range(lenc):
            start = 'directions from ({},{})({}) go '.format(r, c, boggle[r][c])
            dfs(r, c, [], trie_node, '', start)
if __name__ == '__main__':
main(trie_node)
#Again thanks to Naren https://stackoverflow.com/users/1193863/naren for developing this code<3
I'm not sure how to multiply a number following a string by the string. I want to find the RMM of a compound so I started by making a dictionary of RMMs then have them added together. My issue is with compounds such as H2O.
import re

name = input("Insert the name of a molecule/atom to find its RMM/RAM: ")
# Put a space in front of every capital letter so that each element symbol
# starts a new token.
compound = re.sub(r'([A-Z])', r' \1', name)
Compound = compound.split(' ')
# Raw string: '\d' inside a plain string literal is an invalid escape sequence.
r = re.split(r'(\d+)', compound)
For example:
When name = H2O
Compound = ['', 'H2', 'O']
r = ['H', '2', 'O']
I want to multiply 2 by H making a value "['H', 'H', 'O']."
TLDR: I want integers following names in a list to print the previously listed object 'x' amount of times (e.g. [O, 2] => O O, [C, O, 2] => C O O)
The question is somewhat complicated, so let me know if I can clarify it. Thanks.
How about the following, after you define compound:
# Each match is a (letters, digits) pair; digits is '' when no count follows.
# The class is [a-zA-Z], not [a-zA-z]: the A-z range would also admit the
# characters '[', '\', ']', '^', '_' and '`'.
test = re.findall(r'([a-zA-Z]+)(\d*)', compound)
expand = [a * int(b) if b else a for (a, b) in test]
Match on letters of 1 or more instances followed by an optional number of digits - if there's no digit we just return the letters, if there is a digit we duplicate the letters by the appropriate value. This doesn't quite return what you expected - it instead will return ['HH', 'O'] - so please let me know if this suits.
EDIT: assuming your compounds use elements consisting of either a single capital letter or a single capital followed by a number of lowercase letters, you can add the following:
# Re-split the expanded string into element symbols: one capital letter plus any lower-case letters after it.
final = re.findall('[A-Z][a-z]*', ''.join(expand))
Which will return your elements each as a separate entry in the list, e.g. ['H', 'H', 'O']
EDIT 2: with the assumption of my previous edit, we can actually reduce the whole thing down to just a couple of lines:
# NOTE: raw_input is the Python 2 spelling; on Python 3 call input() instead.
name = raw_input("Insert the name of a molecule/atom to find its RMM/RAM: ")
# (symbol, count) pairs; a symbol is one capital letter plus optional
# lower-case letters. [A-Z] rather than [A-z], which would also admit
# punctuation and lower-case letters as the leading character.
test = re.findall(r'([A-Z][a-z]*)(\d*)', name)
final = re.findall(r'[A-Z][a-z]*', ''.join([a * int(b) if b else a for (a, b) in test]))
You could probably do something like...
def expand_compound(compound):
    """Return the characters of `compound` with counts expanded.

    A run of digits that follows a character repeats that character so that
    it appears the stated number of times in total ('h2o' gives
    ['h', 'h', 'o']). Multi-digit counts are read as one number. Digits at
    the very start of the string have nothing to repeat and are copied
    through unchanged.
    """
    expanded = []
    pos = 0
    while pos < len(compound):
        ch = compound[pos]
        if ch.isdigit() and pos != 0:
            # Consume the whole run of digits so that '12' means twelve, not
            # a '1' followed by a '2'.
            end = pos
            while end < len(compound) and compound[end].isdigit():
                end += 1
            count = int(compound[pos:end])
            # One copy is already in the list, so add count - 1 more.
            expanded.extend(compound[pos - 1] * (count - 1))
            pos = end
        else:
            expanded.append(ch)
            pos += 1
    return expanded


compound = 'h2o'
final = expand_compound(compound)
Use regex and a generator function:
import re
def multilpy_string(seq):
    """Yield the letters of `seq` one at a time, with counts expanded.

    `seq` is an iterable of strings that is joined before matching. A letter
    followed by a number is yielded that many times; a letter on its own is
    yielded once. Counts may have more than one digit.
    """
    # Group 1: a letter with its count. Group 2: a letter on its own.
    regex = re.compile(r"([a-zA-Z][0-9]+)|([a-zA-Z])")
    for alnum, alpha in regex.findall(''.join(seq)):
        if alnum:
            # alnum[0] is the letter and alnum[1:] is the complete count.
            for char in alnum[0] * int(alnum[1:]):
                yield char
        else:
            yield alpha


l = ['C', 'O', '2']  # ['C', 'O', 'O']
print(list(multilpy_string(l)))
We join your list back together using ''.join. Then we compile a regex pattern that matches two types of strings in your list. If the string is a letter followed by a number, it is put in the first group. If it is a single letter, it is put in its own group. We then iterate over the matches. If we've found something in a group, we yield the correct values.
Here are a few nested for comprehensions to get it done in two lines:
In [1]: groups = [h*int(''.join(t)) if len(t) else h for h, *t in re.findall('[A-Z]\d*', 'H2O')]
In[2]: [c for cG in groups for c in cG]
Out[2]: ['H', 'H', 'O']
Note: I am deconstructing and reconstructing strings so this is probably not the most efficient method.
Here is a longer example:
In [2]: def findElements(molecule):
...: groups = [h*int(''.join(t)) if len(t) else h for h, *t in re.findall('[A-Z]\d*', molecule)]
...: return [c for cG in groups for c in cG]
In [3]: findElements("H2O5S7D")
Out[3]: ['H', 'H', 'O', 'O', 'O', 'O', 'O', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'D']
In python3 (I don't know about python2) you can simply multiply strings.
for example:
print("H"*2) # HH
print(2*"H") # HH
Proof that this information is useful:
r = ['H', '2', 'O']
# First pass: note where the counts sit before anything is modified.
replacements = []
for position, token in enumerate(r):
    if token.isdigit():
        replacements.append((position, int(token)))
# Second pass: a count of n means n copies in total, and one copy already
# sits just before the count, so the count is replaced by n - 1 more.
for position, repeat in replacements:
    r[position] = r[position - 1] * (repeat - 1)
# flatten the result: split every entry back into single characters
flattened = []
for chunk in r:
    flattened.extend(chunk)
r = flattened
print(r)  # ['H', 'H', 'O']