How to validate a word in Python?

I have a list in Python like this:

words = ['thatCreation', 'happeningso', '’', 'comebecause']

Question:
I want to split the merged words, for example:

'thatCreation' -> 'that', 'creation'
'happeningso'  -> 'happening', 'so'
'comebecause'  -> 'come', 'because'

Thanks in advance for solving it in Python.

It looks like you are trying to take words merged together in camel case and break them apart. There is a great algorithm called Viterbi segmentation that does this really well.
I can't explain the magic behind it, but I implemented it in my program recently and it works really well. My understanding is that it calculates the probability of each candidate word and splits on that. This algorithm can split words in any case.
import re
from collections import Counter

def word_prob(word): return dictionary[word] / total

def words(text): return re.findall('[a-z]+', text.lower())

dictionary = Counter(words(open(words_path).read()))
max_word_length = max(map(len, dictionary))
total = float(sum(dictionary.values()))

def viterbi_segment(text):
    probs, lasts = [1.0], [0]
    for i in range(1, len(text) + 1):
        prob_k, k = max((probs[j] * word_prob(text[j:i]), j)
                        for j in range(max(0, i - max_word_length), i))
        probs.append(prob_k)
        lasts.append(k)
    words = []
    i = len(text)
    while 0 < i:
        words.append(text[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    return words, probs[-1]

sentence = ' '.join(viterbi_segment('thatCreation'.lower())[0])
print('sentence: {0}'.format(sentence))
word = ''.join(a.capitalize() for a in re.split('([^a-zA-Z0-9])', sentence)
               if a.isalnum())
print('word: {0}'.format(word[0].lower() + word[1:]))
You need a dictionary with a ton of words; there are multiple out there, but I used:
https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt
and updated it with new words that it didn't have.
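A minimal sketch of fetching that list and wiring it to the words_path variable used above (urllib is standard library; the local filename is just an assumption):

import urllib.request

url = ('https://raw.githubusercontent.com/first20hours/'
       'google-10000-english/master/google-10000-english-no-swears.txt')
words_path = 'google-10000-english-no-swears.txt'
# Download the word list once; the Counter above is then built from this file.
urllib.request.urlretrieve(url, words_path)

Note that a plain one-word-per-line list gives every word a count of 1, so word_prob becomes uniform; building the Counter from a real corpus such as big.txt gives frequency-weighted, and usually better, segmentations.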

Borrowed from Peter Norvig's pytudes to perform word segmentation.
Please try:
import re
from collections import Counter
from typing import List, Tuple

# Notebook shell magic. Note that the vocabulary below is actually read from
# big.txt (https://norvig.com/big.txt), so this words.txt download is unused here.
!wget https://raw.githubusercontent.com/dwyl/english-words/master/words.txt

Word = str     # We implement words as strings
cat = ''.join  # Function to concatenate strings together

def tokens(text) -> List[Word]:
    """List all the word tokens (consecutive letters) in a text. Normalize to lowercase."""
    return re.findall('[a-z]+', text.lower())

TEXT = open('big.txt').read()
WORDS = tokens(TEXT)

class ProbabilityFunction:
    def __call__(self, outcome):
        """The probability of `outcome`."""
        if not hasattr(self, 'total'):
            self.total = sum(self.values())
        return self[outcome] / self.total

class Bag(Counter, ProbabilityFunction):
    """A bag of words."""

Pword = Bag(WORDS)

def Pwords(words: List[Word]) -> float:
    "Probability of a sequence of words, assuming each word is independent of others."
    return Π(Pword(w) for w in words)

def Π(nums) -> float:
    "Multiply the numbers together. (Like `sum`, but with multiplication.)"
    result = 1
    for num in nums:
        result *= num
    return result

def splits(text, start=0, end=20) -> List[Tuple[str, str]]:
    """Return a list of all (first, rest) pairs; start <= len(first) <= end."""
    return [(text[:i], text[i:])
            for i in range(start, min(len(text), end) + 1)]

def segment(text) -> List[Word]:
    """Return a list of words that is the most probable segmentation of text."""
    if not text:
        return []
    else:
        candidates = ([first] + segment(rest)
                      for (first, rest) in splits(text, 1))
        return max(candidates, key=Pwords)

strings = ['thatCreation', 'happeningso', 'comebecause']
[segment(string.lower()) for string in strings]
Output:
[['that', 'creation'], ['happening', 'so'], ['come', 'because']]

import re
from collections import Counter

def viterbi_segment(text):
    probs, lasts = [1.0], [0]
    for i in range(1, len(text) + 1):
        prob_k, k = max((probs[j] * word_prob(text[j:i]), j)
                        for j in range(max(0, i - max_word_length), i))
        probs.append(prob_k)
        lasts.append(k)
    words = []
    i = len(text)
    while 0 < i:
        words.append(text[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    return words, probs[-1]

def word_prob(word): return dictionary[word] / total

def words(text): return re.findall('[a-z]+', text.lower())

dictionary = Counter(words(open('big.txt').read()))
max_word_length = max(map(len, dictionary))
total = float(sum(dictionary.values()))

l = ['thatCreation', 'happeningso', 'comebecause']
for w in l:
    print(viterbi_segment(w.lower()))
Output:
(['that', 'creation'], 1.63869514118246e-07)
(['happening', 'so'], 1.1607123777400279e-07)
(['come', 'because'], 4.81658105705814e-07)
I got the solution to my problem from @Darius Bacon's answer; for this, you need to make all the strings lowercase.
Thank you guys for your help.
Visit this link to download big.txt:
https://norvig.com/big.txt
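big.txt can also be fetched from within Python (a one-line standard-library sketch):

import urllib.request
urllib.request.urlretrieve('https://norvig.com/big.txt', 'big.txt')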

Related

How to split a word into n-grams in Python?

I've got this question. I should split a word into n-grams (for example, the word ADVENTURE has three 4-grams: ADVE, ENTU, TURE). There is a book file document (that's the reason for the counter and the isalpha check) which I don't have here, so I'm using only a list of 2 words. This is my code in Python:
words = ['adven', 'adventure']

def ngrams(words, n):
    counter = {}
    for word in words:
        if (len(word) - 1) >= n:
            for i in range(0, len(word)):
                if word.isalpha() == True:
                    ngram = ""
                    for i in range(len(word)):
                        ngram += word[i:n:]
                        if len(ngram) == n:
                            ngram.join(counter)
                            counter[ngram] = counter.get(ngram, 0) + 1
    return counter

print(ngrams(words, 4))
This is what the code gives me:
{'adve': 14}
I don't care about the values in it, but I'm not so good with strings and I don't know what I should do to get the three 4-grams. I tried "ngram += word[i::]" but that gives me None. Please help me; this is my school homework and I can't do the other functions while this ngrams doesn't work.
Use nltk.ngrams for this job:
from nltk import ngrams
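For example, treating the word as a character sequence (a quick sketch; note that nltk.ngrams yields every overlapping n-gram, which differs from the one-character-overlap scheme described in the question):

from nltk import ngrams

# ngrams() accepts any sequence; a string is iterated character by character.
print([''.join(g) for g in ngrams('adventure', 4)])
# ['adve', 'dven', 'vent', 'entu', 'ntur', 'ture']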
I think the definition you have of n-grams is a little different from the conventional one, as pointed out by @Stuart in his comment. However, with the definition from your comment, I think the following solves your problem:
def n_grams(word, n):
    # We can't find n-grams if the word has less than n letters.
    if n > len(word):
        return []

    output = []
    start_idx = 0
    end_idx = start_idx + n

    # Grab all n-grams except the last one
    while end_idx < len(word):
        n_gram = word[start_idx:end_idx]
        output.append(n_gram)
        start_idx = end_idx - 1
        end_idx = start_idx + n

    # Grab the last n-gram
    last_n_gram_start = len(word) - n
    last_n_gram_end = len(word)
    output.append(word[last_n_gram_start:last_n_gram_end])

    return output
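With the definition from the question, this yields the expected three 4-grams:

print(n_grams('adventure', 4))
# ['adve', 'entu', 'ture']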
If I've understood the rules correctly, you can do it like this:

def special_ngrams(word, n):
    """ Yield character ngrams of word that overlap by only one character,
    except for the last two ngrams which may overlap by more than one
    character. The first and last ngrams of the word are always included. """
    for start in range(0, len(word) - n, n - 1):
        yield word[start:start + n]
    yield word[-n:]

for word in "hello there this is a test", "adventure", "tyrannosaurus", "advent":
    print(list(special_ngrams(word, 4)))

Difficulty creating random words under conditions

I need k words to be generated, until the sum of all the characters in the list is equal to or greater than 25.
import random

for x in range(k):
    n = ("a", "b", "c", "d")
    cc = [''.join(random.choice(n) for _ in range(random.choice(range(2, 5))))]
    print(cc)

def sumt(input1):
    l = list(input1)
    total = sum(len(i) for i in l)
    return int(total)

print(sumt([cc]))
You can use a loop when you have a variable number of iterations to do:
have a method that generates a word, then call it until you reach the desired total length.
chars = "abcd"
def new_word():
return ''.join(random.choice(chars) for _ in range(random.choice(range(2, 5))))
def generate(total_length):
result = []
result_length = 0
while result_length < total_length:
result.append(new_word())
result_length += len(result[-1]) # sum with len of last word
return result
x = generate(25)
print(x)
If I understand correctly, you want to build a list of words until the sum of all characters is >= 25? I prefer using classes...
import random

class WordList:
    def __init__(self):
        self.choices = ['a', 'b', 'c', 'd']
        self.threshold = 25
        self.char_sum = 0
        self.lst = []
        self.build_list()

    def build_list(self):
        '''Build a list of words until sum of all chars
        meets or exceeds the threshold.
        '''
        while self.char_sum < self.threshold:
            self.generate_word()
            self.char_sum = sum(len(i) for i in self.lst)

    def generate_word(self):
        '''Generate a single word with 2 to 4 characters.
        '''
        _word = ''.join(random.choice(self.choices) for _ in range(random.choice(range(2, 5))))
        self.lst.append(_word)
Usage:
new_list = WordList().lst
print(new_list)

How to use a given function to deliver many strings

I have my function randStr which spits out a random string with N characters.
def randStr(chars=string.ascii_uppercase + string.digits, N=4):
    return ''.join(random.choice(chars) for _ in range(N))
How do I make it so that it returns 1000 of these random and unique small strings?
(Edit: added uniqueness check)
You could use a while-loop to iterate until you get 1000 unique words. Note that the alphabet has to be big enough: random.choice picks with replacement, so an alphabet of c characters yields c**N possible strings, and with N=4 you need at least 6 characters (6**4 = 1296, while 5**4 = 625 is too few). The 8-character alphabet below gives 8**4 = 4096 possibilities.
import string
import random

def randStr(chars=string.ascii_uppercase + string.digits, N=4):
    return ''.join(random.choice(chars) for _ in range(N))

unique_words = []
while len(unique_words) < 1000:
    result = randStr('abcdefgh')
    if result not in unique_words:
        unique_words.append(result)
        print(result, end=', ')
print(len(unique_words))
Output:
cfae, gbca, fgfe, bdhg, decd, gcha, ddgc, babd, bggb, eghe, eeca, ebch, fbec, bgbe, gbbc, dgda, efec, hccd, bgfh, gdbf, ecac, edhd, cfdg, eacf, dgaa, heeb,
...
egbb, cbed, eefg, gdec, dgcg, cgag, fadc, effe, dahg, fhdb, 1000
Same answer as that of @black-raven, but with a more efficient uniqueness check.
Note the use of the set structure for guaranteed unique strings.
import string
import random

def randStr(chars=string.ascii_uppercase + string.digits, N=4):
    return ''.join(random.choice(chars) for _ in range(N))

strings = set()
while len(strings) < 1000:
    # The alphabet must admit at least 1000 distinct strings: the original
    # 'asdf' allows only 4**4 = 256, so the loop would never terminate.
    newString = randStr('abcdefgh')
    strings.add(newString)
strings = list(strings)
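A quick check of the result (the order varies between runs since sets are unordered):

print(len(strings))  # 1000
print(strings[:3])   # e.g. ['hgfe', 'abch', 'fdda']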

Contract words in Python with set length

I'm currently trying to make a sort of "word mixer": for two given words and the desired length specified, the program should return the "mix" of the two words. However, it can be any sort of mix: it can be the first half of the first word combined with the second half of the second word, it can be a random mix, anything really.
Examples:
fish + cake, length 5: fiske
dog + cat, length 4: doga
late + cross, length 6: losste
I've written some very sloppy code (seen below), and I'd appreciate some tips on what I'm doing wrong (since my outputs aren't really good) and whether there's anything that can be improved.
from random import randint

name1 = "domingues"
name2 = "signorelli"
names = [name1, name2]

# a list of the desired lengths
lengths = [5, 6, 7]
mixes = []

def sizes(size):
    if size == 5:
        letters1 = randint(2, 3)
    else:
        letters1 = randint(2, size - 2)
    letters2 = size - letters1
    return letters1, letters2

def mix(letters1, letters2):
    n = randint(0, 1)
    if n == 1:
        a = 0
    else:
        a = 1
    n1 = names[n]
    n2 = names[a]
    result = n1[0:letters2] + n2[-letters1::]
    return result

file = open("results.txt", "w+")
for leng in lengths:
    file.write("RESULTS WITH " + str(leng) + " LETTERS \n")
    file.write("\n")
    for i in range(10):
        let1, let2 = sizes(leng)
        result = mix(let1, let2)
        while result == name1 or result == name2:
            result = mix(let2)
        if result not in mixes:
            mixes.append(result)
    for m in mixes:
        if m not in file:
            file.write(m + " \n")
    file.write("\n")
file.close()
(Thanks for taking your time to help me btw, I appreciate it!)
In general, this is an AI-related problem, because we implicitly want to get readable mixed words.
I just wrote simple (and dirty) code that tries to catch sequences of vowels and consonants from training data and builds mixed words according to the caught rules.
import random

consonants_pat = 'BCDFGHJKLMNPQRSTVXZ'.lower()
vowels_pat = 'aeiouy'

train_data = '''
This our sentence to be used as a training dataset
It should be longer
'''

def build_mixer(train_data, num=3, mixed_len=(2, 4)):
    def _get_random_pattern(td, wlen):
        td_splitted = td.lower().split()
        while True:
            w = random.choice(list(filter(lambda x: len(x) >= wlen, td_splitted)))
            for j in range(len(w) - wlen):
                yield tuple(map(lambda x: 0 if x in vowels_pat else 1, w[j:j + wlen]))

    def _mixer(w1, w2, num=num, mixed_len=mixed_len):
        allowed_letters = w1.lower().strip() + w2.lower().strip()
        ind = 1
        for j in range(num):
            wlen = random.choice(range(*mixed_len))
            pattern = _get_random_pattern(train_data, wlen)
            _aux = allowed_letters
            word = ''
            try:
                for pat in pattern:
                    for k in pat:
                        if k == 0:
                            choiced = random.choice(list(filter(lambda x: x in vowels_pat, _aux)))
                        else:
                            choiced = random.choice(list(filter(lambda x: x in consonants_pat, _aux)))
                        word += choiced
                        l = list(_aux)
                        l.remove(choiced)
                        _aux = ''.join(l)
                    ind += 1
                    yield word
                    if ind > num:
                        # PEP 479: raising StopIteration inside a generator is an
                        # error in Python 3.7+, so return instead.
                        return
            except IndexError:
                continue

    return _mixer

mixer = build_mixer(train_data, num=6, mixed_len=(3, 6))
for mixed in mixer('this', 'horse'):
    print(mixed)
I got the following words:
het hetihs hetihssro sheo hsio tohir
I recommend taking a random slice of the first word and combining it with a random slice of the second word. Get len(word) and take a slice of each word randomly using random.randrange().
import random

def word_mixer(word1, word2):
    slice1 = word1[:random.randrange(2, len(word1))]
    slice2 = word2[:random.randrange(2, len(word2))]
    return slice1 + slice2

mixed = word_mixer('weasel', 'snake')
print(mixed)
Sample outputs from a few runs:
wesnak
weasesna
weassnak
Here's one way to do it.
import random

w1 = 'dog'
w2 = 'cat'
w3 = 'fish'
w4 = 'wolf'

def word_mixer(w1, w2, length):
    new_word = w1 + w2
    x = random.sample(range(len(new_word)), length)
    result = []
    for i in x:
        result.append(new_word[i])
    return "".join(result)

print(word_mixer(w3, w4, 4))
print(word_mixer(w2, w4, 5))
print(word_mixer(w2,w4,5))
Output:
lfwi
falwc
A somewhat smaller version of @AkshayNevrekar's post:
import random

w1 = 'dog'
w2 = 'cat'
w3 = 'fish'
w4 = 'wolf'

def word_mixer(w1, w2, length):
    return ''.join(random.sample(w1 + w2, length))

print(word_mixer(w3, w4, 4))
print(word_mixer(w2, w4, 5))
We can also use random.sample and pass the mixed string to it, like this:

import random

w1 = input("Enter first word: ")
w2 = input("Enter second word: ")
length = int(input("Enter length: "))  # renamed from len to avoid shadowing the built-in

mixed = w1 + w2

def wordmixer(mixed, length):
    return ''.join(random.sample(mixed, length))

print(wordmixer(mixed, length))

In a spell checker, how to get words that are 3 edits away (Norvig)

I have been trying to use a spell corrector on my database tables to correct addresses, for which I used http://norvig.com/spell-correct.html as a reference.
Using the Address_mast table as the collection of correct strings, I'm trying to correct addresses and update the corrected strings in "customer_master".
Address_mast
ID Address
1 sonal plaza,harley road,sw-309012
2 rose apartment,kell road, juniper, la-293889
3 plot 16, queen's tower, subbden - 399081
4 cognizant plaza, abs road, ziggar - 500234
Now, the reference code only handles words that are "two edits away from word", but I'm trying to do it for words 3 or even 4 edits away, and at the same time update the corrected words in the other table. Here is the table that contains the misspelled words to be updated:
Customer_master
Address_1
josely apartmt,kell road, juneeper, la-293889
zoonal plaza, harli road,sw-309012
plot 16, queen's tower, subbden - 399081
cognejantt pluza, abs road, triggar - 500234
Here is what I have tried:
import re
import pyodbc
import numpy as np
from collections import Counter

cnxn = pyodbc.connect('DRIVER={SQLServer};SERVER=localhost;DATABASE=DBM;UID=ADMIN;PWD=s#123;autocommit=True')
cursor = cnxn.cursor()
cursor.execute("select address as data from Address_mast")
data = []
for row in cursor.fetchall():
    data.append(row[0])
data = np.array(data)

def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('data').read()))

def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return WORDS[word] / N

def correction(word):
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or
            known(edits3(word)) or known(edits4(word)) or [word])

def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

def edits3(word):
    "All edits that are three edits away from `word`."
    return (e3 for e2 in edits2(word) for e3 in edits1(e2))

def edits4(word):
    "All edits that are four edits away from `word`."
    return (e4 for e3 in edits3(word) for e4 in edits1(e3))

sqlstr = ""
j = 0
k = 0
for i in data:
    sqlstr = " update customer_master set Address='" + correction(data) + "' where data=" + correction(data)
    cursor.execute(sqlstr)
    j = j + 1
    k = k + cursor.rowcount
cnxn.commit()
cursor.close()
cnxn.close()
print(str(k) + " Records Completed")
From this I'm unable to get the proper output; any suggestion on what changes should be made? Thanks in advance.
The above answers are OK, but there is a faster solution than checking the exponentially growing set of strings at edit distance k. Suppose we have a data structure that stores the set of all words in a tree (a trie). This is useful because we know, for example, that we need not search paths in which there are no words. This is both memory and computationally efficient.
Suppose we have a vocabulary stored in a set, dict, or, ideally, a collections.Counter object; then we can set up the data structure as follows:
class VocabTreeNode:
    def __init__(self):
        self.children = {}
        self.word = None

    def build(self, vocab):
        for w in vocab:
            self.insert(w)

    def insert(self, word):
        node = self
        for letter in word:
            if letter not in node.children:
                node.children[letter] = VocabTreeNode()
            node = node.children[letter]
        node.word = word
To search only the set of elements within edit distance k of a word, we can endow this structure with a recursive search:
# The following two methods also belong to VocabTreeNode.
def search(self, word, maxCost):
    currentRow = range(len(word) + 1)
    results = []
    for letter in self.children:
        self.searchRecursive(self.children[letter], letter,
                             word, currentRow, results,
                             maxCost)
    return results

def searchRecursive(self, node, letter, word, previousRow,
                    results, maxCost):
    columns = len(word) + 1
    currentRow = [previousRow[0] + 1]
    for column in range(1, columns):
        insertCost = currentRow[column - 1] + 1
        deleteCost = previousRow[column] + 1
        if word[column - 1] != letter:
            replaceCost = previousRow[column - 1] + 1
        else:
            replaceCost = previousRow[column - 1]
        currentRow.append(min(insertCost, deleteCost, replaceCost))
    # A word ending at this node within the cost budget is a match.
    if currentRow[-1] <= maxCost and node.word is not None:
        results.append((node.word, currentRow[-1]))
    # Only descend while some cell is still within budget (prunes the trie).
    if min(currentRow) <= maxCost:
        for next_letter in node.children:
            self.searchRecursive(node.children[next_letter], next_letter, word,
                                 currentRow, results, maxCost)
There is just one problem that I'm not sure how to overcome: transpositions are not valid as paths through the trie, so I'm not sure how to incorporate transpositions as edit distance 1 without a somewhat complicated hack.
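For what it's worth, here is a sketch of how that hack usually looks (the optimal string alignment variant of Damerau-Levenshtein; searchDL and searchRecursiveDL are illustrative names added here, not part of the code above). The idea is to carry the previous trie letter and the row from two levels up, and allow a fourth "swap" move:

# Two more hypothetical methods for VocabTreeNode, counting a transposition
# of adjacent letters as a single edit.
def searchDL(self, word, maxCost):
    results = []
    for letter in self.children:
        self.searchRecursiveDL(self.children[letter], letter, None, word,
                               list(range(len(word) + 1)), None,
                               results, maxCost)
    return results

def searchRecursiveDL(self, node, letter, prevLetter, word,
                      previousRow, prePreviousRow, results, maxCost):
    columns = len(word) + 1
    currentRow = [previousRow[0] + 1]
    for column in range(1, columns):
        insertCost = currentRow[column - 1] + 1
        deleteCost = previousRow[column] + 1
        replaceCost = previousRow[column - 1] + (word[column - 1] != letter)
        cost = min(insertCost, deleteCost, replaceCost)
        # Swap move: the last two candidate letters match the last two
        # query letters in reverse order.
        if (column > 1 and prevLetter is not None
                and word[column - 1] == prevLetter
                and word[column - 2] == letter):
            cost = min(cost, prePreviousRow[column - 2] + 1)
        currentRow.append(cost)
    if currentRow[-1] <= maxCost and node.word is not None:
        results.append((node.word, currentRow[-1]))
    if min(currentRow) <= maxCost:
        for next_letter in node.children:
            self.searchRecursiveDL(node.children[next_letter], next_letter,
                                   letter, word, currentRow, previousRow,
                                   results, maxCost)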
My corpus of words was 97,722 words (the dictionary file shipped with almost any Linux distro).
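For reference, a sketch of how such a corpus can be loaded and the tree built (assuming the usual /usr/share/dict/words location, which varies by distro):

words = open('/usr/share/dict/words').read().split()
V = VocabTreeNode()
V.build(words)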
from time import time, sleep

sleep(1)
start = time()
for i in range(100):
    x = V.search('elephant', 3)
print(time() - start)
>>> 17.5
This equates to an edit distance 3 search for this word every 0.175 seconds. Edit distance 4 searches ran in 0.377 seconds each, whereas chaining edits1 to reach higher edit distances will quickly run your system out of memory.
With the caveat of not easily handling transpositions, this is a fast, effective way of implementing a Norvig-type algorithm for high edit distances.
