Python: Find If Substring Exists in String Given Condition

Python: Find If Substring Exists in String Given Condition - python

I'm trying to optimize this solution for a function that accepts 2 arguments: fullstring and substring. The function will return True if the substring exists in the fullstring, and False if it does not. There is one special wildcard that could be entered in the substring that denotes 0 or 1 of the previous symbol, and there can be more than one wildcard in the substring.
For example, "a*" means "" or "a"
The solution I have works fine but I'm trying to reduce the number of for loops (3) and optimize for time complexity. Using regex is not permitted. Is there a more pythonic way to do this?
Current Solution:
def complex_search(fullstring, substring):
patterns = []
if "*" in substring:
index = substring.index("*")
patterns.append(substring[:index-1] + substring[index+1:])
patterns.append(substring[:index] + substring[index+1:])
else:
patterns.append(substring)
def check(s1, s2):
for a, b in zip(s1, s2):
if a != b:
return False
return True
for pattern in patterns:
for i in range(len(fullstring) - len(pattern) + 1):
if check(fullstring[i:i+len(pattern)], pattern):
return True
return False
>> print(complex_search("dogandcats", "dogs*andcats"))
>> True

Approach
Create all alternatives for the substring based upon '" in substring (can have zero or more '' in substring)
See Function combs(...) below
Use Aho-Corasick to check if one of the substring patterns is in the string. Aho-Corasick is a very efficient algorithm for checking if one or more substrings appear in a string and formed as the basis of the original Unix command fgrep.
For illustrative purposes a Python version of Aho-Corasik is used below, but a C implementation (with Python wrapper) is available at pyahocorasick for higher performance.
See class Aho-Corasick below.
Code
# Note: This is a modification of code explained in https://carshen.github.io/data-structures/algorithms/2014/04/07/aho-corasick-implementation-in-python.html
from collections import deque
class Aho_Corasick():
def __init__(self, keywords):
self.adj_list = []
# creates a trie of keywords, then sets fail transitions
self.create_empty_trie()
self.add_keywords(keywords)
self.set_fail_transitions()
def create_empty_trie(self):
""" initalize the root of the trie """
self.adj_list.append({'value':'', 'next_states':[],'fail_state':0,'output':[]})
def add_keywords(self, keywords):
""" add all keywords in list of keywords """
for keyword in keywords:
self.add_keyword(keyword)
def find_next_state(self, current_state, value):
for node in self.adj_list[current_state]["next_states"]:
if self.adj_list[node]["value"] == value:
return node
return None
def add_keyword(self, keyword):
""" add a keyword to the trie and mark output at the last node """
current_state = 0
j = 0
keyword = keyword.lower()
child = self.find_next_state(current_state, keyword[j])
while child != None:
current_state = child
j = j + 1
if j < len(keyword):
child = self.find_next_state(current_state, keyword[j])
else:
break
for i in range(j, len(keyword)):
node = {'value':keyword[i],'next_states':[],'fail_state':0,'output':[]}
self.adj_list.append(node)
self.adj_list[current_state]["next_states"].append(len(self.adj_list) - 1)
current_state = len(self.adj_list) - 1
self.adj_list[current_state]["output"].append(keyword)
def set_fail_transitions(self):
q = deque()
child = 0
for node in self.adj_list[0]["next_states"]:
q.append(node)
self.adj_list[node]["fail_state"] = 0
while q:
r = q.popleft()
for child in self.adj_list[r]["next_states"]:
q.append(child)
state = self.adj_list[r]["fail_state"]
while (self.find_next_state(state, self.adj_list[child]["value"]) == None
and state != 0):
state = self.adj_list[state]["fail_state"]
self.adj_list[child]["fail_state"] = self.find_next_state(state, self.adj_list[child]["value"])
if self.adj_list[child]["fail_state"] is None:
self.adj_list[child]["fail_state"] = 0
self.adj_list[child]["output"] = self.adj_list[child]["output"] + self.adj_list[self.adj_list[child]["fail_state"]]["output"]
def get_keywords_found(self, line):
""" returns keywords in trie from line """
line = line.lower()
current_state = 0
keywords_found = []
for i, c in enumerate(line):
while self.find_next_state(current_state, c) is None and current_state != 0:
current_state = self.adj_list[current_state]["fail_state"]
current_state = self.find_next_state(current_state, c)
if current_state is None:
current_state = 0
else:
for j in self.adj_list[current_state]["output"]:
yield {"index":i-len(j) + 1,"word":j}
def pattern_found(self, line):
''' Returns true when the pattern is found '''
return next(self.get_keywords_found(line), None) is not None
def combs(word, n = 0, path = ""):
''' Generate all combinations of words with star
e.g. list(combs("he*lp*")) = ['help', 'helpp', 'heelp', 'heelpp']
'''
if n == len(word):
yield path
elif word[n] == '*':
# Next letter
yield from combs(word, n+1, path) # don't add * to path
else:
if n < len(word) - 1 and word[n+1] == '*':
yield from combs(word, n+1, path) # Not including letter at n
yield from combs(word, n+1, path + word[n]) # including letter at n
Test
patterns = combs("dogs*andcats") # ['dogandcats', 'dogsandcats']
aho = Aho_Corasick(patterns) # Aho-Corasick structure to recognize patterns
print(aho.pattern_found("dogandcats")) # Output: True
print(aho.pattern_found("dogsandcats")) # Output: True

Related

Algorithm to remove words from a trie whose frequency < 5 and length > 15

I have a huge trie dictionary that I built from data from web. Although it is just 5MB when I write the trie into a file its' size is so big when I load it on the memory (more than 100 MB). So I've to compress the trie.
I am facing difficulties in writing a recursive function (preferably runs in linear time like a DFS) to remove the words whose frequency is < 5 and length > 15. Any help is appreciated
Here is my trie structure.
class TrieNode:
def __init__(self):
self.ch = '|'
self.score = 0
self.childs = [None]*26
self.isWord = False
class Trie:
def __init__(self):
self.root = TrieNode('$')
#staticmethod
def print_trie(node, level):
if node is None:
return
print(node.ch, " ", level, " ", node.isWord)
for i in range(26):
Trie.print_trie(node.childs[i], level+1)
def insert(self, word):
word = word.lower()
if not is_valid(word):
return
childs = self.root.childs
i = 0
while i < len(word):
idx = to_int(word[i])
if childs[idx] is not None:
t = childs[idx]
else:
t = TrieNode(word[i])
childs[idx] = t
childs = t.childs
if i == len(word)-1:
t.isWord = True
t.score += 1
i += 1
def search_node(self, word):
word = word.lower()
if not is_valid(word):
return False, 0
if self.root is None or word is None or len(word) == 0:
return False, 0
children = self.root.childs
for i in range(len(word)):
idx = to_int(word[i])
if children[idx] is not None:
t = children[idx]
children = t.childs
else:
return False, 0
if t.isWord:
return True, t.score
else:
return False, t.score

The following method takes a node and its level (initially pass in root and 0) and returns True if the node should remain alive after pruning and False if the node should be removed from the trie (with its subtrie).
def prune(node, level):
if node is None:
return False
canPruneNode = True
for idx in xrange(len(node.children)):
# If any of the children remains alive, don't prune current node.
if prune(children[idx], level + 1):
canPruneNode = False
else:
# Remove dead child.
node.children[idx] = None
if node.isWord and level > 15 and node.score < 5:
node.isWord = False
# Current node should be removed if and only if all of its children
# were removed and it doesn't represent a word itself after pruning.
return node.isWord or not canPruneNode

I am not sure if removing will solve the problem. The space consumed is not because of the words but because of the 26 children every node has.
Eg. I have a word cat with frequency 30 & there's another word cater whose frequency is 10. So, if you delete the node for t in cat then all the subsequent nodes will be deleted (that is cater will be reduced to cat)
So, removing a word from Trie means nothing but setting its score to 0.

palindromic substrings recursive solution using global variable

Trying this with the recursive solution:
Recursively I am creating all the substrings and checking if it is palindrome or not.
Problem is that I want to get rid of global variable count.
class Solution(object):
def countSubstrings(self, s):
"""
:type s: str
:rtype: int
"""
def palin(s):
if s == s[::-1]:
return True
return False
global count
count = 0
def helper(s, cur, dp):
global count
ret = 0
if cur >= len(s)-1:
return 0
if cur in dp:
return
for i in range(cur+1, len(s)):
if palin(s[cur:i+1]):
count += 1
ret = helper(s, i, dp)
else:
ret = helper(s, i, dp)
dp[cur] = ret
helper(s, 0, {})
return count + len(s)
What I have tried so far:
def helper(s, cur, dp, count):
ret = 0
if cur >= len(s)-1:
return count
if cur in dp:
return dp[cur]
for i in range(cur+1, len(s)):
if palin(s[cur:i+1]):
ret = helper(s, i, dp, count + 1)
else:
ret = helper(s, i, dp, count)
dp[cur] = ret
return dp[cur]

Just pass your count variable in your recursive helper function (and increment as necessary).

You can make a new Counter class to keep track of your count
class Counter:
def __init__(self):
self.count = 0
def __add__(self,num):
self.count+=num
return self
Then modify your code to use that counter
class Solution(object):
def countSubstrings(self, s):
"""
:type s: str
:rtype: int
"""
def palin(s):
if s == s[::-1]:
return True
return False
def helper(s, cur, dp,count): #make a parameter for Counter
ret = 0
if cur >= len(s)-1:
return 0
if cur in dp:
return
for i in range(cur+1, len(s)):
if palin(s[cur:i+1]):
count+=1
ret = helper(s, i, dp,count) #pass in the Counter
else:
ret = helper(s, i, dp,count) #pass in here as well
dp[cur] = ret
a = Counter() #Change here
helper(s, 0, {},a) #Change here
return a.count + len(s) #Change here
The way you designed your counting and recursion, you have no choice but to use a mutable object to keep track of the count. And of course there are better ways to do recursion for this problem.

Here is a version with as much recursion as I could think of (including your palindrome function):
class Solution(object):
def countSubstrings(self, s):
"""
:type s: str
:rtype: int
"""
def palin(s):
"""recursively checks if bookends match on narrower substrings"""
if len(s) <= 1:
return True
else:
# checks if bookends match and inner substring is a palindrome
return (s[0] == s[-1]) & palin(s[1:-1])
def first_char_palin_count(s):
"""counts palindromes of all substrings with first char (pos 0)
e.g. will check: "abba", "abb", "ab", "a", "" in palin()
"""
if len(s) <= 0:
return 0
# if s is palindrome + shorter palindromes with first char
else:
return palin(s) + first_char_palin_count(s[:-1])
def helper(s):
"""counts palindromes in all substrings"""
if len(s) <= 0:
return 0
else:
# first char palindromes + palindromes not including first char
return first_char_palin_count(s) + helper(s[1:])
return helper(s)
Notice:
I need 2 functions in my recursion:
one to handle all substrings that include the first character (calls itself)
another (called helper to match yours) to handle all substrings (with and without first character)
I don't need to pass anything but substrings around (no count variable global or local!), because the recursion implicitly the results of all its subproblems (substrings in this case).

Counting word strokes while parsing Trie tree

I'm trying to solve the keyboard autocompletion problem described here.
The problem is to calculate how many keystrokes a word requires, given some dictionary and autocomplete rules. For example, for the dictionary:
data = ['hello', 'hell', 'heaven', 'goodbye']
We get the following results (please refer to the link above for further explanations):
{'hell': 2, 'heaven': 2, 'hello': 3, 'goodbye': 1}
Quick explanation: if the user types h, then e is autocompleted because all words starting with h also have e as second letter. Now if the user types in l, the other l is filled, giving 2 strokes for the word hell. Of course, hello would require one more stroke. Please, see the link above for more examples.
My Trie code is the following, and it works fine (taken from https://en.wikipedia.org/wiki/Trie). The Stack code is to parse the tree from root (see edit below):
class Stack(object):
def __init__(self, size):
self.data = [None]*size
self.i = 0
self.size = size
def pop(self):
if self.i == 0:
return None
item = self.data[self.i - 1]
self.i-= 1
return item
def push(self, item):
if self.i >= self.size:
return None
self.data[self.i] = item
self.i+= 1
return item
def __str__(self):
s = '# Stack contents #\n'
if self.i == 0:
return
for idx in range(self.i - 1, -1, -1):
s+= str(self.data[idx]) + '\n'
return s
class Trie(object):
def __init__(self, value, children):
self.value = value #char
self.children = children #{key, trie}
class PrefixTree(object):
def __init__(self, data):
self.root = Trie(None, {})
self.data = data
for w in data:
self.insert(w, w)
def insert(self, string, value):
node = self.root
i = 0
n = len(string)
while i < n:
if string[i] in node.children:
node = node.children[string[i]]
i = i + 1
else:
break
while i < n:
node.children[string[i]] = Trie(string[:i], {})
node = node.children[string[i]]
i = i + 1
node.value = value
def find(self, key):
node = self.root
for char in key:
if char in node.children:
node = node.children[char]
else:
return None
return node
I couldn't figure it out how to count the number of strokes:
data = ['hello', 'hell', 'heaven', 'goodbye']
tree = PrefixTree(data)
strokes = {w:1 for w in tree.data} #at least 1 stroke is necessary
And here's the code to parse the tree from the root:
stack = Stack(100)
stack.push((None, pf.root))
print 'Key\tChilds\tValue'
print '--'*25
strokes = {}
while stack.i > 0:
key, curr = stack.pop()
# if something:
#update strokes
print '%s\t%s\t%s' % (key, len(curr.children), curr.value)
for key, node in curr.children.items():
stack.push((key, node))
print strokes
Any idea or constructive comment would help, thanks!
Edit
Great answer by #SergiyKolesnikov. There's one small change that can be done in order to avoid the call to endsWith(). I just added a boolean field to the Trie class:
class Trie(object):
def __init__(self, value, children, eow):
self.value = value #char
self.children = children #{key, trie}
self.eow = eow # end of word
And at the end of insert():
def insert(self, string, value):
#...
node.value = value
node.eow = True
Then just replace curr.value.endswith('$'): with curr.eow. Thank you all!

The trie for your example can look like this
' '
| \
H G
| |
E O
| \ |
L A O
| | |
L$ V D
| | |
O E B
| |
N Y
|
E
What nodes in the trie can be seen as markers for user key strokes? There are two types of such nodes:
Inner nodes with more than one child, because the user has to choose among multiple alternatives.
Nodes that represent the last letter of a word, but are not leaves (marked with $), because the user has to type the next letter if the current word is not what is needed.
While traversing the trie recursively one counts how many of these marker nodes were encountered before the last letter of a word was reached. This count is the number of strokes needed for the word.
For the word "hell" it is two marker nodes: ' ' and E (2 strokes).
For the word "hello" it is three marker nodes: ' ', E, L$ (3 strokes).
And so on...
What needs to be changed in your implementation:
The end of a valid word needs to be marked in the tree, so that the second condition can be checked. Therefore, we change the last line of the PrefixTree.insert() method from
node.value = value
to
node.value = value + '$'
Now we add a stroke counter for each stack item (the last value in the triple pushed to the stack) and the checks that increase the counter:
stack = Stack(100)
stack.push((None, tree.root, 0)) # We start with stroke counter = 0
print('Key\tChilds\tValue')
print('--'*25)
strokes = {}
while stack.i > 0:
key, curr, stroke_counter = stack.pop()
if curr.value is not None and curr.value.endswith('$'):
# The end of a valid word is reached. Save the word and the corresponding stroke counter.
strokes[curr.value[:-1]] = stroke_counter
if len(curr.children) > 1:
# Condition 2 is true. Increase the stroke counter.
stroke_counter += 1
if curr.value is not None and curr.value.endswith('$') and len(curr.children) > 0:
# Condition 1 is true. Increase the stroke counter.
stroke_counter += 1
print('%s\t%s\t%s' % (key, len(curr.children), curr.value))
for key, node in curr.children.items():
stack.push((key, node, stroke_counter)) # Save the stroke counter
print(strokes)
Output:
Key Childs Value
--------------------------------------------------
None 2 None
h 1
e 2 h
a 1 he
v 1 hea
e 1 heav
n 0 heaven$
l 1 he
l 1 hell$
o 0 hello$
g 1
o 1 g
o 1 go
d 1 goo
b 1 good
y 1 goodb
e 0 goodbye$
{'heaven': 2, 'goodbye': 1, 'hell': 2, 'hello': 3}

While you go through your stack, you should keep a stroke counter for each node:
It begins at 0 for None.
If the current node has more than 2 children, the counter of the
children will be 1 more than the current counter.
If the current value is a valid word and has at least one child, the
counter of the child(ren) will be 1 more than the current counter.
For documentation purpose, here's my Ruby answer :
class Node
attr_reader :key, :children
attr_writer :final
def initialize(key, children = [])
#key = key
#children = children
#final = false
end
def final?
#final
end
end
class Trie
attr_reader :root
def initialize
#root = Node.new('')
end
def add(word)
node = root
word.each_char.each{|c|
next_node = node.children.find{|child| child.key == c}
if next_node then
node = next_node
else
next_node = Node.new(c)
node.children.push(next_node)
node = next_node
end
}
node.final = true
end
def count_strokes(node=root,word="",i=0)
word=word+node.key
strokes = {}
if node.final? then
strokes[word]=i
if node.children.size>0 then
i+=1
end
elsif node.children.size>1 then
i+=1
end
node.children.each{|c|
strokes.merge!(count_strokes(c, word, i))
}
strokes
end
end
data = ['hello', 'hell', 'heaven', 'goodbye']
trie = Trie.new
data.each do |word|
trie.add(word)
end
# File.readlines('/usr/share/dict/british-english').each{|line|
# trie.add line.strip
# }
puts trie.count_strokes
#=> {"hell"=>2, "hello"=>3, "heaven"=>2, "goodbye"=>1}
60 lines only, and it take less than 3 seconds for 100 000 words.

Is there something wrong with this while-loop?

When I execute this code, it prints 'Constructed', meaning it executed Trie Construction - then my terminal outputs nothing, it doesn't return or print any error, it's just blank, as if it's still working on the problem. Is there something wrong with the while loop? Is it that the 'trie' is an external variable?
trie is a list of nodes, a class I defined.
class node:
def __init__(self, parent, daughters, edge):
self.parent = parent
self.daughters = daughters
self.edge = edge
trie.append(self)
self.index = len(trie) - 1
patterns is a list of fixed strings.
def TrieConstruction(patterns, trie):
trie.append(node(0, [], 0))
for pattern in patterns:
currentNode = trie[0]
for base in pattern:
for daughter in currentNode.daughters:
if base == daughter.edge:
currentNode = daughter
break
else:
trie.append(node(currentNode, [], base))
currentNode = trie[-1]
print('Constructed.')
return
def PrefixTrieMatching(text, trie):
v = trie[0]
for index, base in enumerate(text):
if v.daughters == []:
pattern_out = []
climb(v.index)
return ''.join(pattern_out)
else:
for daughter in v.daughters:
if base == daughter.edge:
v = daughter
break
else:
print('No matches found.')
return
def climb(index):
if index == 0:
return
else:
pattern_out.append(node.edge)
climb(trie[index].parent)
def TrieMatching(text, trie):
while text != []:
PrefixTrieMatching(text, trie)
text = text[0:len(text) - 2]
print('Complete.')
return
print('Next, we generate a trie with the patterns, and then run the text over the trie to search for matches.')
trie = []
TrieConstruction(patterns, trie)
TrieMatching(text, trie)

EDIT:
Disregard my previous answer, if you are entering a string as text, it should be:
while text != "":
PrefixTrieMatching(text, trie)
text = text[0:len(text) - 2]
as the string would never be an empty list

You are doing more work than needed, just use while text which will return False only for an empty string and just slice your string slicing two chars from the end at a time:
def TrieMatching(text, trie):
while text:
PrefixTrieMatching(text, trie)
text = text[:-2]
An empty list, str, dict etc will always evaluate to False so you don't ever need to explicitly check if my_list != [], if my_str != "", if my_list and if my_str etc.. is sufficient.

How to implement the remove function of a trie in python?

I've read the following implementation of the trie in python:
https://stackoverflow.com/a/11016430/2225221
and tried to make the remove fnction for it.
Basically, I had problems even with the start: If you want to remove a word from a trie, it can has sub-"words", or it can be "subword" of another word.
If you remove with "del dict[key]", you are removing these above mentioned two kinds of words also.
Could anyone help me in this, how to remove properly the chosen word (let us presume it's in the trie)

Basically, to remove a word from the trie (as it is implemented in the answer you linked to), you'd just have to remove its _end marker, for example like this:
def remove_word(trie, word):
current_dict = trie
for letter in word:
current_dict = current_dict.get(letter, None)
if current_dict is None:
# the trie doesn't contain this word.
break
else:
del current_dict[_end]
Note however that this doesn't ensure that the trie has its minimal size. After deleting the word, there may be branches in the trie left that are no longer used by any words. This doesn't affect the correctness of the data structure, it just means that the trie may consume more memory than absolutely necessary. You could improve this by iterating backwards from the leaf node and delete branches until you find one that has more than one child.
EDIT: Here's an idea how you could implement a remove function that also culls any unnecessary branches. There's probably a more efficient way to do it, but this might get you started:
def remove_word2(trie, word):
current_dict = trie
path = [current_dict]
for letter in word:
current_dict = current_dict.get(letter, None)
path.append(current_dict)
if current_dict is None:
# the trie doesn't contain this word.
break
else:
if not path[-1].get(_end, None):
# the trie doesn't contain this word (but a prefix of it).
return
deleted_branches = []
for current_dict, letter in zip(reversed(path[:-1]), reversed(word)):
if len(current_dict[letter]) <= 1:
deleted_branches.append((current_dict, letter))
else:
break
if len(deleted_branches) > 0:
del deleted_branches[-1][0][deleted_branches[-1][1]]
del path[-1][_end]
Essentially, it first finds the "path" to the word that is about to be deleted and then iterates through that backwards to find nodes that can be removed. It then removes the root of the path that can be deleted (which also implicitly deletes the _end node).

I think it is better to do it recursively, code as following:
def remove(self, word):
self.delete(self.tries, word, 0)
def delete(self, dicts, word, i):
if i == len(word):
if 'end' in dicts:
del dicts['end']
if len(dicts) == 0:
return True
else:
return False
else:
return False
else:
if word[i] in dicts and self.delete(dicts[word[i]], word, i + 1):
if len(dicts[word[i]]) == 0:
del dicts[word[i]]
return True
else:
return False
else:
return False

def remove_a_word_util(self, word, idx, node):
if len(word) == idx:
node.is_end_of_word = False
return bool(node.children)
ch = word[idx]
if ch not in node.children:
return True
flag = self.remove_a_word_util(word, idx+1, node.children[ch])
if flag:
return True
node.children.pop(ch)
return bool(node.children) or node.is_end_of_word

One method of handling structures like this is through recursion. The great thing about recursion in this case is that it zips to the bottom of the trie, then passes the returned values back up through the branches.
The following function does just that. It goes to the leaf and deletes the _end value, just in case the input word is a prefix of another. It then passes up a boolean (boo) which indicates that the current_dict is still in an outlying branch. Once we hit a point where the current dict has more than one child, we delete the appropriate branch and set boo to False so each remaining recursion will do nothing.
def trie_trim(term, trie=SYNONYMS, prev=0):
# checks that we haven't hit the end of the word
if term:
first, rest = term[0], term[1:]
current_length = len(trie)
next_length, boo = trie_trim(rest, trie=trie[first], prev=current_length)
# this statement avoids trimming excessively if the input is a prefix because
# if the word is a prefix, the first returned value will be greater than 1
if boo and next_length > 1:
boo = False
# this statement checks for the first occurrence of the current dict having more than one child
# or it checks that we've hit the bottom without trimming anything
elif boo and (current_length > 1 or not prev):
del trie[first]
boo = False
return current_length, boo
# when we do hit the end of the word, delete _end
else:
del trie[_end]
return len(trie) + 1, True

A bit of a long one, but I hope this helps answer your question:
class Trie:
WORD_END = "$"
def __init__(self):
self.trie = {}
def insert(self, word):
cur = self.trie
for char in word:
if char not in cur:
cur[char] = {}
cur = cur[char]
cur[Trie.WORD_END] = word
def delete(self, word):
def _delete(word, cur_trie, i=0):
if i == len(word):
if Trie.WORD_END not in cur_trie:
raise ValueError("'%s' is not registered in the trie..." %word)
cur_trie.pop(Trie.WORD_END)
if len(cur_trie) > 0:
return False
return True
if word[i] not in cur_trie:
raise ValueError("'%s' is not registered in the trie..." %word)
cont = _delete(word, cur_trie[word[i]], i+1)
if cont:
cur_trie.pop(word[i])
if Trie.WORD_END in cur_trie:
return False
return True
return False
_delete(word, self.trie)
t = Trie()
t.insert("bar")
t.insert("baraka")
t.insert("barakalar")
t.delete("barak") # raises error as 'barak' is not a valid WORD_END although it is a valid path.
t.delete("bareka") # raises error as 'e' does not exist in the path.
t.delete("baraka") # deletes the WORD_END of 'baraka' without deleting any letter as there is 'barakalar' afterwards.
t.delete("barakalar") # deletes until the previous word (until the first Trie.WORD_END; "$" - by going backwards with recursion) in the same path (until 'baraka').

In case you need the whole DS:
class TrieNode:
def __init__(self):
self.children = {}
self.wordCounter = 0
self.prefixCounter = 0
class Trie:
def __init__(self):
self.root = TrieNode()
def insert(self, word: str) -> None:
node = self.root
for char in word:
if char not in node.children:
node.children[char] = TrieNode()
node.prefixCounter += 1
node = node.children[char]
node.wordCounter += 1
def countWordsEqualTo(self, word: str) -> int:
node = self.root
if node.children:
for char in word:
node = node.children[char]
else:
return 0
return node.wordCounter
def countWordsStartingWith(self, prefix: str) -> int:
node = self.root
if node.children:
for char in prefix:
node = node.children[char]
else:
return 0
return node.prefixCounter
def erase(self, word: str) -> None:
node = self.root
for char in word:
if node.children:
node.prefixCounter -= 1
node = node.children[char]
else:
return None
node.wordCounter -= 1
if node.wordCounter == 0:
self.dfsRemove(self.root, word, 0)
def dfsRemove(self, node: TrieNode, word: str, idx: int) -> None:
if len(word) == idx:
node.wordCounter = 0
return
char = word[idx]
if char not in node.children:
return
self.dfsRemove(node.children[char], word, idx+1)
node.children.pop(char)
trie = Trie();
trie.insert("apple"); #// Inserts "apple".
trie.insert("apple"); #// Inserts another "apple".
print(trie.countWordsEqualTo("apple")) #// There are two instances of "apple" so return 2.
print(trie.countWordsStartingWith("app")) #// "app" is a prefix of "apple" so return 2.
trie.erase("apple") #// Erases one "apple".
print(trie.countWordsEqualTo("apple")) #// Now there is only one instance of "apple" so return 1.
print(trie.countWordsStartingWith("app")) #// return 1
trie.erase("apple"); #// Erases "apple". Now the trie is empty.
print(trie.countWordsEqualTo("apple")) #// return 0
print(trie.countWordsStartingWith("app")) #// return 0

I would argue that this implementation is the most succinct and easiest to understand after a bit of staring.
def removeWord(word, node=None):
if not node:
node = self.root
if word == "":
node.isEnd = False
return
newnode = node.children[word[0]]
removeWord(word[1:], newnode)
if not newnode.isEnd and len(newnode.children) == 0:
del node.children[word[0]]
Although it's a little tricky to understand with the default parameter node=None at first, this is the most succinct implementation of a Trie removal that handles marking the word node.isEnd = False while also pruning extraneous nodes.
The method is first called as Trie.removeWord("ToBeDeletedWord").
In subsequent recursion calls, a node tied to the corresponding letter ("T" then "o" then "B" then "e" etc. etc.) is added to the next recursion (e.g "remove 'oBeDeletedWord' with the node at T").
Once we hit the end node that has the full string ToBeDeletedWord , the last recursion calls removeWord("", <node d>)
In this last recursion call, we mark node.isEnd = False. Later, the node is no longer marked isEnd and it has no children so we can call the delete operator.
Once that last recursion call ends, the rest of the recursions (e.g TobeDeletedWor, TobeDeletedWo, TobeDeletedW, etc. etc.) will then observe that it too is not an end node and there are no more children. These nodes will also delete.
You will have to read this a couple of times but this implementation is concise, readable, and correct. The difficulty is that the recursion happens midfunction rather than at the beginning or end.

TL;DR
class TrieNode:
children: dict[str, "TrieNode"]
def __init__(self) -> None:
self.children = {}
self.end = False
def __contains__(self, char: str) -> bool:
return char in self.children
def __getitem__(self, __name: str) -> "TrieNode":
return self.children[__name]
def __setitem__(self, __name: str, __value: "TrieNode") -> None:
self.children[__name] = __value
def __len__(self):
return len(self.children)
def __delitem__(self, __name: str):
del self.children[__name]
class Trie:
def __init__(self, words: list[str]) -> None:
self.root = TrieNode()
for w in words:
self.insert(w)
def insert(self, word: str):
curr = self.root
for c in word:
curr = curr.children.setdefault(c, TrieNode())
curr.end = True
def remove(self, word: str):
def _remove(node: TrieNode, index: int):
if index >= len(word):
node.end = False
if not node.children:
return True
elif word[index] in node:
if _remove(node[word[index]], index + 1):
del node[word[index]]
_remove(self.root, 0)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python: Find If Substring Exists in String Given Condition - python

Related

Algorithm to remove words from a trie whose frequency < 5 and length > 15

palindromic substrings recursive solution using global variable

Counting word strokes while parsing Trie tree

Is there something wrong with this while-loop?

How to implement the remove function of a trie in python?

Categories

Resources