Is there something wrong with this while-loop?

Is there something wrong with this while-loop? - python

When I execute this code, it prints 'Constructed', meaning it executed Trie Construction - then my terminal outputs nothing, it doesn't return or print any error, it's just blank, as if it's still working on the problem. Is there something wrong with the while loop? Is it that the 'trie' is an external variable?
trie is a list of nodes, a class I defined.
class node:
def __init__(self, parent, daughters, edge):
self.parent = parent
self.daughters = daughters
self.edge = edge
trie.append(self)
self.index = len(trie) - 1
patterns is a list of fixed strings.
def TrieConstruction(patterns, trie):
trie.append(node(0, [], 0))
for pattern in patterns:
currentNode = trie[0]
for base in pattern:
for daughter in currentNode.daughters:
if base == daughter.edge:
currentNode = daughter
break
else:
trie.append(node(currentNode, [], base))
currentNode = trie[-1]
print('Constructed.')
return
def PrefixTrieMatching(text, trie):
v = trie[0]
for index, base in enumerate(text):
if v.daughters == []:
pattern_out = []
climb(v.index)
return ''.join(pattern_out)
else:
for daughter in v.daughters:
if base == daughter.edge:
v = daughter
break
else:
print('No matches found.')
return
def climb(index):
if index == 0:
return
else:
pattern_out.append(node.edge)
climb(trie[index].parent)
def TrieMatching(text, trie):
while text != []:
PrefixTrieMatching(text, trie)
text = text[0:len(text) - 2]
print('Complete.')
return
print('Next, we generate a trie with the patterns, and then run the text over the trie to search for matches.')
trie = []
TrieConstruction(patterns, trie)
TrieMatching(text, trie)

EDIT:
Disregard my previous answer, if you are entering a string as text, it should be:
while text != "":
PrefixTrieMatching(text, trie)
text = text[0:len(text) - 2]
as the string would never be an empty list

You are doing more work than needed, just use while text which will return False only for an empty string and just slice your string slicing two chars from the end at a time:
def TrieMatching(text, trie):
while text:
PrefixTrieMatching(text, trie)
text = text[:-2]
An empty list, str, dict etc will always evaluate to False so you don't ever need to explicitly check if my_list != [], if my_str != "", if my_list and if my_str etc.. is sufficient.

Related

Find a prefix based on trie search | | python

Problem :
Given a list of business_names (strings) and a searchTerm (string).
Return a list of business_names that contains searchTerm as prefix in the business_names.
Example 1.
Input:
business_names[] = { "burger king", "McDonald's", "super duper burger's", "subway", "pizza hut"}
searchTerm = "bur"
Ouput:
["burger king", "super duper burger's"]
I have tried solving in this below way.
But I want to implement trie approach to solve this problem. some one please help here?
https://www.geeksforgeeks.org/trie-insert-and-search/
Any linear solution to solve
def prefix(business_names, searchTerm):
split = [i.split() for i in business_names]
ans = []
for name in split:
for i in range(len(name)):
query = ' '.join(name[i:])
if query.startswith(searchTerm):
ans.append(name)
break
return [' '.join(i) for i in ans]

I don't know what means trie approach and if this is what you need but I would write it simpler - without join, range, len
To make sure I use also lower()
business_names = ["burger king", "McDonald's", "super duper burger's", "subway", "pizza hut"]
searchTerm = "bur"
def prefix(business_names, searchTerm):
searchTerm = searchTerm.lower()
results = []
for name in business_names:
for word in name.split(' '):
word = word.lower()
if word.startswith(searchTerm):
results.append(name)
break
return results
print(prefix(business_names, searchTerm))
EDIT:
I took code from link and create this code.
But I had to change two things.
it works only with letters a-z so I have to remove ' and convert to lower()
it search only full words but after removing and pCrawl.isEndOfWord in return pCrawl != None and pCrawl.isEndOfWord it seems it find words which stats with searchTerm
But I have one doubt: maybe it can search better then O(n^2) but first it has to build Trie and it also need some time. So it can be useful when you always search in the same text and you have to build Trie only once. But you have to build separated Trie for every business name - and it doesn't have to be faster.
.
class TrieNode:
# Trie node class
def __init__(self):
self.children = [None]*26
# isEndOfWord is True if node represent the end of the word
self.isEndOfWord = False
class Trie:
# Trie data structure class
def __init__(self):
self.root = self.getNode()
def getNode(self):
# Returns new trie node (initialized to NULLs)
return TrieNode()
def _charToIndex(self,ch):
# private helper function
# Converts key current character into index
# use only 'a' through 'z' and lower case
return ord(ch)-ord('a')
def insert(self, key):
# If not present, inserts key into trie
# If the key is prefix of trie node,
# just marks leaf node
pCrawl = self.root
length = len(key)
for level in range(length):
index = self._charToIndex(key[level])
# if current character is not present
if not pCrawl.children[index]:
pCrawl.children[index] = self.getNode()
pCrawl = pCrawl.children[index]
# mark last node as leaf
pCrawl.isEndOfWord = True
def search(self, key):
# Search key in the trie
# Returns true if key presents
# in trie, else false
pCrawl = self.root
length = len(key)
for level in range(length):
index = self._charToIndex(key[level])
if not pCrawl.children[index]:
return False
pCrawl = pCrawl.children[index]
return pCrawl != None #and pCrawl.isEndOfWord # <-- check `isEndOfWord` to search full words
# driver function
def prefix(business_names, searchTerm):
searchTerm = searchTerm.lower()
results = []
for name in business_names:
# Input keys (use only 'a' through 'z' and lower case)
# remove `'` and convert to list with lower case words
keys = name.lower().replace("'", "").split(" ")
#print('keys:', keys)
# Trie object
t = Trie()
# Construct trie
for key in keys:
#print('key:', key)
t.insert(key)
# Search in trie
if t.search(searchTerm) is True:
results.append(name)
return results
if __name__ == '__main__':
business_names = ["burger king", "McDonald's", "super duper burger's", "subway", "pizza hut"]
searchTerm = "bur"
results = prefix(business_names, searchTerm)
print( results )

Fastest way to store strings then find all stored strings that start with substring [duplicate]

I'm interested in tries and DAWGs (direct acyclic word graph) and I've been reading a lot about them but I don't understand what should the output trie or DAWG file look like.
Should a trie be an object of nested dictionaries? Where each letter is divided in to letters and so on?
Would a lookup performed on such a dictionary be fast if there are 100k or 500k entries?
How to implement word-blocks consisting of more than one word separated with - or space?
How to link prefix or suffix of a word to another part in the structure? (for DAWG)
I want to understand the best output structure in order to figure out how to create and use one.
I would also appreciate what should be the output of a DAWG along with trie.
I do not want to see graphical representations with bubbles linked to each other, I want to know the output object once a set of words are turned into tries or DAWGs.

Unwind is essentially correct that there are many different ways to implement a trie; and for a large, scalable trie, nested dictionaries might become cumbersome -- or at least space inefficient. But since you're just getting started, I think that's the easiest approach; you could code up a simple trie in just a few lines. First, a function to construct the trie:
>>> _end = '_end_'
>>>
>>> def make_trie(*words):
... root = dict()
... for word in words:
... current_dict = root
... for letter in word:
... current_dict = current_dict.setdefault(letter, {})
... current_dict[_end] = _end
... return root
...
>>> make_trie('foo', 'bar', 'baz', 'barz')
{'b': {'a': {'r': {'_end_': '_end_', 'z': {'_end_': '_end_'}},
'z': {'_end_': '_end_'}}},
'f': {'o': {'o': {'_end_': '_end_'}}}}
If you're not familiar with setdefault, it simply looks up a key in the dictionary (here, letter or _end). If the key is present, it returns the associated value; if not, it assigns a default value to that key and returns the value ({} or _end). (It's like a version of get that also updates the dictionary.)
Next, a function to test whether the word is in the trie:
>>> def in_trie(trie, word):
... current_dict = trie
... for letter in word:
... if letter not in current_dict:
... return False
... current_dict = current_dict[letter]
... return _end in current_dict
...
>>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'baz')
True
>>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'barz')
True
>>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'barzz')
False
>>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'bart')
False
>>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'ba')
False
I'll leave insertion and removal to you as an exercise.
Of course, Unwind's suggestion wouldn't be much harder. There might be a slight speed disadvantage in that finding the correct sub-node would require a linear search. But the search would be limited to the number of possible characters -- 27 if we include _end. Also, there's nothing to be gained by creating a massive list of nodes and accessing them by index as he suggests; you might as well just nest the lists.
Finally, I'll add that creating a directed acyclic word graph (DAWG) would be a bit more complex, because you have to detect situations in which your current word shares a suffix with another word in the structure. In fact, this can get rather complex, depending on how you want to structure the DAWG! You may have to learn some stuff about Levenshtein distance to get it right.

Here is a list of python packages that implement Trie:
marisa-trie - a C++ based implementation.
python-trie - a simple pure python implementation.
PyTrie - a more advanced pure python implementation.
pygtrie - a pure python implementation by Google.
datrie - a double array trie implementation based on libdatrie.

Have a look at this:
https://github.com/kmike/marisa-trie
Static memory-efficient Trie structures for Python (2.x and 3.x).
String data in a MARISA-trie may take up to 50x-100x less memory than
in a standard Python dict; the raw lookup speed is comparable; trie
also provides fast advanced methods like prefix search.
Based on marisa-trie C++ library.
Here's a blog post from a company using marisa trie successfully:
https://www.repustate.com/blog/sharing-large-data-structure-across-processes-python/
At Repustate, much of our data models we use in our text analysis can be represented as simple key-value pairs, or dictionaries in Python lingo. In our particular case, our dictionaries are massive, a few hundred MB each, and they need to be accessed constantly. In fact for a given HTTP request, 4 or 5 models might be accessed, each doing 20-30 lookups. So the problem we face is how do we keep things fast for the client as well as light as possible for the server.
...
I found this package, marisa tries, which is a Python wrapper around a C++ implementation of a marisa trie. “Marisa” is an acronym for Matching Algorithm with Recursively Implemented StorAge. What’s great about marisa tries is the storage mechanism really shrinks how much memory you need. The author of the Python plugin claimed 50-100X reduction in size – our experience is similar.
What’s great about the marisa trie package is that the underlying trie structure can be written to disk and then read in via a memory mapped object. With a memory mapped marisa trie, all of our requirements are now met. Our server’s memory usage went down dramatically, by about 40%, and our performance was unchanged from when we used Python’s dictionary implementation.
There are also a couple of pure-python implementations, though unless you're on a restricted platform you'd want to use the C++ backed implementation above for best performance:
https://github.com/bdimmick/python-trie
https://pypi.python.org/pypi/PyTrie

Modified from senderle's method (above). I found that Python's defaultdict is ideal for creating a trie or a prefix tree.
from collections import defaultdict
class Trie:
"""
Implement a trie with insert, search, and startsWith methods.
"""
def __init__(self):
self.root = defaultdict()
# #param {string} word
# #return {void}
# Inserts a word into the trie.
def insert(self, word):
current = self.root
for letter in word:
current = current.setdefault(letter, {})
current.setdefault("_end")
# #param {string} word
# #return {boolean}
# Returns if the word is in the trie.
def search(self, word):
current = self.root
for letter in word:
if letter not in current:
return False
current = current[letter]
if "_end" in current:
return True
return False
# #param {string} prefix
# #return {boolean}
# Returns if there is any word in the trie
# that starts with the given prefix.
def startsWith(self, prefix):
current = self.root
for letter in prefix:
if letter not in current:
return False
current = current[letter]
return True
# Now test the class
test = Trie()
test.insert('helloworld')
test.insert('ilikeapple')
test.insert('helloz')
print test.search('hello')
print test.startsWith('hello')
print test.search('ilikeapple')

There's no "should"; it's up to you. Various implementations will have different performance characteristics, take various amounts of time to implement, understand, and get right. This is typical for software development as a whole, in my opinion.
I would probably first try having a global list of all trie nodes so far created, and representing the child-pointers in each node as a list of indices into the global list. Having a dictionary just to represent the child linking feels too heavy-weight, to me.

Using defaultdict and reduce function.
Create Trie
from functools import reduce
from collections import defaultdict
T = lambda : defaultdict(T)
trie = T()
reduce(dict.__getitem__,'how',trie)['isEnd'] = True
Trie :
defaultdict(<function __main__.<lambda>()>,
{'h': defaultdict(<function __main__.<lambda>()>,
{'o': defaultdict(<function __main__.<lambda>()>,
{'w': defaultdict(<function __main__.<lambda>()>,
{'isEnd': True})})})})
Search In Trie :
curr = trie
for w in 'how':
if w in curr:
curr = curr[w]
else:
print("Not Found")
break
if curr['isEnd']:
print('Found')

from collections import defaultdict
Define Trie:
_trie = lambda: defaultdict(_trie)
Create Trie:
trie = _trie()
for s in ["cat", "bat", "rat", "cam"]:
curr = trie
for c in s:
curr = curr[c]
curr.setdefault("_end")
Lookup:
def word_exist(trie, word):
curr = trie
for w in word:
if w not in curr:
return False
curr = curr[w]
return '_end' in curr
Test:
print(word_exist(trie, 'cam'))

Here is full code using a TrieNode class. Also implemented auto_complete method to return the matching words with a prefix.
Since we are using dictionary to store children, there is no need to convert char to integer and vice versa and don't need to allocate array memory in advance.
class TrieNode:
def __init__(self):
#Dict: Key = letter, Item = TrieNode
self.children = {}
self.end = False
class Trie:
def __init__(self):
self.root = TrieNode()
def build_trie(self,words):
for word in words:
self.insert(word)
def insert(self,word):
node = self.root
for char in word:
if char not in node.children:
node.children[char] = TrieNode()
node = node.children[char]
node.end = True
def search(self, word):
node = self.root
for char in word:
if char in node.children:
node = node.children[char]
else:
return False
return node.end
def _walk_trie(self, node, word, word_list):
if node.children:
for char in node.children:
word_new = word + char
if node.children[char].end:
# if node.end:
word_list.append( word_new)
# word_list.append( word)
self._walk_trie(node.children[char], word_new , word_list)
def auto_complete(self, partial_word):
node = self.root
word_list = [ ]
#find the node for last char of word
for char in partial_word:
if char in node.children:
node = node.children[char]
else:
# partial_word not found return
return word_list
if node.end:
word_list.append(partial_word)
# word_list will be created in this method for suggestions that start with partial_word
self._walk_trie(node, partial_word, word_list)
return word_list
create a Trie
t = Trie()
words = ['hi', 'hieght', 'rat', 'ram', 'rattle', 'hill']
t.build_trie(words)
Search for word
words = ['hi', 'hello']
for word in words:
print(word, t.search(word))
hi True
hel False
search for words using prefix
partial_word = 'ra'
t.auto_complete(partial_word)
['rat', 'rattle', 'ram']

If you want a TRIE implemented as a Python class, here is something I wrote after reading about them:
class Trie:
def __init__(self):
self.__final = False
self.__nodes = {}
def __repr__(self):
return 'Trie<len={}, final={}>'.format(len(self), self.__final)
def __getstate__(self):
return self.__final, self.__nodes
def __setstate__(self, state):
self.__final, self.__nodes = state
def __len__(self):
return len(self.__nodes)
def __bool__(self):
return self.__final
def __contains__(self, array):
try:
return self[array]
except KeyError:
return False
def __iter__(self):
yield self
for node in self.__nodes.values():
yield from node
def __getitem__(self, array):
return self.__get(array, False)
def create(self, array):
self.__get(array, True).__final = True
def read(self):
yield from self.__read([])
def update(self, array):
self[array].__final = True
def delete(self, array):
self[array].__final = False
def prune(self):
for key, value in tuple(self.__nodes.items()):
if not value.prune():
del self.__nodes[key]
if not len(self):
self.delete([])
return self
def __get(self, array, create):
if array:
head, *tail = array
if create and head not in self.__nodes:
self.__nodes[head] = Trie()
return self.__nodes[head].__get(tail, create)
return self
def __read(self, name):
if self.__final:
yield name
for key, value in self.__nodes.items():
yield from value.__read(name + [key])

This version is using recursion
import pprint
from collections import deque
pp = pprint.PrettyPrinter(indent=4)
inp = raw_input("Enter a sentence to show as trie\n")
words = inp.split(" ")
trie = {}
def trie_recursion(trie_ds, word):
try:
letter = word.popleft()
out = trie_recursion(trie_ds.get(letter, {}), word)
except IndexError:
# End of the word
return {}
# Dont update if letter already present
if not trie_ds.has_key(letter):
trie_ds[letter] = out
return trie_ds
for word in words:
# Go through each word
trie = trie_recursion(trie, deque(word))
pprint.pprint(trie)
Output:
Coool👾 <algos>🚸 python trie.py
Enter a sentence to show as trie
foo bar baz fun
{
'b': {
'a': {
'r': {},
'z': {}
}
},
'f': {
'o': {
'o': {}
},
'u': {
'n': {}
}
}
}

This is much like a previous answer but simpler to read:
def make_trie(words):
trie = {}
for word in words:
head = trie
for char in word:
if char not in head:
head[char] = {}
head = head[char]
head["_end_"] = "_end_"
return trie

class TrieNode:
def __init__(self):
self.keys = {}
self.end = False
class Trie:
def __init__(self):
self.root = TrieNode()
def insert(self, word: str, node=None) -> None:
if node == None:
node = self.root
# insertion is a recursive operation
# this is base case to exit the recursion
if len(word) == 0:
node.end = True
return
# if this key does not exist create a new node
elif word[0] not in node.keys:
node.keys[word[0]] = TrieNode()
self.insert(word[1:], node.keys[word[0]])
# that means key exists
else:
self.insert(word[1:], node.keys[word[0]])
def search(self, word: str, node=None) -> bool:
if node == None:
node = self.root
# this is positive base case to exit the recursion
if len(word) == 0 and node.end == True:
return True
elif len(word) == 0:
return False
elif word[0] not in node.keys:
return False
else:
return self.search(word[1:], node.keys[word[0]])
def startsWith(self, prefix: str, node=None) -> bool:
if node == None:
node = self.root
if len(prefix) == 0:
return True
elif prefix[0] not in node.keys:
return False
else:
return self.startsWith(prefix[1:], node.keys[prefix[0]])

class Trie:
head = {}
def add(self,word):
cur = self.head
for ch in word:
if ch not in cur:
cur[ch] = {}
cur = cur[ch]
cur['*'] = True
def search(self,word):
cur = self.head
for ch in word:
if ch not in cur:
return False
cur = cur[ch]
if '*' in cur:
return True
else:
return False
def printf(self):
print (self.head)
dictionary = Trie()
dictionary.add("hi")
#dictionary.add("hello")
#dictionary.add("eye")
#dictionary.add("hey")
print(dictionary.search("hi"))
print(dictionary.search("hello"))
print(dictionary.search("hel"))
print(dictionary.search("he"))
dictionary.printf()
Out
True
False
False
False
{'h': {'i': {'*': True}}}

Python Class for Trie
Trie Data Structure can be used to store data in O(L) where L is the length of the string so for inserting N strings time complexity would be O(NL) the string can be searched in O(L) only same goes for deletion.
Can be clone from https://github.com/Parikshit22/pytrie.git
class Node:
def __init__(self):
self.children = [None]*26
self.isend = False
class trie:
def __init__(self,):
self.__root = Node()
def __len__(self,):
return len(self.search_byprefix(''))
def __str__(self):
ll = self.search_byprefix('')
string = ''
for i in ll:
string+=i
string+='\n'
return string
def chartoint(self,character):
return ord(character)-ord('a')
def remove(self,string):
ptr = self.__root
length = len(string)
for idx in range(length):
i = self.chartoint(string[idx])
if ptr.children[i] is not None:
ptr = ptr.children[i]
else:
raise ValueError("Keyword doesn't exist in trie")
if ptr.isend is not True:
raise ValueError("Keyword doesn't exist in trie")
ptr.isend = False
return
def insert(self,string):
ptr = self.__root
length = len(string)
for idx in range(length):
i = self.chartoint(string[idx])
if ptr.children[i] is not None:
ptr = ptr.children[i]
else:
ptr.children[i] = Node()
ptr = ptr.children[i]
ptr.isend = True
def search(self,string):
ptr = self.__root
length = len(string)
for idx in range(length):
i = self.chartoint(string[idx])
if ptr.children[i] is not None:
ptr = ptr.children[i]
else:
return False
if ptr.isend is not True:
return False
return True
def __getall(self,ptr,key,key_list):
if ptr is None:
key_list.append(key)
return
if ptr.isend==True:
key_list.append(key)
for i in range(26):
if ptr.children[i] is not None:
self.__getall(ptr.children[i],key+chr(ord('a')+i),key_list)
def search_byprefix(self,key):
ptr = self.__root
key_list = []
length = len(key)
for idx in range(length):
i = self.chartoint(key[idx])
if ptr.children[i] is not None:
ptr = ptr.children[i]
else:
return None
self.__getall(ptr,key,key_list)
return key_list
t = trie()
t.insert("shubham")
t.insert("shubhi")
t.insert("minhaj")
t.insert("parikshit")
t.insert("pari")
t.insert("shubh")
t.insert("minakshi")
print(t.search("minhaj"))
print(t.search("shubhk"))
print(t.search_byprefix('m'))
print(len(t))
print(t.remove("minhaj"))
print(t)
Code Oputpt
True
False
['minakshi', 'minhaj']
7
minakshi
minhajsir
pari
parikshit
shubh
shubham
shubhi

With prefix search
Here is #senderle's answer, slightly modified to accept prefix search (and not only whole-word matching):
_end = '_end_'
def make_trie(words):
root = dict()
for word in words:
current_dict = root
for letter in word:
current_dict = current_dict.setdefault(letter, {})
current_dict[_end] = _end
return root
def in_trie(trie, word):
current_dict = trie
for letter in word:
if _end in current_dict:
return True
if letter not in current_dict:
return False
current_dict = current_dict[letter]
t = make_trie(['hello', 'hi', 'foo', 'bar'])
print(in_trie(t, 'hello world'))
# True

In response to #basj
The following code will capture \b (end of word) letters.
_end = '_end_'
def make_trie(words):
root = dict()
for word in words:
current_dict = root
for letter in word:
current_dict = current_dict.setdefault(letter, {})
current_dict[_end] = _end
return root
def in_trie(trie, word):
current_dict = trie
for letter in word:
if letter not in current_dict: # Adjusted the
return False # order of letter
if _end in current_dict[letter]: # checks to capture
return True # the last letter.
current_dict = current_dict[letter]
t = make_trie(['hello', 'hi', 'foo', 'bar'])
>>> print(in_trie(t, 'hi'))
True
>>> print(in_trie(t, 'hola'))
False
>>> print(in_trie(t, 'hello friend'))
True
>>> print(in_trie(t, 'hel'))
None

Python: Create a Binary search Tree using a list

The objective of my code is to get each seperate word from a txt file and put it into a list and then making a binary search tree using that list to count the frequency of each word and printing each word in alphabetical order along with its frequency. Each word in the can only contain letters, numbers, -, or ' The part that I am unable to do with my beginner programming knowledge is to make the Binary Search Tree using the list I have (I am only able to insert the whole list in one Node instead of putting each word in a Node to make the tree). The code I have so far is this:
def read_words(filename):
openfile = open(filename, "r")
templist = []
letterslist = []
for lines in openfile:
for i in lines:
ii = i.lower()
letterslist.append(ii)
for p in letterslist:
if p not in ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',"'","-",' '] and p.isdigit() == False:
letterslist.remove(p)
wordslist = list("".join(letterslist).split())
return wordslist
class BinaryTree:
class _Node:
def __init__(self, value, left=None, right=None):
self._left = left
self._right = right
self._value = value
self._count = 1
def __init__(self):
self.root = None
def isEmpty(self):
return self.root == None
def insert(self, value) :
if self.isEmpty() :
self.root = self._Node(value)
return
parent = None
pointer = self.root
while (pointer != None) :
if value == pointer._value:
pointer._count += 1
return
elif value < pointer._value:
parent = pointer
pointer = pointer._left
else :
parent = pointer
pointer = pointer._right
if (value <= parent._value) :
parent._left = self._Node(value)
else :
parent._right = self._Node(value)
def printTree(self):
pointer = self.root
if pointer._left is not None:
pointer._left.printTree()
print(str(pointer._value) + " " + str(pointer._count))
if pointer._right is not None:
pointer._right.printTree()
def createTree(self,words):
if len(words) > 0:
for word in words:
BinaryTree().insert(word)
return BinaryTree()
else:
return None
def search(self,tree, word):
node = tree
depth = 0
count = 0
while True:
print(node.value)
depth += 1
if node.value == word:
count = node.count
break
elif word < node.value:
node = node.left
elif word > node.value:
node = node.right
return depth, count
def main():
words = read_words('sample.txt')
b = BinaryTree()
b.insert(words)
b.createTree(words)
b.printTree()

Since you're a beginner I'd advice to implement the tree methods with recursion instead of iteration since this will result to simpler implementation. While recursion might seem a bit difficult concept at first often it is the easiest approach.
Here's a draft implementation of a binary tree which uses recursion for insertion, searching and printing the tree, it should support the functionality you need.
class Node(object):
def __init__(self, value):
self.value = value
self.left = None
self.right = None
self.count = 1
def __str__(self):
return 'value: {0}, count: {1}'.format(self.value, self.count)
def insert(root, value):
if not root:
return Node(value)
elif root.value == value:
root.count += 1
elif value < root.value:
root.left = insert(root.left, value)
else:
root.right = insert(root.right, value)
return root
def create(seq):
root = None
for word in seq:
root = insert(root, word)
return root
def search(root, word, depth=1):
if not root:
return 0, 0
elif root.value == word:
return depth, root.count
elif word < root.value:
return search(root.left, word, depth + 1)
else:
return search(root.right, word, depth + 1)
def print_tree(root):
if root:
print_tree(root.left)
print root
print_tree(root.right)
src = ['foo', 'bar', 'foobar', 'bar', 'barfoo']
tree = create(src)
print_tree(tree)
for word in src:
print 'search {0}, result: {1}'.format(word, search(tree, word))
# Output
# value: bar, count: 2
# value: barfoo, count: 1
# value: foo, count: 1
# value: foobar, count: 1
# search foo, result: (1, 1)
# search bar, result: (2, 2)
# search foobar, result: (2, 1)
# search bar, result: (2, 2)
# search barfoo, result: (3, 1)

To answer your direct question, the reason why you are placing all of the words into a single node is because of the following statement inside of main():
b.insert(words)
The insert function creates a Node and sets the value of the node to the item you pass in. Instead, you need to create a node for each item in the list which is what your createTree() function does. The preceeding b.insert is not necessary.
Removing that line makes your tree become correctly formed, but reveals a fundamental problem with the design of your data structure, namely the printTree() method. This method seems designed to traverse the tree and recursively call itself on any child. In your initial version this function worked, because there the tree was mal-formed with only a single node of the whole list (and the print function simply printed that value since right and left were empty).
However with a correctly formed tree the printTree() function now tries to invoke itself on the left and right descendants. The descendants however are of type _Node, not of type BinaryTree, and there is no methodprintTree() declared for _Node objects.
You can salvage your code and solve this new error in one of two ways. First you can implement your BinaryTree.printTree() function as _Node.printTree(). You can't do a straight copy and paste, but the logic of the function won't have to change much. Or you could leave the method where it is at, but wrap each _left or _right node inside of a new BinaryTree so that they would have the necessary printTree() method. Doing this would leave the method where it is at, but you will still have to implement some kind of helper traversal method inside of _Node.
Finally, you could change all of your _Node objects to be _BinaryTree objects instead.
The semantic difference between a node and a tree is one of scope. A node should only be aware of itself, its direct children (left and right), and possibly its parent. A tree on the other hand can be aware of any of its descendents, no matter how far removed. This is accomplished by treating any child node as its own tree. Even a leaf, without any children at all can be thought of as a tree with a depth of 0. This behavior is what lets a tree work recursively. Your code is mixing the two together.

Creating a tree/deeply nested dict from an indented text file in python

Basically, I want to iterate through a file and put the contents of each line into a deeply nested dict, the structure of which is defined by the amount of whitespace at the start of each line.
Essentially the aim is to take something like this:
a
b
c
d
e
And turn it into something like this:
{"a":{"b":"c","d":"e"}}
Or this:
apple
colours
red
yellow
green
type
granny smith
price
0.10
into this:
{"apple":{"colours":["red","yellow","green"],"type":"granny smith","price":0.10}
So that I can send it to Python's JSON module and make some JSON.
At the moment I'm trying to make a dict and a list in steps like such:
{"a":""} ["a"]
{"a":"b"} ["a"]
{"a":{"b":"c"}} ["a","b"]
{"a":{"b":{"c":"d"}}}} ["a","b","c"]
{"a":{"b":{"c":"d"},"e":""}} ["a","e"]
{"a":{"b":{"c":"d"},"e":"f"}} ["a","e"]
{"a":{"b":{"c":"d"},"e":{"f":"g"}}} ["a","e","f"]
etc.
The list acts like 'breadcrumbs' showing where I last put in a dict.
To do this I need a way to iterate through the list and generate something like dict["a"]["e"]["f"] to get at that last dict. I've had a look at the AutoVivification class that someone has made which looks very useful however I'm really unsure of:
Whether I'm using the right data structure for this (I'm planning to send it to the JSON library to create a JSON object)
How to use AutoVivification in this instance
Whether there's a better way in general to approach this problem.
I came up with the following function but it doesn't work:
def get_nested(dict,array,i):
if i != None:
i += 1
if array[i] in dict:
return get_nested(dict[array[i]],array)
else:
return dict
else:
i = 0
return get_nested(dict[array[i]],array)
Would appreciate help!
(The rest of my extremely incomplete code is here:)
#Import relevant libraries
import codecs
import sys
#Functions
def stripped(str):
if tab_spaced:
return str.lstrip('\t').rstrip('\n\r')
else:
return str.lstrip().rstrip('\n\r')
def current_ws():
if whitespacing == 0 or not tab_spaced:
return len(line) - len(line.lstrip())
if tab_spaced:
return len(line) - len(line.lstrip('\t\n\r'))
def get_nested(adict,anarray,i):
if i != None:
i += 1
if anarray[i] in adict:
return get_nested(adict[anarray[i]],anarray)
else:
return adict
else:
i = 0
return get_nested(adict[anarray[i]],anarray)
#initialise variables
jsondict = {}
unclosed_tags = []
debug = []
vividfilename = 'simple.vivid'
# vividfilename = sys.argv[1]
if len(sys.argv)>2:
jsfilename = sys.argv[2]
else:
jsfilename = vividfilename.split('.')[0] + '.json'
whitespacing = 0
whitespace_array = [0,0]
tab_spaced = False
#open the file
with codecs.open(vividfilename,'rU', "utf-8-sig") as vividfile:
for line in vividfile:
#work out how many whitespaces at start
whitespace_array.append(current_ws())
#For first line with whitespace, work out the whitespacing (eg tab vs 4-space)
if whitespacing == 0 and whitespace_array[-1] > 0:
whitespacing = whitespace_array[-1]
if line[0] == '\t':
tab_spaced = True
#strip out whitespace at start and end
stripped_line = stripped(line)
if whitespace_array[-1] == 0:
jsondict[stripped_line] = ""
unclosed_tags.append(stripped_line)
if whitespace_array[-2] < whitespace_array[-1]:
oldnested = get_nested(jsondict,whitespace_array,None)
print oldnested
# jsondict.pop(unclosed_tags[-1])
# jsondict[unclosed_tags[-1]]={stripped_line:""}
# unclosed_tags.append(stripped_line)
print jsondict
print unclosed_tags
print jsondict
print unclosed_tags

Here is an object oriented approach based on a composite structure of nested Node objects.
Input:
indented_text = \
"""
apple
colours
red
yellow
green
type
granny smith
price
0.10
"""
a Node class
class Node:
def __init__(self, indented_line):
self.children = []
self.level = len(indented_line) - len(indented_line.lstrip())
self.text = indented_line.strip()
def add_children(self, nodes):
childlevel = nodes[0].level
while nodes:
node = nodes.pop(0)
if node.level == childlevel: # add node as a child
self.children.append(node)
elif node.level > childlevel: # add nodes as grandchildren of the last child
nodes.insert(0,node)
self.children[-1].add_children(nodes)
elif node.level <= self.level: # this node is a sibling, no more children
nodes.insert(0,node)
return
def as_dict(self):
if len(self.children) > 1:
return {self.text: [node.as_dict() for node in self.children]}
elif len(self.children) == 1:
return {self.text: self.children[0].as_dict()}
else:
return self.text
To parse the text, first create a root node.
Then, remove empty lines from the text, and create a Node instance for every line, pass this to the add_children method of the root node.
root = Node('root')
root.add_children([Node(line) for line in indented_text.splitlines() if line.strip()])
d = root.as_dict()['root']
print(d)
result:
{'apple': [
{'colours': ['red', 'yellow', 'green']},
{'type': 'granny smith'},
{'price': '0.10'}]
}
I think that it should be possible to do it in one step, where you simply call the constructor of Node once, with the indented text as an argument.

Here is a recursive solution. First, transform the input in the following way.
Input:
person:
address:
street1: 123 Bar St
street2:
city: Madison
state: WI
zip: 55555
web:
email: boo#baz.com
First-step output:
[{'name':'person','value':'','level':0},
{'name':'address','value':'','level':1},
{'name':'street1','value':'123 Bar St','level':2},
{'name':'street2','value':'','level':2},
{'name':'city','value':'Madison','level':2},
{'name':'state','value':'WI','level':2},
{'name':'zip','value':55555,'level':2},
{'name':'web','value':'','level':1},
{'name':'email','value':'boo#baz.com','level':2}]
This is easy to accomplish with split(':') and by counting the number of leading tabs:
def tab_level(astr):
"""Count number of leading tabs in a string
"""
return len(astr)- len(astr.lstrip('\t'))
Then feed the first-step output into the following function:
def ttree_to_json(ttree,level=0):
result = {}
for i in range(0,len(ttree)):
cn = ttree[i]
try:
nn = ttree[i+1]
except:
nn = {'level':-1}
# Edge cases
if cn['level']>level:
continue
if cn['level']<level:
return result
# Recursion
if nn['level']==level:
dict_insert_or_append(result,cn['name'],cn['value'])
elif nn['level']>level:
rr = ttree_to_json(ttree[i+1:], level=nn['level'])
dict_insert_or_append(result,cn['name'],rr)
else:
dict_insert_or_append(result,cn['name'],cn['value'])
return result
return result
where:
def dict_insert_or_append(adict,key,val):
"""Insert a value in dict at key if one does not exist
Otherwise, convert value to list and append
"""
if key in adict:
if type(adict[key]) != list:
adict[key] = [adict[key]]
adict[key].append(val)
else:
adict[key] = val

The following code will take a block-indented file and convert into an XML tree; this:
foo
bar
baz
ban
bal
...becomes:
<cmd>foo</cmd>
<cmd>bar</cmd>
<block>
<name>baz</name>
<cmd>ban</cmd>
<cmd>bal</cmd>
</block>
The basic technique is:
Set indent to 0
For each line, get the indent
If > current, step down and save current block/ident on a stack
If == current, append to current block
If < current, pop from the stack until you get to the matching indent
So:
from lxml import builder
C = builder.ElementMaker()
def indent(line):
strip = line.lstrip()
return len(line) - len(strip), strip
def parse_blockcfg(data):
top = current_block = C.config()
stack = []
current_indent = 0
lines = data.split('\n')
while lines:
line = lines.pop(0)
i, line = indent(line)
if i==current_indent:
pass
elif i > current_indent:
# we've gone down a level, convert the <cmd> to a block
# and then save the current ident and block to the stack
prev.tag = 'block'
prev.append(C.name(prev.text))
prev.text = None
stack.insert(0, (current_indent, current_block))
current_indent = i
current_block = prev
elif i < current_indent:
# we've gone up one or more levels, pop the stack
# until we find out which level and return to it
found = False
while stack:
parent_indent, parent_block = stack.pop(0)
if parent_indent==i:
found = True
break
if not found:
raise Exception('indent not found in parent stack')
current_indent = i
current_block = parent_block
prev = C.cmd(line)
current_block.append(prev)
return top

First of all, don't use array and dict as variable names because they're reserved words in Python and reusing them may end up in all sorts of chaos.
OK so if I get you correctly, you have a tree given in a text file, with parenthood indicated by indentations, and you want to recover the actual tree structure. Right?
Does the following look like a valid outline? Because I have trouble putting your current code into context.
result = {}
last_indentation = 0
for l in f.xreadlines():
(c, i) = parse(l) # create parse to return character and indentation
if i==last_indentation:
# sibling to last
elif i>last_indentation:
# child to last
else:
# end of children, back to a higher level
OK then your list are the current parents, that's in fact right - but I'd keep them pointed to the dictionary you've created, not the literal letter
just starting some stuff here
result = {}
parents = {}
last_indentation = 1 # start with 1 so 0 is the root of tree
parents[0] = result
for l in f.xreadlines():
(c, i) = parse(l) # create parse to return character and indentation
if i==last_indentation:
new_el = {}
parents[i-1][c] = new_el
parents[i] = new_el
elif i>last_indentation:
# child to last
else:
# end of children, back to a higher level

How to implement the remove function of a trie in python?

I've read the following implementation of the trie in python:
https://stackoverflow.com/a/11016430/2225221
and tried to make the remove fnction for it.
Basically, I had problems even with the start: If you want to remove a word from a trie, it can has sub-"words", or it can be "subword" of another word.
If you remove with "del dict[key]", you are removing these above mentioned two kinds of words also.
Could anyone help me in this, how to remove properly the chosen word (let us presume it's in the trie)

Basically, to remove a word from the trie (as it is implemented in the answer you linked to), you'd just have to remove its _end marker, for example like this:
def remove_word(trie, word):
current_dict = trie
for letter in word:
current_dict = current_dict.get(letter, None)
if current_dict is None:
# the trie doesn't contain this word.
break
else:
del current_dict[_end]
Note however that this doesn't ensure that the trie has its minimal size. After deleting the word, there may be branches in the trie left that are no longer used by any words. This doesn't affect the correctness of the data structure, it just means that the trie may consume more memory than absolutely necessary. You could improve this by iterating backwards from the leaf node and delete branches until you find one that has more than one child.
EDIT: Here's an idea how you could implement a remove function that also culls any unnecessary branches. There's probably a more efficient way to do it, but this might get you started:
def remove_word2(trie, word):
current_dict = trie
path = [current_dict]
for letter in word:
current_dict = current_dict.get(letter, None)
path.append(current_dict)
if current_dict is None:
# the trie doesn't contain this word.
break
else:
if not path[-1].get(_end, None):
# the trie doesn't contain this word (but a prefix of it).
return
deleted_branches = []
for current_dict, letter in zip(reversed(path[:-1]), reversed(word)):
if len(current_dict[letter]) <= 1:
deleted_branches.append((current_dict, letter))
else:
break
if len(deleted_branches) > 0:
del deleted_branches[-1][0][deleted_branches[-1][1]]
del path[-1][_end]
Essentially, it first finds the "path" to the word that is about to be deleted and then iterates through that backwards to find nodes that can be removed. It then removes the root of the path that can be deleted (which also implicitly deletes the _end node).

I think it is better to do it recursively, code as following:
def remove(self, word):
self.delete(self.tries, word, 0)
def delete(self, dicts, word, i):
if i == len(word):
if 'end' in dicts:
del dicts['end']
if len(dicts) == 0:
return True
else:
return False
else:
return False
else:
if word[i] in dicts and self.delete(dicts[word[i]], word, i + 1):
if len(dicts[word[i]]) == 0:
del dicts[word[i]]
return True
else:
return False
else:
return False

def remove_a_word_util(self, word, idx, node):
if len(word) == idx:
node.is_end_of_word = False
return bool(node.children)
ch = word[idx]
if ch not in node.children:
return True
flag = self.remove_a_word_util(word, idx+1, node.children[ch])
if flag:
return True
node.children.pop(ch)
return bool(node.children) or node.is_end_of_word

One method of handling structures like this is through recursion. The great thing about recursion in this case is that it zips to the bottom of the trie, then passes the returned values back up through the branches.
The following function does just that. It goes to the leaf and deletes the _end value, just in case the input word is a prefix of another. It then passes up a boolean (boo) which indicates that the current_dict is still in an outlying branch. Once we hit a point where the current dict has more than one child, we delete the appropriate branch and set boo to False so each remaining recursion will do nothing.
def trie_trim(term, trie=SYNONYMS, prev=0):
# checks that we haven't hit the end of the word
if term:
first, rest = term[0], term[1:]
current_length = len(trie)
next_length, boo = trie_trim(rest, trie=trie[first], prev=current_length)
# this statement avoids trimming excessively if the input is a prefix because
# if the word is a prefix, the first returned value will be greater than 1
if boo and next_length > 1:
boo = False
# this statement checks for the first occurrence of the current dict having more than one child
# or it checks that we've hit the bottom without trimming anything
elif boo and (current_length > 1 or not prev):
del trie[first]
boo = False
return current_length, boo
# when we do hit the end of the word, delete _end
else:
del trie[_end]
return len(trie) + 1, True

A bit of a long one, but I hope this helps answer your question:
class Trie:
WORD_END = "$"
def __init__(self):
self.trie = {}
def insert(self, word):
cur = self.trie
for char in word:
if char not in cur:
cur[char] = {}
cur = cur[char]
cur[Trie.WORD_END] = word
def delete(self, word):
def _delete(word, cur_trie, i=0):
if i == len(word):
if Trie.WORD_END not in cur_trie:
raise ValueError("'%s' is not registered in the trie..." %word)
cur_trie.pop(Trie.WORD_END)
if len(cur_trie) > 0:
return False
return True
if word[i] not in cur_trie:
raise ValueError("'%s' is not registered in the trie..." %word)
cont = _delete(word, cur_trie[word[i]], i+1)
if cont:
cur_trie.pop(word[i])
if Trie.WORD_END in cur_trie:
return False
return True
return False
_delete(word, self.trie)
t = Trie()
t.insert("bar")
t.insert("baraka")
t.insert("barakalar")
t.delete("barak") # raises error as 'barak' is not a valid WORD_END although it is a valid path.
t.delete("bareka") # raises error as 'e' does not exist in the path.
t.delete("baraka") # deletes the WORD_END of 'baraka' without deleting any letter as there is 'barakalar' afterwards.
t.delete("barakalar") # deletes until the previous word (until the first Trie.WORD_END; "$" - by going backwards with recursion) in the same path (until 'baraka').

In case you need the whole DS:
class TrieNode:
def __init__(self):
self.children = {}
self.wordCounter = 0
self.prefixCounter = 0
class Trie:
def __init__(self):
self.root = TrieNode()
def insert(self, word: str) -> None:
node = self.root
for char in word:
if char not in node.children:
node.children[char] = TrieNode()
node.prefixCounter += 1
node = node.children[char]
node.wordCounter += 1
def countWordsEqualTo(self, word: str) -> int:
node = self.root
if node.children:
for char in word:
node = node.children[char]
else:
return 0
return node.wordCounter
def countWordsStartingWith(self, prefix: str) -> int:
node = self.root
if node.children:
for char in prefix:
node = node.children[char]
else:
return 0
return node.prefixCounter
def erase(self, word: str) -> None:
node = self.root
for char in word:
if node.children:
node.prefixCounter -= 1
node = node.children[char]
else:
return None
node.wordCounter -= 1
if node.wordCounter == 0:
self.dfsRemove(self.root, word, 0)
def dfsRemove(self, node: TrieNode, word: str, idx: int) -> None:
if len(word) == idx:
node.wordCounter = 0
return
char = word[idx]
if char not in node.children:
return
self.dfsRemove(node.children[char], word, idx+1)
node.children.pop(char)
trie = Trie();
trie.insert("apple"); #// Inserts "apple".
trie.insert("apple"); #// Inserts another "apple".
print(trie.countWordsEqualTo("apple")) #// There are two instances of "apple" so return 2.
print(trie.countWordsStartingWith("app")) #// "app" is a prefix of "apple" so return 2.
trie.erase("apple") #// Erases one "apple".
print(trie.countWordsEqualTo("apple")) #// Now there is only one instance of "apple" so return 1.
print(trie.countWordsStartingWith("app")) #// return 1
trie.erase("apple"); #// Erases "apple". Now the trie is empty.
print(trie.countWordsEqualTo("apple")) #// return 0
print(trie.countWordsStartingWith("app")) #// return 0

I would argue that this implementation is the most succinct and easiest to understand after a bit of staring.
def removeWord(word, node=None):
if not node:
node = self.root
if word == "":
node.isEnd = False
return
newnode = node.children[word[0]]
removeWord(word[1:], newnode)
if not newnode.isEnd and len(newnode.children) == 0:
del node.children[word[0]]
Although it's a little tricky to understand with the default parameter node=None at first, this is the most succinct implementation of a Trie removal that handles marking the word node.isEnd = False while also pruning extraneous nodes.
The method is first called as Trie.removeWord("ToBeDeletedWord").
In subsequent recursion calls, a node tied to the corresponding letter ("T" then "o" then "B" then "e" etc. etc.) is added to the next recursion (e.g "remove 'oBeDeletedWord' with the node at T").
Once we hit the end node that has the full string ToBeDeletedWord , the last recursion calls removeWord("", <node d>)
In this last recursion call, we mark node.isEnd = False. Later, the node is no longer marked isEnd and it has no children so we can call the delete operator.
Once that last recursion call ends, the rest of the recursions (e.g TobeDeletedWor, TobeDeletedWo, TobeDeletedW, etc. etc.) will then observe that it too is not an end node and there are no more children. These nodes will also delete.
You will have to read this a couple of times but this implementation is concise, readable, and correct. The difficulty is that the recursion happens midfunction rather than at the beginning or end.

TL;DR
class TrieNode:
children: dict[str, "TrieNode"]
def __init__(self) -> None:
self.children = {}
self.end = False
def __contains__(self, char: str) -> bool:
return char in self.children
def __getitem__(self, __name: str) -> "TrieNode":
return self.children[__name]
def __setitem__(self, __name: str, __value: "TrieNode") -> None:
self.children[__name] = __value
def __len__(self):
return len(self.children)
def __delitem__(self, __name: str):
del self.children[__name]
class Trie:
def __init__(self, words: list[str]) -> None:
self.root = TrieNode()
for w in words:
self.insert(w)
def insert(self, word: str):
curr = self.root
for c in word:
curr = curr.children.setdefault(c, TrieNode())
curr.end = True
def remove(self, word: str):
def _remove(node: TrieNode, index: int):
if index >= len(word):
node.end = False
if not node.children:
return True
elif word[index] in node:
if _remove(node[word[index]], index + 1):
del node[word[index]]
_remove(self.root, 0)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.