Printing Suffix Trees Edges in Python

Printing Suffix Trees Edges in Python - python

I was looking through the code written by Ben Langmead on SuffixTrees. I am having a hard time figuring out how to print all the edges of the suffix tree. What is a way to store them in a set and save it in the object class?
class SuffixTree(object):
class Node(object):
def __init__(self, lab):
self.lab = lab # label on path leading to this node
self.out = {} # outgoing edges; maps characters to nodes
def __init__(self, s):
""" Make suffix tree, without suffix links, from s in quadratic time
and linear space """
suffix=[]
self.suffix=suffix
self.root = self.Node(None)
self.root.out[s[0]] = self.Node(s) # trie for just longest suf
# add the rest of the suffixes, from longest to shortest
for i in xrange(1, len(s)):
# start at root; we’ll walk down as far as we can go
cur = self.root
j = i
while j < len(s):
if s[j] in cur.out:
child = cur.out[s[j]]
lab = child.lab
# Walk along edge until we exhaust edge label or
# until we mismatch
k = j+1
while k-j < len(lab) and s[k] == lab[k-j]:
k += 1
if k-j == len(lab):
cur = child # we exhausted the edge
suffix.append(child.lab)
j = k
else:
# we fell off in middle of edge
cExist, cNew = lab[k-j], s[k]
# create “mid”: new node bisecting edge
mid = self.Node(lab[:k-j])
mid.out[cNew] = self.Node(s[k:])
# original child becomes mid’s child
mid.out[cExist] = child
# original child’s label is curtailed
child.lab = lab[k-j:]
# mid becomes new child of original parent
cur.out[s[j]] = mid
else:
# Fell off tree at a node: make new edge hanging off it
cur.out[s[j]] = self.Node(s[j:])
def followPath(self, s):
""" Follow path given by s. If we fall off tree, return None. If we
finish mid-edge, return (node, offset) where 'node' is child and
'offset' is label offset. If we finish on a node, return (node,
None). """
cur = self.root
i = 0
while i < len(s):
c = s[i]
if c not in cur.out:
return (None, None) # fell off at a node
child = cur.out[s[i]]
lab = child.lab
j = i+1
while j-i < len(lab) and j < len(s) and s[j] == lab[j-i]:
j += 1
if j-i == len(lab):
cur = child # exhausted edge
i = j
elif j == len(s):
return (child, j-i) # exhausted query string in middle of edge
else:
return (None, None) # fell off in the middle of the edge
return (cur, None) # exhausted query string at internal node
def hasSubstring(self, s):
""" Return true iff s appears as a substring """
node, off = self.followPath(s)
return node
def hasSuffix(self, s):
""" Return true iff s is a suffix """
node, off = self.followPath(s)
if node is None:
return False # fell off the tree
if off is None:
# finished on top of a node
return '$' in node.out
else:
# finished at offset 'off' within an edge leading to 'node'
return node.lab[off] == '$'

Related

Sum levels wise binary tree

I must implement a sum for each level in a binary tree
I must return a list that contains at position i, the ith sum
example: if I have this tree
24
/ \
14 27
/ \ / \
11 20 12 55
I must return [24, 41, 98]
I tried to implement this solution in python
def sommaperlivelli(p, lista):
if p == None:
return
if p.left != None and p.right != None:
lista.append(p.left.key + p.right.key)
sommaperlivelli(p.left, lista)
sommaperlivelli(p.right, lista)
return lista
I can get only the first sum and I can't add the root. How I can do it?
this is the class that I use
class NodoABR:
def __init__(self, key = None, left = None, right = None, parent = None):
self.key = key
self.left = left
self.right = right
self.parent = parent
this is how I add node to the tree
def inserisciNodo(p, n):
if p == None:
return NodoABR(n)
else:
if p.key == n:
return p
elif p.key < n:
rchild = inserisciNodo(p.right, n)
p.right = rchild
rchild.parent = p
else:
lchild = inserisciNodo(p.left, n)
p.left = lchild
lchild.parent = p
return p
this is a BinarySearchTree
in main function I do this
p = NodoABR(24)
p = inserisciNodo(p, 14)
p = inserisciNodo(p, 27)
p = inserisciNodo(p, 11)
p = inserisciNodo(p, 20)
p = inserisciNodo(p, 12)
p = inserisciNodo(p, 55)
print(sommaperlivelli(p,[]))

Assuming you have similar Tree Node class, you can try this & modify to suite your particular needs.
from collections import deque
class Node: # TreeNode eg.
def __init__(self, key):
self.data = key
self.left = None
self.right = None
def LevelSum(root):
answer = []
if (root == None): return 0 # base case
result = root.data
# track of # nodes at every level when traversal, and sum them up
q = deque()
q.append(root)
while len(q): # > 0):
size = len(q)
# Iterate for all the nodes
tot = 0
while size: # meaning --- size > 0
# Dequeue an node from queue
tmp = q.popleft()
# Add this node's value to current sum.
tot += tmp.data
# Enqueue left and right children of dequeued node
if (tmp.left != None): q.append(tmp.left)
if (tmp.right != None): q.append(tmp.right)
size -= 1
print(tot, result)
answer.append(tot)
return answer
Example:
if __name__ == '__main__':
root = Node(1)
root.left = Node(2)
root.right = Node(3)
root.left.left = Node(4)
root.left.right = Node(5)
root.right.right = Node(6)
root.right.right.left = Node(9)
root.right.right.right = Node(11)
"""
# Constructed Binary tree is:
# 1
# / \
# 2 3 # 5
# / \ \
# 4 5 6 # 15
# / \
# 9 11 # 20
"""
print("level sum is", LevelSum(root))
[Edit] The PO author wants to learn another approach. (non-queue)
from collections import Counter
from typing import List
class Node:
def __init__(self, key):
self.data = key
self.left = None
self.right = None
def level_sum(root: Node) -> int:
def dfs(node, level):
if not node: return
total[level] += node.data
dfs(node.left,level + 1)
dfs(node.right,level + 1)
total = Counter()
dfs(root, 1)
print(total) # showing each level & its total --- can be commented out
return total.most_common()[0][1] # the largest total

Using your NodoABR class, here's a simple tree traversal, depth-first, where we note the level at each iteration, and use it to store the node values, arr is an array (or list) of lists, one for each level:
# Create sample tree
root = NodoABR(24, NodoABR(14, NodoABR(11), NodoABR(20)), NodoABR(27, NodoABR(12), NodoABR(55)))
def handle_node(nd, level):
# Add a new level list if needed
if len(arr) == level:
arr.append([])
# Append this node value to its proper level list
arr[level].append(nd.key)
# Recurse, increasing the level
if nd.left is not None:
handle_node(nd.left, level+1)
if nd.right is not None:
handle_node(nd.right, level+1)
# Main program
arr = []
handle_node(root, 0)
print(arr)
And this is the output:
[[24], [14, 27], [11, 20, 12, 55]]

Algorithm to remove words from a trie whose frequency < 5 and length > 15

I have a huge trie dictionary that I built from data from web. Although it is just 5MB when I write the trie into a file its' size is so big when I load it on the memory (more than 100 MB). So I've to compress the trie.
I am facing difficulties in writing a recursive function (preferably runs in linear time like a DFS) to remove the words whose frequency is < 5 and length > 15. Any help is appreciated
Here is my trie structure.
class TrieNode:
def __init__(self):
self.ch = '|'
self.score = 0
self.childs = [None]*26
self.isWord = False
class Trie:
def __init__(self):
self.root = TrieNode('$')
#staticmethod
def print_trie(node, level):
if node is None:
return
print(node.ch, " ", level, " ", node.isWord)
for i in range(26):
Trie.print_trie(node.childs[i], level+1)
def insert(self, word):
word = word.lower()
if not is_valid(word):
return
childs = self.root.childs
i = 0
while i < len(word):
idx = to_int(word[i])
if childs[idx] is not None:
t = childs[idx]
else:
t = TrieNode(word[i])
childs[idx] = t
childs = t.childs
if i == len(word)-1:
t.isWord = True
t.score += 1
i += 1
def search_node(self, word):
word = word.lower()
if not is_valid(word):
return False, 0
if self.root is None or word is None or len(word) == 0:
return False, 0
children = self.root.childs
for i in range(len(word)):
idx = to_int(word[i])
if children[idx] is not None:
t = children[idx]
children = t.childs
else:
return False, 0
if t.isWord:
return True, t.score
else:
return False, t.score

The following method takes a node and its level (initially pass in root and 0) and returns True if the node should remain alive after pruning and False if the node should be removed from the trie (with its subtrie).
def prune(node, level):
if node is None:
return False
canPruneNode = True
for idx in xrange(len(node.children)):
# If any of the children remains alive, don't prune current node.
if prune(children[idx], level + 1):
canPruneNode = False
else:
# Remove dead child.
node.children[idx] = None
if node.isWord and level > 15 and node.score < 5:
node.isWord = False
# Current node should be removed if and only if all of its children
# were removed and it doesn't represent a word itself after pruning.
return node.isWord or not canPruneNode

I am not sure if removing will solve the problem. The space consumed is not because of the words but because of the 26 children every node has.
Eg. I have a word cat with frequency 30 & there's another word cater whose frequency is 10. So, if you delete the node for t in cat then all the subsequent nodes will be deleted (that is cater will be reduced to cat)
So, removing a word from Trie means nothing but setting its score to 0.

Counting word strokes while parsing Trie tree

I'm trying to solve the keyboard autocompletion problem described here.
The problem is to calculate how many keystrokes a word requires, given some dictionary and autocomplete rules. For example, for the dictionary:
data = ['hello', 'hell', 'heaven', 'goodbye']
We get the following results (please refer to the link above for further explanations):
{'hell': 2, 'heaven': 2, 'hello': 3, 'goodbye': 1}
Quick explanation: if the user types h, then e is autocompleted because all words starting with h also have e as second letter. Now if the user types in l, the other l is filled, giving 2 strokes for the word hell. Of course, hello would require one more stroke. Please, see the link above for more examples.
My Trie code is the following, and it works fine (taken from https://en.wikipedia.org/wiki/Trie). The Stack code is to parse the tree from root (see edit below):
class Stack(object):
def __init__(self, size):
self.data = [None]*size
self.i = 0
self.size = size
def pop(self):
if self.i == 0:
return None
item = self.data[self.i - 1]
self.i-= 1
return item
def push(self, item):
if self.i >= self.size:
return None
self.data[self.i] = item
self.i+= 1
return item
def __str__(self):
s = '# Stack contents #\n'
if self.i == 0:
return
for idx in range(self.i - 1, -1, -1):
s+= str(self.data[idx]) + '\n'
return s
class Trie(object):
def __init__(self, value, children):
self.value = value #char
self.children = children #{key, trie}
class PrefixTree(object):
def __init__(self, data):
self.root = Trie(None, {})
self.data = data
for w in data:
self.insert(w, w)
def insert(self, string, value):
node = self.root
i = 0
n = len(string)
while i < n:
if string[i] in node.children:
node = node.children[string[i]]
i = i + 1
else:
break
while i < n:
node.children[string[i]] = Trie(string[:i], {})
node = node.children[string[i]]
i = i + 1
node.value = value
def find(self, key):
node = self.root
for char in key:
if char in node.children:
node = node.children[char]
else:
return None
return node
I couldn't figure it out how to count the number of strokes:
data = ['hello', 'hell', 'heaven', 'goodbye']
tree = PrefixTree(data)
strokes = {w:1 for w in tree.data} #at least 1 stroke is necessary
And here's the code to parse the tree from the root:
stack = Stack(100)
stack.push((None, pf.root))
print 'Key\tChilds\tValue'
print '--'*25
strokes = {}
while stack.i > 0:
key, curr = stack.pop()
# if something:
#update strokes
print '%s\t%s\t%s' % (key, len(curr.children), curr.value)
for key, node in curr.children.items():
stack.push((key, node))
print strokes
Any idea or constructive comment would help, thanks!
Edit
Great answer by #SergiyKolesnikov. There's one small change that can be done in order to avoid the call to endsWith(). I just added a boolean field to the Trie class:
class Trie(object):
def __init__(self, value, children, eow):
self.value = value #char
self.children = children #{key, trie}
self.eow = eow # end of word
And at the end of insert():
def insert(self, string, value):
#...
node.value = value
node.eow = True
Then just replace curr.value.endswith('$'): with curr.eow. Thank you all!

The trie for your example can look like this
' '
| \
H G
| |
E O
| \ |
L A O
| | |
L$ V D
| | |
O E B
| |
N Y
|
E
What nodes in the trie can be seen as markers for user key strokes? There are two types of such nodes:
Inner nodes with more than one child, because the user has to choose among multiple alternatives.
Nodes that represent the last letter of a word, but are not leaves (marked with $), because the user has to type the next letter if the current word is not what is needed.
While traversing the trie recursively one counts how many of these marker nodes were encountered before the last letter of a word was reached. This count is the number of strokes needed for the word.
For the word "hell" it is two marker nodes: ' ' and E (2 strokes).
For the word "hello" it is three marker nodes: ' ', E, L$ (3 strokes).
And so on...
What needs to be changed in your implementation:
The end of a valid word needs to be marked in the tree, so that the second condition can be checked. Therefore, we change the last line of the PrefixTree.insert() method from
node.value = value
to
node.value = value + '$'
Now we add a stroke counter for each stack item (the last value in the triple pushed to the stack) and the checks that increase the counter:
stack = Stack(100)
stack.push((None, tree.root, 0)) # We start with stroke counter = 0
print('Key\tChilds\tValue')
print('--'*25)
strokes = {}
while stack.i > 0:
key, curr, stroke_counter = stack.pop()
if curr.value is not None and curr.value.endswith('$'):
# The end of a valid word is reached. Save the word and the corresponding stroke counter.
strokes[curr.value[:-1]] = stroke_counter
if len(curr.children) > 1:
# Condition 2 is true. Increase the stroke counter.
stroke_counter += 1
if curr.value is not None and curr.value.endswith('$') and len(curr.children) > 0:
# Condition 1 is true. Increase the stroke counter.
stroke_counter += 1
print('%s\t%s\t%s' % (key, len(curr.children), curr.value))
for key, node in curr.children.items():
stack.push((key, node, stroke_counter)) # Save the stroke counter
print(strokes)
Output:
Key Childs Value
--------------------------------------------------
None 2 None
h 1
e 2 h
a 1 he
v 1 hea
e 1 heav
n 0 heaven$
l 1 he
l 1 hell$
o 0 hello$
g 1
o 1 g
o 1 go
d 1 goo
b 1 good
y 1 goodb
e 0 goodbye$
{'heaven': 2, 'goodbye': 1, 'hell': 2, 'hello': 3}

While you go through your stack, you should keep a stroke counter for each node:
It begins at 0 for None.
If the current node has more than 2 children, the counter of the
children will be 1 more than the current counter.
If the current value is a valid word and has at least one child, the
counter of the child(ren) will be 1 more than the current counter.
For documentation purpose, here's my Ruby answer :
class Node
attr_reader :key, :children
attr_writer :final
def initialize(key, children = [])
#key = key
#children = children
#final = false
end
def final?
#final
end
end
class Trie
attr_reader :root
def initialize
#root = Node.new('')
end
def add(word)
node = root
word.each_char.each{|c|
next_node = node.children.find{|child| child.key == c}
if next_node then
node = next_node
else
next_node = Node.new(c)
node.children.push(next_node)
node = next_node
end
}
node.final = true
end
def count_strokes(node=root,word="",i=0)
word=word+node.key
strokes = {}
if node.final? then
strokes[word]=i
if node.children.size>0 then
i+=1
end
elsif node.children.size>1 then
i+=1
end
node.children.each{|c|
strokes.merge!(count_strokes(c, word, i))
}
strokes
end
end
data = ['hello', 'hell', 'heaven', 'goodbye']
trie = Trie.new
data.each do |word|
trie.add(word)
end
# File.readlines('/usr/share/dict/british-english').each{|line|
# trie.add line.strip
# }
puts trie.count_strokes
#=> {"hell"=>2, "hello"=>3, "heaven"=>2, "goodbye"=>1}
60 lines only, and it take less than 3 seconds for 100 000 words.

Making a linked list without built in functions

I have a project to make a linked list in python.
My program needs to add to the list, remove from it and get elements. Sounds easy right? Wrong! We aren't allowed to use normal lists or built-in functions (other than the basic print, str...)
I have one problem with my code, I have to initialise a blank list then add the elements, 1 by 1. Everything else works fine.
My questions:
Is this how a normal python list works?
Is it possible to add items to a linked list with a loop? (without another list)
Here's the code:
class Node: # the node class
def __init__(self, cargo = None, next = None): # __init__ stands for initialize
self.cargo = cargo # e.g. steve
self.next = next # represents the next node or None if its the last
def __str__(self): # __str__ is called when the node is printed or converted to a string
return str(self.cargo) # return a string
class List: # the main list class
def __init__(self): # default is to initialize an empty list
self.first_node = None
self.last_node = None
self.length = 0
def get(self, position, length): # function for efficiency
if position == "end":
position = length - 1 # last
if position > length - 1: # can't go beyond last
raise ValueError("List index out of range")
prv_node = self.first_node
node = self.first_node # start at the first
num = 0
while num < position: # go up to position
prv_node = node # remember the previous node
node = node.next # next node!
num = num + 1
return prv_node, node, position
def add_node(self, cargo, position = "end"): # adds a node
prv_node, node, position = self.get(position, self.length + 1) # +1 because the length is being increased
print("adding node at "+str(position)+": "+str(cargo))
if position == 0: # first
self.first_node = Node(cargo, next = self.first_node) # the first node is the new node
if self.length == 0: # first node to be added
self.last_node = self.first_node # there is only one node
elif position == self.length: # last
self.last_node.next = Node(cargo, next = None) # last_node.next was None, it is now a new node
self.last_node = self.last_node.next # last node is now the new last node
else: # normal
prv_node.next = Node(cargo, next = node) # stick it in between
self.length = self.length + 1 # length is now + 1
def get_node(self, position): # gets a node
...
def remove_node(self, position): # removes a node
...
def __str__(self): # when the list is printed
node = self.first_node # start from the first
string = ""
while node != self.last_node: # go to the end
string = string + str(node) + ", " # print each node
node = node.next
string = string + str(self.last_node) # last node hasn't been added yet
return string
# initialize
mylist = List()
mylist.add_node("steve")
mylist.add_node("james")
mylist.add_node("tom")
mylist.add_node("david")
mylist.add_node("hoe-yin")
mylist.add_node("daniel")
print(mylist)
[EDIT] second question re-phrased

Here's how Python lists are implemented in CPython: http://www.laurentluce.com/posts/python-list-implementation/
If you have your values in some other iterable, then yes:
for item in list_of_items:
mylist.add_node(item)

A* algorithm implementation node-parent bug

I have a problem with my attempt at A* - this does not happen every time it runs, but every so often it makes it so two cells become each other's parents.
This in turn leads to an infinite loop when I try to recreate the path. I've been trying to find out why this happens for some time now, but can't figure it out, so any help would be appreciated.
def find_path(self, start, end):
tstart = time.time()
count = 0
evaled = set()
notEvaled = set([start])
start.g_score(start)
start.h_score(end)
start.f_score()
while len(notEvaled) != 0:
current = min(notEvaled, key=attrgetter('f'))
notEvaled.remove(current)
for neighbour in current.neighbours:
if neighbour.full or neighbour in evaled: continue
if neighbour in notEvaled:
if neighbour.parent != None and neighbour.parent.g > current.g:
neighbour.parent = current
neighbour.g_score(start)
neighbour.f_score()
else:
neighbour.parent = current
neighbour.g_score(start)
neighbour.h_score(end)
neighbour.f_score()
notEvaled.add(neighbour)
if neighbour == end:
path = []
while end != None:
path.insert(0, end)
end = end.parent
return path
evaled.add(current)
return None
Here are my score function, but i doubt they matter
def g_score(self, start):
if self == start:
self.g = 0
else:
self.g = self.parent.g + 1
def h_score(self, end):
self.h = abs(end.x - self.x) + abs(end.y - self.y)
def f_score(self):
self.f = self.g + self.h

Before I begin: the common naming convention is openSet/closedSet instead of evaled/notEvaled. The naming convention you use becomes confusing when you need to use a double negative, as in: aNode not in notEvaled. So, in my answer I use the openSet/closedSet convension.
One thing that stands out to me is you add the current node to the closedSet (evaled) immediately after removing it from the openSet (notEvaled).
The second thing that stands out is that you check whether a node is the endpoint before adding it to the openSet. While this may seem like an optimization, it actually forces you to perform the check on every single opened node, rather than on the subset of open nodes that are actually visited.
Here is an adaptation of your A* algorithm that should work as expected. It is based on the pseudo code from Wikipedia except that it integrates the reconstruct_path method:
def find_path(self, start, end):
tstart = time.time()
count = 0
closedSet = set()
openSet = set([start])
start.g_score(start)
start.h_score(end)
start.f_score()
while len(openSet) != 0:
current = min(openSet, key=attrgetter('f'))
openSet.remove(current)
closedSet.add(current)
#Check if goal is reached
if(current == end):
#Construct path from start to goal
path = []
while end != None:
path.insert(0,end)
end = end.parent
return path
#Open neighbors
for neighbour in current.neighbours:
if neighbour.full: continue
if neighbour in closedSet: continue
tentative_g_score = 1 + current.g
if neighbour not in openSet or tentative_g_score < neighbor.g:
neighbour.parent = current
neighbour.g = tentative_g_score
neighbour.f_score()
if neighbour not in openSet:
openSet.add(neighbour)
return None

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Printing Suffix Trees Edges in Python - python

Related

Sum levels wise binary tree

Algorithm to remove words from a trie whose frequency < 5 and length > 15

Counting word strokes while parsing Trie tree

Making a linked list without built in functions

A* algorithm implementation node-parent bug

Categories

Resources