I have an implementation of Kosaraju's algorithm for finding SCCs in Python. The code below contains a recursive (fine on the small test cases) version and a non-recursive one (which I ultimately need because of the size of the real dataset).
I have run both the recursive and non-recursive version on a few test datasets and get the correct answer. However running it on the much larger dataset that I ultimately need to use, produces the wrong result. Going through the real data is not really an option because it contains nearly a million nodes.
My problem is that I don't know how to proceed from here. My suspision is that I either forgot a certain case of graph constellation in my test cases, or that I have a more fundamental misunderstanding about how this algo is supposed to work.
#!/usr/bin/env python3
import heapq
class Node():
"""A class to represent nodes in a DirectedGraph. It has attributes for
performing DFS."""
def __init__(self, i):
self.id = i
self.edges = []
self.rev_edges = []
self.explored = False
self.fin_time = 0
self.leader = 0
def add_edge(self, edge_id):
self.edges.append(edge_id)
def add_rev_edge(self, edge_id):
self.rev_edges.append(edge_id)
def mark_explored(self):
self.explored = True
def set_leader(self, leader_id):
self.leader = leader_id
def set_fin_time(self, fin_time):
self.fin_time = fin_time
class DirectedGraph():
"""A class to represent directed graphs via the adjacency list approach.
Each dictionary entry is a Node."""
def __init__(self, length, list_of_edges):
self.nodes = {}
self.nodes_by_fin_time = {}
self.length = length
self.fin_time = 1 # counter for the finishing time
self.leader_count = 0 # counter for the size of leader nodes
self.scc_heapq = [] # heapq to store the ssc by size
self.sccs_computed = False
for n in range(1, length + 1):
self.nodes[str(n)] = Node(str(n))
for n in list_of_edges:
ns = n[0].split(' ')
self.nodes[ns[0]].add_edge(ns[1])
self.nodes[ns[1]].add_rev_edge(ns[0])
def n_largest_sccs(self, n):
if not self.sccs_computed:
self.compute_sccs()
return heapq.nlargest(n, self.scc_heapq)
def compute_sccs(self):
"""First compute the finishing times and the resulting order of nodes
via a DFS loop. Second use that new order to compute the SCCs and order
them by their size."""
# Go through the given graph in reverse order, computing the finishing
# times of each node, and create a second graph that uses the finishing
# times as the IDs.
i = self.length
while i > 0:
node = self.nodes[str(i)]
if not node.explored:
self.dfs_fin_times(str(i))
i -= 1
# Populate the edges of the nodes_by_fin_time
for n in self.nodes.values():
for e in n.edges:
e_head_fin_time = self.nodes[e].fin_time
self.nodes_by_fin_time[n.fin_time].add_edge(e_head_fin_time)
# Use the nodes ordered by finishing times to calculate the SCCs.
i = self.length
while i > 0:
self.leader_count = 0
node = self.nodes_by_fin_time[str(i)]
if not node.explored:
self.dfs_leaders(str(i))
heapq.heappush(self.scc_heapq, (self.leader_count, node.id))
i -= 1
self.sccs_computed = True
def dfs_fin_times(self, start_node_id):
stack = [self.nodes[start_node_id]]
# Perform depth-first search along the reversed edges of a directed
# graph. While doing this populate the finishing times of the nodes
# and create a new graph from those nodes that uses the finishing times
# for indexing instead of the original IDs.
while len(stack) > 0:
curr_node = stack[-1]
explored_rev_edges = 0
curr_node.mark_explored()
for e in curr_node.rev_edges:
rev_edge_head = self.nodes[e]
# If the head of the rev_edge has already been explored, ignore
if rev_edge_head.explored:
explored_rev_edges += 1
continue
else:
stack.append(rev_edge_head)
# If the current node has no valid, unexplored outgoing reverse
# edges, pop it from the stack, populate the fin time, and add it
# to the new graph.
if len(curr_node.rev_edges) - explored_rev_edges == 0:
sink_node = stack.pop()
# The fin time is 0 if that node has not received a fin time.
# Prevents dealing with the same node twice here.
if sink_node and sink_node.fin_time == 0:
sink_node.set_fin_time(str(self.fin_time))
self.nodes_by_fin_time[str(self.fin_time)] = \
Node(str(self.fin_time))
self.fin_time += 1
def dfs_leaders(self, start_node_id):
stack = [self.nodes_by_fin_time[start_node_id]]
while len(stack) > 0:
curr_node = stack.pop()
curr_node.mark_explored()
self.leader_count += 1
for e in curr_node.edges:
if not self.nodes_by_fin_time[e].explored:
stack.append(self.nodes_by_fin_time[e])
###### Recursive verions below ###################################
def dfs_fin_times_rec(self, start_node_id):
curr_node = self.nodes[start_node_id]
curr_node.mark_explored()
for e in curr_node.rev_edges:
if not self.nodes[e].explored:
self.dfs_fin_times_rec(e)
curr_node.set_fin_time(str(self.fin_time))
self.nodes_by_fin_time[str(self.fin_time)] = Node(str(self.fin_time))
self.fin_time += 1
def dfs_leaders_rec(self, start_node_id):
curr_node = self.nodes_by_fin_time[start_node_id]
curr_node.mark_explored()
for e in curr_node.edges:
if not self.nodes_by_fin_time[e].explored:
self.dfs_leaders_rec(e)
self.leader_count += 1
To run:
#!/usr/bin/env python3
import utils
from graphs import scc_computation
# data = utils.load_tab_delimited_file('data/SCC.txt')
data = utils.load_tab_delimited_file('data/SCC_5.txt')
# g = scc_computation.DirectedGraph(875714, data)
g = scc_computation.DirectedGraph(11, data)
g.compute_sccs()
# for e, v in g.nodes.items():
# print(e, v.fin_time)
# for e, v in g.nodes_by_fin_time.items():
# print(e, v.edges)
print(g.n_largest_sccs(20))
Most complex test case (SCC_5.txt):
1 5
1 4
2 3
2 11
2 6
3 7
4 2
4 8
4 10
5 7
5 5
5 3
6 8
6 11
7 9
8 2
8 8
9 3
10 1
11 9
11 6
Drawing of that test case: https://imgur.com/a/LA3ObpN
This produces 4 SCCs:
Bottom: Size 4, nodes 2, 8, 6, 11
Left: Size 3, nodes 1, 10, 4
Top: Size 1, node 5
Right: Size 3, nodes 7, 3, 9
Ok, I figured out the missing cases. The algorithm wasn't performing correctly on very strongly connected graphs and duplicated edges. Here is an adjusted version of the test case I posted above with a duplicated edge and more edges to turn the whole graph into one big SCC.
1 5
1 4
2 3
2 6
2 11
3 2
3 7
4 2
4 8
4 10
5 1
5 3
5 5
5 7
6 8
7 9
8 2
8 2
8 4
8 8
9 3
10 1
11 9
11 6
Related
What mistake have i done here ?
def levelOrder(root):
#Write your code here
que = []
que.append(root)
while que != []:
coot = que.pop()
print(coot.data,end=" ")
if coot.left is not None:
que.append(coot.left)
if coot.right is not None:
que.append(coot.right)
OutPut Expected:1 2 5 3 6 4
MY_output: 1 2 5 6 3 4
You are appending nodes to end end of the list que(using append()). And also removing the nodes from the end of the list que(using list.pop()), this would not preserve the order, so for something like
1
/ \
2 3
/ \ / \
4 5 6 7
After first iteration would have que=[2, 3], and then you would pop 3 first instead of 2, which is incorrect. Instead, you should be popping 2, popping from the left(since you are appending the new nodes to the right).
So replacing coot = que.pop() with coot = que.pop(0) in your existing code should fix the issue. But note that list.pop(0) is a O(n) operation python. So I would suggest using collections.deque instead.
With deque, your code can be -
from collections import deque
def levelOrder(root):
#Write your code here
que = deque()
que.append(root)
while que != []:
coot = que.popleft()
print(coot.data,end=" ")
if coot.left is not None:
que.append(coot.left)
if coot.right is not None:
que.append(coot.right)
I have a simple graph with 4 nodes A,B,C,D as well as the following edges:
[A,B]
[B,D]
[B,C]
I want to find paths that start at the node C given a certain length n. For example:
for n = 1 I will only have [C] as a possible path. Result is 1
for n = 2 we only have [C,B]. Result is 1
for n = 3 we have [C,B,C] , [C,B,D], [C,B,A]. Result is 3
etc.
I have written the following (python) code:
dg = {'A':['B'],
'B':['C','D','A'],
'D':['B'],
'C':['B']}
beg = ['C']
def makePath(n):
count = 0
curArr = beg
for i in range(n):
count = len(curArr)
tmp = []
for i in curArr:
tmp.extend(dg[i])
curArr = tmp
return count
However it gets extremely slow above n=12. Is there a better algorithm to solve this and more importantly. one that can be generalized for any undirected graph (i.e. with up to 20 nodes)?
Hi i have a big text file with format like this:
1 3 1
2 3 -1
5 7 1
6 1 -1
3 2 -1
the first column is the starting node, the second column the ending node and the third column shows the sign between two nodes. So i have positive and negative signs.
Im reading the graph with the code below:
G = nx.Graph()
G = nx.read_edgelist('network.txt', delimiter='\t', nodetype=int, data=(('weight', int),))
print(nx.info(G))
I also found a function to find the neighbors of a specific node:
list1 = list(G.neigbors(1))
So i have a list with the adjacency nodes of node 1. How can a find the sign between the node 1 and each adjacency node? (For example that edge between 1-3 has sign 1, the edge 1-5 has sign -1 etc)
An example for node 1:
n_from = 1
for n_to in G.neighbors(n_from):
sign = G[n_from][n_to]['weight']
print('edge from {} to {} has sign {}'.format(
n_from, n_to, sign))
which prints, for the example input you gave:
edge from 1 to 3 has sign 1
edge from 1 to 6 has sign -1
A similar approach, treating G[n_from] as a dict:
n_from = 1
for n_to, e_data in G[n_from].items():
sign = e_data['weight']
# then print
You can alternatively use Graph.get_edge_data, as such:
e_data = G.get_edge_data(n_from, n_to)
sign = e_data.get('weight')
i' m new to algorithm, and I developed such function for finding the tree height of the input sequence.
import sys, threading
class TreeHeight:
def read(self):
self.n = int(sys.stdin.readline())
self.parent = list(map(int, sys.stdin.readline().split()))
def compute_height(self):
# Replace this code with a faster implementation
maxHeight = 0
for vertex in range(self.n):
height = 0
i = vertex
while i != -1:
height += 1
i = self.parent[i]
maxHeight = max(maxHeight, height)
return maxHeight
def main():
tree = TreeHeight()
tree.read()
print(tree.compute_height())
threading.Thread(target=main).start()
That works like this:
5
4 -1 4 1 1
And output
3
Because there are 5 nodes with numbers from 0 to 4, node 0 is a child of node 4, node 1 is the root,node 2 is a child of node 4, node 3 is a child of node 1 and node 4 is a child of node1. The height of this tree is 3, because the number of vertices on the path from root 1 to leaf 2 is 3
My solution is working, but it is not optimized for deep trees. For example, program crashes for input 100000.
What is the better solution in my case?
I am working on implementing the Strongly Connected Components Program from input file of numbers.I know the algorithm on how to do this,but having hard time implementing it in python.
STRONGLY-CONNECTED-COMPONENTS(G)
1. run DFS on G to compute finish times
2. compute G'
3. run DFS on G', but when selecting which node to vist do so
in order of decreasing finish times (as computed in step 1)
4. output the vertices of each tree in the depth-first forest
of step 3 as a separate strongly connected component
The file looks like this:
5 5
1 2
2 4
2 3
3 4
4 5
The first line is no. of nodes and edges.The rest of the lines are two integers u and v separated by a space, which means a directed edge from node u to node v.The output is to be a strongly connected component and the no.of these components.
DFS(G)
1 for each vertex u in G.V
2 u.color = WHITE
3 u.π = NIL
4 time = 0
5 for each vertex u in G.V
6 if u.color == WHITE
7 DFS-VISIT(G, u)
DFS-VISIT(G, u)
1 time = time + 1 // white vertex u has just been discovered
2 u.d = time
3 u.color = GRAY
4 for each v in G.adj[u]
5 if v.color == WHITE
6 v.π = u
7 DFS-VISIT(G, u)
8 u.color = BLACK // blacken u; it is finished
9 time = time + 1
10 u.f = time
In the above algorithm how should I traverse the reverse graph to find SCC.
Here, implemented in Python.
Please notice that I construct G and G' at the same time. My DFS is also modified. The visited array stores in which component each node is. Also, the DFS receives a sequence argument, that is the order in which the nodes will be tested. In the first DFS, we pass a xrange(n), but in the second time, we pass the reversed(order) from the first execution.
The program will output something like:
3
[1, 1, 1, 2, 3]
In that output, we have 3 strongly connected components, with the 3 first nodes in a single component and the remaining two with one component each.
def DFSvisit(G, v, visited, order, component):
visited[v] = component
for w in G[v]:
if not visited[w]:
DFSvisit(G, w, visited, order, component)
order.append(v);
def DFS(G, sequence, visited, order):
components = 0
for v in sequence:
if not visited[v]:
components += 1
DFSvisit(G, v, visited, order, components)
n, m = (int(i) for i in raw_input().strip().split())
G = [[] for i in xrange(n)]
Gt = [[] for i in xrange(n)]
for i in xrange(m):
a, b = (int(i) for i in raw_input().strip().split())
G[a-1].append(b-1)
Gt[b-1].append(a-1)
order = []
components = [0]*n
DFS(G, xrange(n), [0]*n, order)
DFS(Gt, reversed(order), components, [])
print max(components)
print components
class graphSCC:
def __init__(self, graplist):
self.graphlist = graphlist
self.visitedNode = {}
self.SCC_dict = {}
self.reversegraph = {}
def reversegraph(self):
for edge in self.graphlist:
line = edge.split("\t")
self.reverseGraph.setdefault(strip("\r"), []).append()
return self.reverseGraph
def dfs(self):
SCC_count = 0
for x in self.reversegraph.keys():
self.visitednode[x] = 0
for x in self.reversegraph.keys():
if self.visitednode[x] == 0:
count += 1
self.explore(x, count)
def explore(self, node, count):
self.visitednode[node] = 1
for val in self.reversegraph[node]:
if self.visitednode[val] == 0:
self.explore(val, count)
self.SCC_dict.setdefault(count, []).append(node)
length = 0
node = 0
for x in graph.SCC_dict.keys():
if length < len(graph.SCC_dict[x]):
length = len(graph.SCC_dict[x])
node = x
length is the required answer