Python 科萨拉朱';SCCs的s算法,非递归

Python 科萨拉朱';SCCs的s算法,非递归,python,recursion,graph-theory,depth-first-search,kosaraju-algorithm,Python,Recursion,Graph Theory,Depth First Search,Kosaraju Algorithm,我有一个Kosaraju在Python中查找SCC的算法的实现。下面的代码包含一个递归(在小测试用例中很好)版本和一个非递归版本(由于实际数据集的大小,我最终需要这个版本) 我已经在一些测试数据集上运行了递归和非递归版本,并得到了正确的答案。然而,在我最终需要使用的更大的数据集上运行它会产生错误的结果。浏览真实数据实际上不是一个选项,因为它包含近一百万个节点 我的问题是我不知道如何从这里开始。我的怀疑是,要么我忘记了测试用例中的某个图形星座,要么我对这个算法应该如何工作有一个更基本的误解 #!/

我有一个Kosaraju在Python中查找SCC的算法的实现。下面的代码包含一个递归(在小测试用例中很好)版本和一个非递归版本(由于实际数据集的大小,我最终需要这个版本)

我已经在一些测试数据集上运行了递归和非递归版本,并得到了正确的答案。然而,在我最终需要使用的更大的数据集上运行它会产生错误的结果。浏览真实数据实际上不是一个选项,因为它包含近一百万个节点

我的问题是我不知道如何从这里开始。我的怀疑是,要么我忘记了测试用例中的某个图形星座,要么我对这个算法应该如何工作有一个更基本的误解

#!/usr/bin/env python3

import heapq


class Node():
    """A class to represent nodes in a DirectedGraph. It has attributes for
    performing DFS."""

    def __init__(self, i):
        self.id = i
        self.edges = []
        self.rev_edges = []
        self.explored = False
        self.fin_time = 0
        self.leader = 0

    def add_edge(self, edge_id):
        self.edges.append(edge_id)

    def add_rev_edge(self, edge_id):
        self.rev_edges.append(edge_id)

    def mark_explored(self):
        self.explored = True

    def set_leader(self, leader_id):
        self.leader = leader_id

    def set_fin_time(self, fin_time):
        self.fin_time = fin_time


class DirectedGraph():
    """A class to represent directed graphs via the adjacency list approach.
    Each dictionary entry is a Node."""

    def __init__(self, length, list_of_edges):
        self.nodes = {}
        self.nodes_by_fin_time = {}
        self.length = length
        self.fin_time = 1  # counter for the finishing time
        self.leader_count = 0  # counter for the size of leader nodes
        self.scc_heapq = []  # heapq to store the ssc by size
        self.sccs_computed = False

        for n in range(1, length + 1):
            self.nodes[str(n)] = Node(str(n))

        for n in list_of_edges:
            ns = n[0].split(' ')
            self.nodes[ns[0]].add_edge(ns[1])
            self.nodes[ns[1]].add_rev_edge(ns[0])

    def n_largest_sccs(self, n):
        if not self.sccs_computed:
            self.compute_sccs()

        return heapq.nlargest(n, self.scc_heapq)

    def compute_sccs(self):
        """First compute the finishing times and the resulting order of nodes
        via a DFS loop. Second use that new order to compute the SCCs and order
        them by their size."""

        # Go through the given graph in reverse order, computing the finishing
        # times of each node, and create a second graph that uses the finishing
        # times as the IDs.
        i = self.length
        while i > 0:
            node = self.nodes[str(i)]
            if not node.explored:
                self.dfs_fin_times(str(i))
            i -= 1

        # Populate the edges of the nodes_by_fin_time
        for n in self.nodes.values():
            for e in n.edges:
                e_head_fin_time = self.nodes[e].fin_time
                self.nodes_by_fin_time[n.fin_time].add_edge(e_head_fin_time)

        # Use the nodes ordered by finishing times to calculate the SCCs.
        i = self.length
        while i > 0:
            self.leader_count = 0
            node = self.nodes_by_fin_time[str(i)]
            if not node.explored:
                self.dfs_leaders(str(i))

            heapq.heappush(self.scc_heapq, (self.leader_count, node.id))
            i -= 1

        self.sccs_computed = True

    def dfs_fin_times(self, start_node_id):
        stack = [self.nodes[start_node_id]]

        # Perform depth-first search along the reversed edges of a directed
        # graph. While doing this populate the finishing times of the nodes
        # and create a new graph from those nodes that uses the finishing times
        # for indexing instead of the original IDs.
        while len(stack) > 0:
            curr_node = stack[-1]
            explored_rev_edges = 0
            curr_node.mark_explored()

            for e in curr_node.rev_edges:
                rev_edge_head = self.nodes[e]
                # If the head of the rev_edge has already been explored, ignore
                if rev_edge_head.explored:
                    explored_rev_edges += 1
                    continue
                else:
                    stack.append(rev_edge_head)

            # If the current node has no valid, unexplored outgoing reverse
            # edges, pop it from the stack, populate the fin time, and add it
            # to the new graph.
            if len(curr_node.rev_edges) - explored_rev_edges == 0:
                sink_node = stack.pop()
                # The fin time is 0 if that node has not received a fin time.
                # Prevents dealing with the same node twice here.
                if sink_node and sink_node.fin_time == 0:
                    sink_node.set_fin_time(str(self.fin_time))
                    self.nodes_by_fin_time[str(self.fin_time)] = \
                        Node(str(self.fin_time))
                    self.fin_time += 1

    def dfs_leaders(self, start_node_id):
        stack = [self.nodes_by_fin_time[start_node_id]]
        while len(stack) > 0:
            curr_node = stack.pop()
            curr_node.mark_explored()
            self.leader_count += 1

            for e in curr_node.edges:
                if not self.nodes_by_fin_time[e].explored:
                    stack.append(self.nodes_by_fin_time[e])

###### Recursive verions below ###################################

    def dfs_fin_times_rec(self, start_node_id):
        curr_node = self.nodes[start_node_id]
        curr_node.mark_explored()
        for e in curr_node.rev_edges:
            if not self.nodes[e].explored:
                self.dfs_fin_times_rec(e)

        curr_node.set_fin_time(str(self.fin_time))
        self.nodes_by_fin_time[str(self.fin_time)] = Node(str(self.fin_time))
        self.fin_time += 1

    def dfs_leaders_rec(self, start_node_id):
        curr_node = self.nodes_by_fin_time[start_node_id]
        curr_node.mark_explored()
        for e in curr_node.edges:
            if not self.nodes_by_fin_time[e].explored:
                self.dfs_leaders_rec(e)

        self.leader_count += 1
要运行:

#!/usr/bin/env python3

import utils
from graphs import scc_computation

# data = utils.load_tab_delimited_file('data/SCC.txt')
data = utils.load_tab_delimited_file('data/SCC_5.txt')

# g = scc_computation.DirectedGraph(875714, data)
g = scc_computation.DirectedGraph(11, data)

g.compute_sccs()

# for e, v in g.nodes.items():
#     print(e, v.fin_time)

# for e, v in g.nodes_by_fin_time.items():
#     print(e, v.edges)

print(g.n_largest_sccs(20))
最复杂的测试用例(SCC_5.txt):

该测试用例的图纸:

这将产生4个SCC:

  • 底部:尺寸4,节点2,8,6,11
  • 左:大小3,节点1,10,4
  • 顶部:尺寸1,节点5
  • 右:大小3,节点7,3,9

    • 好的,我找到了丢失的箱子。该算法在强连通图和重复边上执行不正确。这是我在上面发布的测试用例的一个调整版本,其中有一个复制的边和更多的边,以将整个图变成一个大的SCC

      1 5
      1 4
      2 3
      2 6
      2 11
      3 2
      3 7
      4 2
      4 8
      4 10
      5 1
      5 3
      5 5
      5 7
      6 8
      7 9
      8 2
      8 2
      8 4
      8 8
      9 3
      10 1
      11 9
      11 6
      
      1 5
      1 4
      2 3
      2 6
      2 11
      3 2
      3 7
      4 2
      4 8
      4 10
      5 1
      5 3
      5 5
      5 7
      6 8
      7 9
      8 2
      8 2
      8 4
      8 8
      9 3
      10 1
      11 9
      11 6