在 pyspark 中使用 graphframes(Pregel API)实现组织中的员工层次结构和深度

在 pyspark 中使用 graphframes(Pregel API)实现组织中的员工层次结构和深度。标签:pyspark、graphframes、pregel。问题:我有一个 spark/scala GraphX 解决方案,它可以解决员工层级问题,并为我提供每个员工相对于顶级经理的深度,内部使用 Pregel API。我能否使用 pyspark graphframes(及其 Pregel API)实现相同的功能?下面是现有的 Scala 代码。

我有一个 spark/scala GraphX 解决方案,它可以解决员工层级问题,并为我提供每个员工相对于顶级经理的深度。它在内部使用 Pregel API。我能否使用 pyspark graphframes 实现相同的功能?如果可能,能否使用其 Pregel API?

    import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.sql.functions._ 
import org.apache.spark.sql.DataFrame

// Role is a readable alias for an employee's job title string.
type Role = String
// Vertex attribute for the graph: the employee's display name and role.
case class Employee(name: String, role: Role)

// Raw data: (employeeId, firstName, lastName, role, Option(supervisorId)).
// The CEO (id 1) has no supervisor (None); every other employee points up-line.
val employeeRawData = Array(
  (1L, "Steve", "Jobs", "CEO", None),
  (2L, "Leslie", "Lamport", "CTO", Some(1L)),
  (3L, "Jason", "Fried", "Manager", Some(1L)),
  (4L, "Joel", "Spolsky", "Manager", Some(2L)),
  (5L, "Jeff", "Dean", "Lead", Some(4L)),
  (6L, "Martin", "Odersky", "Sr.Dev", Some(5L)),
  (7L, "Linus", "Trovalds", "Dev", Some(6L)),
  (8L, "Steve", "Wozniak", "Dev", Some(6L)),
  (9L, "Matei", "Zaharia", "Dev", Some(6L)),
  (10L, "James", "Faeldon", "Intern", Some(7L))
)

// Load the raw tuples into a 4-partition DataFrame and cache it,
// since it feeds both the vertex RDD and the edge RDD below.
val employeeDf = sc
  .parallelize(employeeRawData, 4)
  .toDF("employeeId", "firstName", "lastName", "role", "supervisorId")
  .cache()

// Vertex RDD: (employeeId, Employee(fullName, role)).
// The full name is first and last name joined with a single space.
val verticesRdd: RDD[(VertexId, Employee)] = employeeDf
  .select($"employeeId", concat($"firstName", lit(" "), $"lastName"), $"role")
  .rdd
  .map { row => (row.getLong(0), Employee(row.getString(1), row.getString(2))) }

// Edge RDD: one edge per supervisor -> report relationship.
val edgesRdd: RDD[Edge[String]] = employeeDf
  .filter($"supervisorId".isNotNull) // Remove vertices without a supervisor; in this DataFrame None === null
  .select($"supervisorId", $"employeeId", $"role") // First column is supervisorId (not employeeId), since the edge direction is top-down
  .rdd.map(emp => Edge(emp.getLong(0), emp.getLong(1), emp.getString(2))) // The edge property is the report's role

// Default employee attribute for any vertex id referenced by an edge but missing from verticesRdd.
val missingEmployee = Employee("John Doe", "Unknown")

// Build the graph model: vertices carry Employee attributes, edges carry the role string.
val employeeGraph: Graph[Employee, String] = Graph(verticesRdd, edgesRdd, missingEmployee)

// The structure of the message passed between vertices during Pregel supersteps.
case class EmployeeMessage(
  currentId: Long, // Tracks the most recent vertex appended to the path; used for flagging isCyclic
  level: Int, // The number of up-line supervisors (level in the reporting hierarchy)
  head: String, // The top-most supervisor
  path: List[String], // The reporting path up to the top-most supervisor
  isCyclic: Boolean, // Whether the employee's reporting structure is cyclic
  isLeaf: Boolean // Whether the employee is rank and file (no down-line reporting employees)
)

// The structure of the vertex values of the graph.
case class EmployeeValue(
  name: String, // The employee name
  currentId: Long, // Initial value is the employeeId
  level: Int, // Initial value is zero
  head: String, // Initial value is this employee's own name
  path: List[String], // Initial value contains this employee's name only
  isCyclic: Boolean, // Initial value is false
  isLeaf: Boolean // Initial value is true
)

// Initialize the employee vertices: each employee starts as its own hierarchy
// head at level 0, with a path containing only itself.
// NOTE: isLeaf must start as true (matching the EmployeeValue documentation):
// sendMsg clears the flag for vertices that have out-edges, so starting at
// false would leave every vertex permanently marked as a non-leaf.
val employeeValueGraph: Graph[EmployeeValue, String] = employeeGraph.mapVertices { (id, v) =>
  EmployeeValue(
    name = v.name,
    currentId = id,
    level = 0,
    head = v.name,
    path = List(v.name),
    isCyclic = false,
    isLeaf = true
  )
}

// Vertex program: merges one (already-combined) incoming message into the
// vertex value. Runs on every vertex that received a message in the superstep.
def vprog(
  vertexId: VertexId,
  value: EmployeeValue,
  message: EmployeeMessage
): EmployeeValue = {

  if (message.level == 0) { // superstep 0 - initialize (only the initial message carries level == 0)
    value.copy(level = value.level + 1)
  } else if (message.isCyclic) { // sender detected a cycle: mark this vertex
    value.copy(isCyclic = true)
  } else if (!message.isLeaf) { // sender says this vertex has down-line reports: clear the leaf flag
    value.copy(isLeaf = false)
  } else { // regular propagation: absorb the supervisor's head/path and deepen the level
    value.copy(
      currentId = message.currentId,
      level = value.level + 1,
      head = message.head,
      path = value.name :: message.path
    )
  }
}


// Send-message phase: for each supervisor -> report edge, decide what (if
// anything) to send. Values propagate down-line, except the leaf-flag
// message, which is sent back to the source vertex itself.
def sendMsg(
  triplet: EdgeTriplet[EmployeeValue, String]
): Iterator[(VertexId, EmployeeMessage)] = {

  val src = triplet.srcAttr
  val dst = triplet.dstAttr

  // Handle a cyclic reporting structure: the source's latest path entry is the destination
  if (src.currentId == triplet.dstId || src.currentId == dst.currentId) {
    if (!src.isCyclic) { // Flag the destination as cyclic
      Iterator((triplet.dstId, EmployeeMessage(
        currentId = src.currentId,
        level = src.level,
        head = src.head,
        path = src.path,
        isCyclic = true,
        isLeaf = src.isLeaf
      )))
    } else { // Already marked as cyclic (possibly in a previous superstep), so ignore
      Iterator.empty
    }
  } else { // Regular reporting structure
    if (src.isLeaf) { // Initially every vertex is a leaf; this vertex has an out-edge, so tell it it is NOT one
      Iterator((triplet.srcId, EmployeeMessage(
        currentId = src.currentId,
        level = src.level,
        head = src.head,
        path = src.path,
        isCyclic = false,
        isLeaf = false // This is the only value that matters in this message
      )))
    } else { // Propagate the source's values down to the destination
      Iterator((triplet.dstId, EmployeeMessage(
        currentId = src.currentId,
        level = src.level,
        head = src.head,
        path = src.path,
        isCyclic = false, // false so that the cyclic branch is skipped in vprog
        isLeaf = true // true so that the leaf branch is skipped in vprog
      )))
    }
  }
}


// Combiner for multiple messages arriving at one vertex in the same superstep:
// arbitrarily keeps the second message. Acceptable here because in a tree-shaped
// hierarchy each employee has at most one supervisor propagating values.
def mergeMsg(msg1: EmployeeMessage, msg2: EmployeeMessage): EmployeeMessage = msg2

// Initial message delivered to every vertex in superstep 0.
// level == 0 is the sentinel that vprog uses to detect the initialization step.
val initialMsg = EmployeeMessage(
    currentId = 0L, 
    level = 0, 
    head = "", 
    path = Nil, 
    isCyclic = false, 
    isLeaf = true
)

// Run Pregel: iterate until no messages remain (bounded by Int.MaxValue supersteps),
// sending messages along out-edges only (supervisor -> report direction).
val results = employeeValueGraph.pregel(
  initialMsg,
  Int.MaxValue,
  EdgeDirection.Out
)(
  vprog,
  sendMsg,
  mergeMsg
)

// Collect the Pregel vertex results into a DataFrame, rendering the path as a
// ">"-joined string (reversed so it reads top-down, head first).
val resultDf = results
  .vertices.map { case (id, v) => (id, v.name, v.level, v.head, v.path.reverse.mkString(">"), v.isCyclic, v.isLeaf) }
  .toDF("id", "employee", "level", "head", "path", "cyclic", "leaf")

// Split the path string back into an array so each hierarchy level can become its own column.
val df = resultDf.withColumn("letters", split(col("path"), ">"))

// Depth of the deepest reporting chain = the maximum number of path elements.
val numCols = df
  .withColumn("letters_size", size($"letters"))
  .agg(max($"letters_size"))
  .head()
  .getInt(0)

// Pivot the path array into level0..levelN columns, drop the helper columns,
// and display the hierarchy ordered by depth.
df.drop(col("path")).drop(col("leaf")).drop("cyclic")
  .select( col("*") +: 
    (0 until numCols).map(i => $"letters".getItem(i).as(s"level$i")): _*
  ).drop(col("letters")).orderBy(col("level"))
  .show()
这将生成类似这样的输出


有人能给我介绍一下 pyspark graphframes 的等价转换方法吗?

这能回答你的问题吗?