Scala: explode multiple nested columns, run agg, and join all the tables


I'm wondering whether there is a more efficient way to do this than the following:

val df0 = df.select($"id", explode($"event.x0") as "n_0").groupBy("id").agg(sum("n_0") as "0")
val df1 = df.select($"id", explode($"event.x1") as "n_1").groupBy("id").agg(sum("n_1") as "1")
val df2 = df.select($"id", explode($"event.x2") as "n_2").groupBy("id").agg(sum("n_2") as "2")
val df3 = df.select($"id", explode($"event.x3") as "n_3").groupBy("id").agg(sum("n_3") as "3")


val final_df = df.join(df0, "id").join(df1, "id").join(df2, "id").join(df3, "id")
I tried doing it like this:

val df_x = df.select($"id", $"event", explode($"event.x0") as "0" )
            .select($"id", $"event", $"0", explode($"event.x1") as "1")
            .select($"id", $"event", $"0", $"1", explode($"event.x2") as "2")
            .groupBy("id")
            .agg(sum("0") as "0", sum("1") as "1", sum("2") as "2")

val final_df = df.join(df_x, "id")
Although this runs much faster, the aggregated values are wrong, so it doesn't actually work :(
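
The reason the sums come out wrong is that chained explode calls cross-join the arrays: after exploding event.x0 and then event.x1, every element of x0 is paired with every element of x1, so each sum is inflated by the product of the other arrays' sizes. A minimal sketch of the effect on hypothetical data (assumes a SparkSession named spark with spark.implicits._ imported):

import org.apache.spark.sql.functions._
import spark.implicits._

// one id whose arrays are x0 = [1, 2] and x1 = [10]
val demo = Seq((1, Seq(1, 2), Seq(10))).toDF("id", "x0", "x1")

demo
  .select($"id", explode($"x0") as "n0", $"x1")  // 2 rows: one per element of x0
  .select($"id", $"n0", explode($"x1") as "n1")  // still 2 rows; n1 = 10 appears twice
  .groupBy("id")
  .agg(sum("n0") as "0", sum("n1") as "1")
  .show()
// +---+---+---+
// | id|  0|  1|
// +---+---+---+
// |  1|  3| 20|   <- "1" should be 10; it is doubled because x0 has two elements
// +---+---+---+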


Is there a way to reduce the number of joins?

Assuming there aren't too many matching records per id, you can use the collect_list aggregation function to collect all the matching arrays into an array of arrays, and then sum those nested arrays with a user-defined function:

import scala.collection.mutable
import org.apache.spark.sql.functions._

val flattenAndSum = udf[Int, mutable.Seq[mutable.Seq[Int]]] { seqOfArrays => seqOfArrays.flatten.sum }

val sums = df.groupBy($"id").agg(
  collect_list($"event.x0") as "arr0",
  collect_list($"event.x1") as "arr1",
  collect_list($"event.x2") as "arr2",
  collect_list($"event.x3") as "arr3"
).select($"id",
  flattenAndSum($"arr0") as "0",
  flattenAndSum($"arr1") as "1",
  flattenAndSum($"arr2") as "2",
  flattenAndSum($"arr3") as "3"
)

df.join(sums, "id")
Alternatively, if you can't make that assumption, you can create a user-defined aggregation function that performs the flattening and summing on the fly. This is safer and potentially faster, but requires more work:

// implement a UDAF:
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
class FlattenAndSum extends UserDefinedAggregateFunction {
  override def inputSchema: StructType = new StructType().add("arr", ArrayType(IntegerType))
  override def bufferSchema: StructType = new StructType().add("sum", IntegerType)
  override def dataType: DataType = IntegerType
  override def deterministic: Boolean = true

  // start each group's buffer at 0
  override def initialize(buffer: MutableAggregationBuffer): Unit = buffer.update(0, 0)

  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    // add the sum of the incoming array to the running total
    val current = buffer.getAs[Int](0)
    val toAdd = input.getAs[Seq[Int]](0).sum
    buffer.update(0, current + toAdd)
  }

  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    // combine partial totals from different partitions
    buffer1.update(0, buffer1.getAs[Int](0) + buffer2.getAs[Int](0))
  }

  override def evaluate(buffer: Row): Any = buffer.getAs[Int](0)
}

// use it in aggregation:
val flattenAndSum = new FlattenAndSum()

val sums = df.groupBy($"id").agg(
  flattenAndSum($"event.x0") as "0",
  flattenAndSum($"event.x1") as "1",
  flattenAndSum($"event.x2") as "2",
  flattenAndSum($"event.x3") as "3"
)

df.join(sums, "id")
