Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/scala/16.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Scala 在Spark中按交替顺序分组_Scala_Dataframe_Apache Spark_Group By - Fatal编程技术网

Scala 在Spark中按交替顺序分组

Scala 在Spark中按交替顺序分组,scala,dataframe,apache-spark,group-by,Scala,Dataframe,Apache Spark,Group By,我有一组数据,可以识别交替序列。但是,我想将这些数据分组到一个块中,而保留所有其他数据不变。也就是说,在id中出现任何闪烁的地方,我想用一个id覆盖该id组,该id根据订单生成。作为一个小例子,请考虑 val dataDF = Seq( ("a", "silom", 3, 1), ("a", "silom", 2, 2), ("a", "silom", 1, 3), ("a", "silom", 0, 4), // flickering; id=0 ("

我有一组数据,可以识别交替序列。但是,我想将这些数据分组到一个块中,而保留所有其他数据不变。也就是说,在id中出现任何闪烁的地方,我想用一个id覆盖该id组,该id根据订单生成。作为一个小例子,请考虑

// Sample input: (user, cat, id, time_sec) events.
// The rows marked "flickering" form alternating-id runs (0,1,0 and 4,3,4,3,4)
// that the question wants collapsed into one row per run, summing time_sec.
// Requires a Spark session's implicits in scope for `.toDF`.
val dataDF = Seq(
    ("a", "silom", 3, 1),
    ("a", "silom", 2, 2),
    ("a", "silom", 1, 3),
    ("a", "silom", 0, 4),  // flickering; id=0
    ("a", "silom", 1, 5),  // flickering; id=0
    ("a", "silom", 0, 6),  // flickering; id=0
    ("a", "silom", 1, 7),
    ("a", "silom", 2, 8),
    ("a", "silom", 3, 9),
    ("a", "silom", 4, 10),
    ("a", "silom", 3, 11),  // flickering and so on
    ("a", "silom", 4, 12),
    ("a", "silom", 3, 13),
    ("a", "silom", 4, 14),
    ("a", "silom", 5, 15)
).toDF("user", "cat", "id", "time_sec")

// Expected output: each flickering run collapses into a single row whose
// time_sec is the sum over the run; all other rows are unchanged.
// Fix: the original had a stray '.' after the second-to-last tuple
// (`("a", "silom", 5, 15).`), which is a Scala syntax error.
val resultDataDF = Seq(
    ("a", "silom", 3, 1),
    ("a", "silom", 2, 2),
    ("a", "silom", 1, 3),
    ("a", "silom", 0, 15),  // grouped by flickering summing on time_sec (4+5+6)
    ("a", "silom", 1, 7),
    ("a", "silom", 2, 8),
    ("a", "silom", 3, 9),
    ("a", "silom", 4, 60),  // grouped by flickering summing on time_sec (10+...+14)
    ("a", "silom", 5, 15)
).toDF("user", "cat", "id", "time_sec")
现在是更现实的MWE。在这种情况下,我们可以有多个用户和cat;不幸的是,这种方法不使用DataFrameAPI,需要向驱动程序收集数据。这是不可伸缩的,需要通过删除返回数组索引的长度来递归调用
getGrps

我如何使用dataframe API实现这一点,从而不需要将数据收集到驱动程序中,因为数据量太大,这是不可能的?还有,如果有更好的方法,那会是什么


第二种方法是使用
collect_list
,但这依赖于
getGrps
递归工作,我无法正常工作。这是我到目前为止为
collect_list
减去递归而修改的
getGrps
代码

// Build per-(user, cat) arrays of [id, rn, sequencing_diff] for the UDF:
//  * for every name in `category` (defined elsewhere in the file), add a
//    lag-1 id column over the (user, cat) window ordered by time_sec;
//  * sequencing_diff = previous id - current id, taken from the lag column
//    that matches the row's cat ("silom" vs the other, "suk");
//  * rn = row number in the window, cast to Double so it can sit in the
//    same array as the other Double values;
//  * collect_list gathers every row's [id, rn, sequencing_diff] triple
//    into one array_data column per (user, cat) group.
// NOTE(review): assumes `category` contains exactly "silom" and "suk" so
// both lag columns exist before the drop — confirm against the caller.
val data = recastDataDF
    .select($"*" +: category.map(
        name => 
        lag("id", 1).over(
            Window.partitionBy("user", "cat").orderBy("time_sec")
        )
        .alias(s"lag_${name}_id")): _*)
    .withColumn("sequencing_diff", when($"cat" === "silom", ($"lag_silom_id" - $"id").cast(DoubleType))
                .otherwise(($"lag_suk_id" - $"id")))
    .drop("lag_silom_id", "lag_suk_id")
    .withColumn("rn", row_number.over(Window.partitionBy("user", "cat").orderBy("time_sec")).cast(DoubleType))
    .withColumn("id_rn", array($"id", $"rn", $"sequencing_diff"))
    .groupBy($"user", $"cat").agg(collect_list($"id_rn").alias("array_data"))

// collect one row to develop how the UDF would work
// collect one row to develop how the UDF would work
// (driver-side prototype of the UDF body; not scalable, by design)
val testList = data.where($"user" === "a" && $"cat" === "silom").select("array_data").collect
    .map(x => x(0).asInstanceOf[WrappedArray[WrappedArray[Any]]])
    .map(x => x.toArray)
    .head
    .map(x => (x(0).toString.toDouble, x(1).toString.toDouble, x(2).asInstanceOf[Double]))

// this code would be in the UDF; that is, we would pass array_data to the UDF
// sort in place by rn (the second tuple component)
scala.util.Sorting.stableSort(testList, (e1: (Double, Double, Double), e2: (Double, Double, Double)) => e1._2 < e2._2)

// pair each element with its successor; the last pair is padded with NaN sentinels
val shifted: Array[(Double, Double, Double)] = testList.drop(1)
val combined = testList
    .zipAll(shifted, (Double.NaN, Double.NaN, Double.NaN), (Double.NaN, Double.NaN, Double.NaN))

// keep (rn, id) where the diff changes relative to the next row; others become NaN
val testArr = combined.map{
    case (data0, data1) =>
    if(data0._3 != data1._3 && data0._2 > 1) {
        (data0._2, data0._1)
    }
    else (Double.NaN, Double.NaN)
    }
    // NaN != NaN drops the sentinel pairs. Fix: the original checked
    // `t._1 == t._1` twice — the second conjunct was clearly meant to be t._2.
    .filter(t => t._1 == t._1 && t._2 == t._2)

// called inside the UDF
// Non-recursive variant: extracts the single consecutive run of row numbers
// that starts at the head of `arr`, paired with the head's id.
// Empty input yields NaN sentinels.
def getGrps2(arr: Array[(Double, Double)]): (Array[Double], Double) = {
    // no need for user or cat
    arr.headOption match {
        case Some((startRow, keepId)) =>
            // Every later row number sitting exactly idx+1 steps past the head
            // continues the consecutive run and is kept.
            val continuation = arr.drop(1).zipWithIndex.collect {
                case ((row, _), idx) if row == startRow + idx + 1 => startRow + idx + 1
            }
            (Array(startRow) ++ continuation, keepId)
        case None =>
            (Array(Double.NaN), Double.NaN)
    }
}
val data=recastDataDF
。选择($“*”+:category.map(
名称=>
滞后(“id”,1)。结束(
窗口。分区依据(“用户”、“类别”)。订购依据(“时间秒”)
)
.alias(s“lag{name}\u id”):)
.withColumn(“排序差异”,当($“cat”===“silom”,($“lag\u silom\u id”-$“id”).cast(双类型))
。否则($“lag_suk_id”-$“id”))
.drop(“lag\u silom\u id”、“lag\u suk\u id”)
.withColumn(“rn”,行号。超过(窗口。分区(“用户”,“类别”)。订购人(“时间秒”))。强制转换(双重类型))
.withColumn(“id_rn”,数组($“id”,“$”rn“,$“sequencing_diff”))
.groupBy($“user”,“$“cat”).agg(收集列表($“id\u rn”).alias(“数组数据”))
//收集一行数据,以确定UDF的工作方式
val testList=数据。其中($“用户”===“a”&&&$“cat”====“silom”)。选择(“阵列\U数据”)。收集
.map(x=>x(0).a安装[WrappedArray[WrappedArray[Any]])
.map(x=>x.toArray)
.头
.map(x=>(x(0).toString.toDouble,x(1).toString.toDouble,x(2).asInstanceOf[Double]))
//此代码将位于UDF中;也就是说,我们将把数组_数据传递给UDF
scala.util.Sorting.stableSort(testList,(e1:(Double,Double,Double),e2:(Double,Double,Double))=>e1.\u2
如果(数据0.\u 3!=data1.\u 3&&data0.\u 2>1){
(数据0.\u 2,数据0.\u 1)
}
else(Double.NaN,Double.NaN)
}
.filter(t=>t.\u 1==t.\u 1&&t.\u 1==t.\u 1)
//在UDF内部调用
def getGrps2(arr:Array[(Double,Double)]:(Array[Double],Double)={
//不需要用户或cat
if(arr.nonEmpty){
val rowNum=arr.take(1)(0)。\u 1
val keepID=arr.take(1)(0)。\u 2
val newArr=arr.drop(1)
val rowNums=(数组(rowNum))++newArr.zipWithIndex.map{
案例(tups,idx)=>
if(rowNum+idx+1==tups.\u 1){
rowNum+1+idx
}
我要双份的
}
.filter(v=>v==v)
(鲁努姆斯,基皮德)
}
else(数组(Double.NaN),Double.NaN)
}
我们将
.withColumn("data_to_update", udf)
data_to_update
列将是一个
WrappedArray[Tuple2[Array[Double],Double]]
,行号将被id覆盖。用户
a
,cat
silom
的结果将是

WrappedArray((Array(4.0, 5.0, 6.0), 0.0), (Array(10.0, 11.0, 12.0, 13.0), 4.0))


数组块是行号,Double是用

更新这些行的id。在对
数组_数据
列进行操作的UDF中应用以下递归方法将创建所需的结果

/**
 * Splits `arr` — (rowNumber, keepId) pairs ordered by row number — into
 * maximal runs of consecutive row numbers, pairing each run with the id of
 * its first element.
 *
 * Fixes over the original:
 *  - uses `takeWhile` instead of map-to-NaN + filter: the old version could
 *    also match a non-consecutive element further down the array and then
 *    drop the wrong number of leading elements;
 *  - starts from an empty accumulator instead of seeding a NaN dummy entry
 *    and dropping it afterwards;
 *  - annotates the helper @tailrec (it already was tail-recursive).
 *
 * @param arr (row number, id) pairs sorted by row number
 * @return one (rowNumbers, keepId) entry per consecutive run; empty for empty input
 */
def getGrps(arr: Array[(Double, Double)]): Array[(Array[Double], Double)] = {

    @scala.annotation.tailrec
    def returnAlternatingIDs(arr: Array[(Double, Double)], 
                             altIDs: Array[(Array[Double], Double)]): Array[(Array[Double], Double)] = arr match {

        case arr if arr.nonEmpty =>
            val (rowNum, keepID) = arr(0)

            // Maximal prefix whose row numbers continue rowNum+1, rowNum+2, ...
            val runTail = arr.drop(1).zipWithIndex
                .takeWhile { case ((row, _), idx) => row == rowNum + idx + 1 }
                .map { case ((row, _), _) => row }

            val rowNums = Array(rowNum) ++ runTail
            returnAlternatingIDs(arr.drop(rowNums.length), altIDs ++ Array((rowNums, keepID)))

        case _ => altIDs
    }

    returnAlternatingIDs(arr, Array.empty[(Array[Double], Double)])
}
第一个
collect_list 的返回值是
Array((Array(5.0,6.0,7.0),0.0),(Array(11.0,12.0,13.0,14.0),4.0))

完全自定义项

// UDF over array_data ([id, rn, sequencing_diff] per row): returns, per
// (user, cat) group, the runs of row numbers caught in a flickering id
// sequence, each paired with the id that should overwrite them.
val identifyFlickeringIDs: UserDefinedFunction = udf {
    (colArrayData: WrappedArray[WrappedArray[Double]]) =>
    // Unpack each inner array into an (id, rowNumber, seqDiff) triple.
    val newArray: Array[(Double, Double, Double)] = colArrayData.toArray
        .map(x => (x(0).toDouble, x(1).toDouble, x(2).toDouble))

    // sort array by rn via less than relation
    stableSort(newArray, (e1: (Double, Double, Double), e2: (Double, Double, Double)) => e1._2 < e2._2)

    // Pair each triple with its successor; the final pair is padded with NaNs.
    val shifted: Array[(Double, Double, Double)] = newArray.toArray.drop(1)
    val combined = newArray
        .zipAll(shifted, (Double.NaN, Double.NaN, Double.NaN), (Double.NaN, Double.NaN, Double.NaN))

    // Keep (rowNumber, id) where the diff flips sign against the next row
    // (diff + nextDiff == 0) past the first row — the start of a flicker.
    // Non-matching positions become NaN pairs and are filtered out below
    // (NaN != NaN; the repeated `t._1 == t._1` conjunct looks like a typo
    // for `t._2 == t._2`, but both components are NaN together so it is
    // harmless here).
    val parsedArray = combined.map{
        case (data0, data1) =>
        if(data0._3 != data1._3 && data0._2 > 1 && data0._3 + data1._3 == 0) {
            (data0._2, data0._1)
        }
        else (Double.NaN, Double.NaN)
        }
        .filter(t => t._1 == t._1 && t._1 == t._1)

    // Only runs longer than one row count as flickering.
    getGrps(parsedArray).filter(data => data._1.length > 1)
}
val identificationflickingids:UserDefinedFunction=udf{
(colArrayData:WrappedArray[WrappedArray[Double]])=>
val newArray:Array[(Double,Double,Double)]=colArrayData.toArray
.map(x=>(x(0.toDouble,x(1.toDouble,x(2.toDouble))
//通过小于关系按rn排序数组
stableSort(新数组,(e1:(双,双,双),e2:(双,双,双))=>e1.\u2
如果(data0.\u 3!=data1.\u 3&&data0.\u 2>1&&data0.\u 3+data1.\u 3==0){
(数据0.\u 2,数据0.\u 1)
}
else(Double.NaN,Double.NaN)
}
.filter(t=>t.\u 1==t.\u 1&&t.\u 1==t.\u 1)
getGrps(parsedArray).filter(数据=>data.\u 1.length>1)
}
// UDF over array_data ([id, rn, sequencing_diff] per row): yields the runs of
// row numbers inside a flickering id sequence, paired with the overwriting id.
val identifyFlickeringIDs: UserDefinedFunction = udf {
    (colArrayData: WrappedArray[WrappedArray[Double]]) =>
    // Unpack the nested arrays into (id, rowNumber, seqDiff) triples.
    val triples: Array[(Double, Double, Double)] = colArrayData.toArray
        .map(row => (row(0).toDouble, row(1).toDouble, row(2).toDouble))

    // In-place stable sort by row number (second component).
    stableSort(triples, (a: (Double, Double, Double), b: (Double, Double, Double)) => a._2 < b._2)

    // Pair every triple with its successor; pad the final pair with NaN sentinels.
    val sentinel = (Double.NaN, Double.NaN, Double.NaN)
    val pairs = triples.zipAll(triples.drop(1), sentinel, sentinel)

    // A flicker start: the diff differs from the next diff, sums with it to
    // zero (sign flip), and sits past the first row. collect keeps only the
    // matching (rowNumber, id) pairs, so no NaN placeholders are produced.
    val flickerStarts = pairs.collect {
        case (cur, nxt) if cur._3 != nxt._3 && cur._2 > 1 && cur._3 + nxt._3 == 0 =>
            (cur._2, cur._1)
    }

    // Runs of length one are not flickering; keep only longer runs.
    getGrps(flickerStarts).filter(_._1.length > 1)
}