Warning: file_get_contents(/data/phpspider/zhask/data//catemap/3/apache-spark/5.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Spark UserDefinedAggregateFunction:scala.MatchError 0.0(属于java.lang.Double类)_Scala_Apache Spark_Apache Spark Sql_User Defined Functions - Fatal编程技术网

Spark UserDefinedAggregateFunction:scala.MatchError 0.0(属于java.lang.Double类)

Spark UserDefinedAggregateFunction:scala.MatchError 0.0(属于java.lang.Double类),scala,apache-spark,apache-spark-sql,user-defined-functions,Scala,Apache Spark,Apache Spark Sql,User Defined Functions,我试图在Spark 2.0.2和Scala上使用UserDefinedAggregateFunction,但遇到了匹配错误。我已经创建了下面的测试用例,我正在编写的代码与下面的代码类似 我试图通过聚合窗口累积一个值。这不仅仅是一个累积的总和,而是我需要根据一些条件计算出要保留的数字 作为一个测试案例,我创建了一个摊销表,在这里我必须计算每个月的期初和期末余额 数据如下所示: +------+--------+------------+---------+ |期间|资本|利息|还款| +-----

我试图在Spark 2.0.2和Scala上使用
UserDefinedAggregateFunction
,但遇到了匹配错误。我已经创建了下面的测试用例,我正在编写的代码与下面的代码类似

我试图通过聚合窗口累积一个值。这不仅仅是一个累积的总和,而是我需要根据一些条件计算出要保留的数字

作为一个测试案例,我创建了一个摊销表,在这里我必须计算每个月的期初和期末余额

数据如下所示:

+------+--------+------------+---------+
|Period| Capital|InterestRate|Repayment|
+------+--------+------------+---------+
|201601|   0.00 |       0.10 |    0.00 |
|201602|1000.00 |       0.00 |    0.00 |
|201603|2000.00 |       0.10 |    0.00 |
|201604|   0.00 |       0.10 | -200.00 |
|201605|   0.00 |       0.10 | -200.00 |
|201606|   0.00 |       0.10 | -200.00 |
|201607|   0.00 |       0.10 | -200.00 |
|201608|   0.00 |       0.00 | -200.00 |
|201609|   0.00 |       0.10 | -200.00 |
|201610|   0.00 |       0.10 | -200.00 |
|201611|   0.00 |       0.10 | -200.00 |
|201612|   0.00 |       0.10 | -200.00 |
+------+--------+------------+---------+
我无法在这里正确排版 CSV,因此已将其上传为一个 Gist(原文中的链接在此处缺失)。

我试图计算 `Opening`(期初)和 `Closing`(期末)余额,然后从聚合中返回 `Closing`(期末)余额。

Scala
package me.nevi

import org.apache.spark.sql._
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, Window}
import org.apache.spark.sql.types.{StructType, DoubleType, DataType}

object AggregationTest {

  object amortisedClosingBalance extends UserDefinedAggregateFunction {
    override def inputSchema: StructType = new StructType().add("Capital", DoubleType).add("InterestRate", DoubleType).add("Repayment", DoubleType)

    override def bufferSchema: StructType = new StructType().add("Opening", DoubleType).add("Closing", DoubleType)

    override def dataType: DataType = new StructType().add("Closing", DoubleType)

    override def deterministic: Boolean = true

    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer.update(0, 0.0)
      buffer.update(1, 0.0)
    }

    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      if (!input.isNullAt(0)) {
        println(buffer.get(0))
        println(buffer.get(1))
        buffer.update(0, buffer.getDouble(1))
        // (opening + capital) * interestrate - repayment
        buffer.update(1, (buffer.getDouble(0) + input.getDouble(0)) * input.getDouble(1) + input.getDouble(2))
      } else {
        // if first record?
        buffer.update(0, input.getDouble(0))
        buffer.update(1, input.getDouble(0))
      }
    }

    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1.update(0, buffer1.getDouble(0))
      buffer1.update(1, buffer1.getDouble(1))
    }

    override def evaluate(buffer: Row): Any = {
      buffer.getDouble(1)
    }
  }

  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "C:/spark")
    System.setProperty("spark.sql.warehouse.dir", "file:///tmp/spark-warehouse")

    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("Aggregation Test")
      .getOrCreate()

    import spark.implicits._

    val df = spark.read.option("header", true).csv("file:///d:/interest_calc.csv")

    df.show()

    val windowSpec = Window.orderBy(df.col("Period"))

    val calc = df.withColumn("Closing", amortisedClosingBalance($"Capital", $"InterestRate", $"Repayment").over(windowSpec))

    calc.show()
  }
}
我得到如下异常:

scala.MatchError: 0.0 (of class java.lang.Double)
    at org.apache.spark.sql.catalyst.CatalystTypeConverters$StructConverter.toCatalystImpl(CatalystTypeConverters.scala:256)
    at org.apache.spark.sql.catalyst.CatalystTypeConverters$StructConverter.toCatalystImpl(CatalystTypeConverters.scala:251)
    at org.apache.spark.sql.catalyst.CatalystTypeConverters$CatalystTypeConverter.toCatalyst(CatalystTypeConverters.scala:103)
    at org.apache.spark.sql.catalyst.CatalystTypeConverters$$anonfun$createToCatalystConverter$2.apply(CatalystTypeConverters.scala:403)
    at org.apache.spark.sql.execution.aggregate.ScalaUDAF.eval(udaf.scala:440)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificMutableProjection.apply(Unknown Source)
    at org.apache.spark.sql.execution.AggregateProcessor.evaluate(WindowExec.scala:1029)
    at org.apache.spark.sql.execution.UnboundedPrecedingWindowFunctionFrame.write(WindowExec.scala:822)
    at org.apache.spark.sql.execution.WindowExec$$anonfun$15$$anon$1.next(WindowExec.scala:398)
    at org.apache.spark.sql.execution.WindowExec$$anonfun$15$$anon$1.next(WindowExec.scala:289)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
    ...(其余堆栈在原文中被截断)
package me.nevi

import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, Window}
import org.apache.spark.sql.types.{DataType, DoubleType, StructType}

/**
 * Small driver that reads an amortisation CSV and adds rounded `Opening` and
 * `Closing` balance columns computed by a running (window-ordered) aggregate.
 */
object AggregationTest {

  /**
   * Running balance aggregate over the inputs (Capital, InterestRate, Repayment).
   *
   * The buffer keeps two doubles: slot 0 is the opening balance of the row just
   * processed and slot 1 is its closing balance. Each row's opening balance is the
   * previous row's closing balance, so the result depends on row order and is only
   * meaningful when applied over an ordered window, as `main` does.
   *
   * The result is a struct with `Opening` and `Closing` fields; `evaluate` returns a
   * `Row` with the same two fields so the value matches the declared `dataType`.
   */
  object amortisedClosingBalance extends UserDefinedAggregateFunction {

    // Divisor applied to InterestRate before charging interest for one row.
    // NOTE(review): presumably the rate is annual and each row is one month
    // (the sample periods look like yyyyMM) -- confirm against the data source.
    private val PeriodsPerYear = 12

    override def inputSchema: StructType = new StructType()
      .add("Capital", DoubleType)
      .add("InterestRate", DoubleType)
      .add("Repayment", DoubleType)

    override def bufferSchema: StructType = new StructType()
      .add("Opening", DoubleType)
      .add("Closing", DoubleType)

    override def dataType: DataType = new StructType()
      .add("Opening", DoubleType)
      .add("Closing", DoubleType)

    override def deterministic: Boolean = true

    /** Starts with an opening and closing balance of zero. */
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer.update(0, 0.0)
      buffer.update(1, 0.0)
    }

    /**
     * Reads a double input column, treating a null cell as 0.0.
     *
     * `Row.getDouble` throws on a null slot, so every input is read through this
     * helper instead of being accessed directly.
     */
    private def doubleOrZero(row: Row, index: Int): Double =
      if (row.isNullAt(index)) 0.0 else row.getDouble(index)

    /**
     * Rolls the balance forward by one row:
     *
     *   opening = previous closing
     *   closing = (opening + capital) + repayment
     *             + (opening + capital) * (interestRate / PeriodsPerYear)
     *
     * Repayment is added as-is (the sample data stores repayments as negative numbers).
     * Null inputs count as 0.0; previously a null Capital fell into a branch that
     * called `getDouble` on the null slot and failed the job.
     */
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      val capital = doubleOrZero(input, 0)
      val interestRate = doubleOrZero(input, 1)
      val repayment = doubleOrZero(input, 2)

      val opening = buffer.getDouble(1)
      val principal = opening + capital

      buffer.update(0, opening)
      buffer.update(1, principal + repayment + principal * (interestRate / PeriodsPerYear))
    }

    /**
     * Intentionally leaves `buffer1` unchanged and ignores `buffer2`.
     *
     * The balance is an order-dependent recurrence, so two partial buffers cannot be
     * combined into a meaningful result. This matches the previous behaviour, which
     * only rewrote `buffer1` with its own values.
     * NOTE(review): using this aggregate outside an ordered window (e.g. in a
     * groupBy with partial aggregation) would therefore give wrong balances.
     */
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = ()

    /** Returns the current (Opening, Closing) pair as a struct-shaped `Row`. */
    override def evaluate(buffer: Row): Any =
      Row(buffer.getDouble(0), buffer.getDouble(1))
  }

  /**
   * Loads `interest_calc.csv`, shows it, then shows it again with `Opening` and
   * `Closing` columns (rounded to 2 decimals) computed in `Period` order.
   */
  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "C:/spark")
    System.setProperty("spark.sql.warehouse.dir", "file:///tmp/spark-warehouse")

    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("Aggregation Test")
      .getOrCreate()

    try {
      import spark.implicits._

      val df = spark.read.option("header", true).csv("file:///d:/interest_calc.csv")

      df.show()

      val windowSpec = Window.orderBy(df.col("Period").asc)

      val balances = amortisedClosingBalance($"Capital", $"InterestRate", $"Repayment").over(windowSpec)

      // Compute the struct once, split it into two rounded columns, then drop the struct.
      val calc = df
        .withColumn("Calcs", balances)
        .withColumn("Opening", round($"Calcs".getField("Opening"), 2))
        .withColumn("Closing", round($"Calcs".getField("Closing"), 2))
        .drop("Calcs")

      calc.show()
    } finally {
      // Release the local Spark context even if reading or showing the data fails.
      spark.stop()
    }
  }
}
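出现该异常是因为 `dataType` 被声明为 `StructType(StructField(Closing,DoubleType,true))`,而 `evaluate` 实际返回的是一个 `Double`。可以把 `dataType` 改为 `DoubleType`,或者让 `evaluate` 返回与声明的结构类型一致的 `Row`: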
// Declares the aggregate's result type as a plain Double rather than a single-field
// struct, so that an evaluate returning a bare Double (e.g. buffer.getDouble(1))
// agrees with the declared type.
override def dataType: DataType = DoubleType
/**
 * Returns the closing balance (buffer slot 1) wrapped in a one-field `Row`,
 * i.e. a value shaped like a single-field struct rather than a bare Double.
 */
override def evaluate(buffer: Row): Any = Row(buffer.getDouble(1))