Spark UserDefinedAggregateFunction:scala.MatchError 0.0(属于java.lang.Double类)
我试图在Spark 2.0.2和Scala上使用Spark UserDefinedAggregateFunction:scala.MatchError 0.0(属于java.lang.Double类),scala,apache-spark,apache-spark-sql,user-defined-functions,Scala,Apache Spark,Apache Spark Sql,User Defined Functions,我试图在Spark 2.0.2和Scala上使用UserDefinedAggregateFunction,但遇到了匹配错误。我已经创建了下面的测试用例,我正在编写的代码与下面的代码类似 我试图通过聚合窗口累积一个值。这不仅仅是一个累积的总和,而是我需要根据一些条件计算出要保留的数字 作为一个测试案例,我创建了一个摊销表,在这里我必须计算每个月的期初和期末余额 数据如下所示: +------+--------+------------+---------+ |期间|资本|利息|还款| +-----
我试图在 Spark 2.0.2 和 Scala 上使用 UserDefinedAggregateFunction,但遇到了 scala.MatchError。我已经创建了下面的测试用例,我正在编写的代码与下面的代码类似。
我试图通过聚合窗口累积一个值。这不仅仅是一个累积的总和,而是我需要根据一些条件计算出要保留的数字
作为一个测试案例,我创建了一个摊销表,在这里我必须计算每个月的期初和期末余额
数据如下所示:
+------+--------+------------+---------+
|Period| Capital|InterestRate|Repayment|
+------+--------+------------+---------+
|201601| 0.00 | 0.10 | 0.00 |
|201602|1000.00 | 0.00 | 0.00 |
|201603|2000.00 | 0.10 | 0.00 |
|201604| 0.00 | 0.10 | -200.00 |
|201605| 0.00 | 0.10 | -200.00 |
|201606| 0.00 | 0.10 | -200.00 |
|201607| 0.00 | 0.10 | -200.00 |
|201608| 0.00 | 0.00 | -200.00 |
|201609| 0.00 | 0.10 | -200.00 |
|201610| 0.00 | 0.10 | -200.00 |
|201611| 0.00 | 0.10 | -200.00 |
|201612| 0.00 | 0.10 | -200.00 |
+------+--------+------------+---------+
我无法正确格式化 CSV,但我已将其添加到此处的 gist 中:
我试图计算期初(Opening)和期末(Closing)余额,然后从聚合中返回期末(Closing)余额。
Scala
package me.nevi
import org.apache.spark.sql._
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, Window}
import org.apache.spark.sql.types.{StructType, DoubleType, DataType}
object AggregationTest {
object amortisedClosingBalance extends UserDefinedAggregateFunction {
override def inputSchema: StructType = new StructType().add("Capital", DoubleType).add("InterestRate", DoubleType).add("Repayment", DoubleType)
override def bufferSchema: StructType = new StructType().add("Opening", DoubleType).add("Closing", DoubleType)
override def dataType: DataType = new StructType().add("Closing", DoubleType)
override def deterministic: Boolean = true
override def initialize(buffer: MutableAggregationBuffer): Unit = {
buffer.update(0, 0.0)
buffer.update(1, 0.0)
}
override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
println(buffer.get(0))
println(buffer.get(1))
buffer.update(0, buffer.getDouble(1))
// (opening + capital) * interestrate - repayment
buffer.update(1, (buffer.getDouble(0) + input.getDouble(0)) * input.getDouble(1) + input.getDouble(2))
} else {
// if first record?
buffer.update(0, input.getDouble(0))
buffer.update(1, input.getDouble(0))
}
}
override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
buffer1.update(0, buffer1.getDouble(0))
buffer1.update(1, buffer1.getDouble(1))
}
override def evaluate(buffer: Row): Any = {
buffer.getDouble(1)
}
}
def main(args: Array[String]): Unit = {
System.setProperty("hadoop.home.dir", "C:/spark")
System.setProperty("spark.sql.warehouse.dir", "file:///tmp/spark-warehouse")
val spark: SparkSession = SparkSession.builder()
.master("local[*]")
.appName("Aggregation Test")
.getOrCreate()
import spark.implicits._
val df = spark.read.option("header", true).csv("file:///d:/interest_calc.csv")
df.show()
val windowSpec = Window.orderBy(df.col("Period"))
val calc = df.withColumn("Closing", amortisedClosingBalance($"Capital", $"InterestRate", $"Repayment").over(windowSpec))
calc.show()
}
}
我得到了如下异常:
scala.MatchError: 0.0 (of class java.lang.Double)
at org.apache.spark.sql.catalyst.CatalystTypeConverters$StructConverter.toCatalystImpl(CatalystTypeConverters.scala:256)
at org.apache.spark.sql.catalyst.CatalystTypeConverters$StructConverter.toCatalystImpl(CatalystTypeConverters.scala:251)
at org.apache.spark.sql.catalyst.CatalystTypeConverters$CatalystTypeConverter.toCatalyst(CatalystTypeConverters.scala:103)
at org.apache.spark.sql.catalyst.CatalystTypeConverters$$anonfun$createToCatalystConverter$2.apply(CatalystTypeConverters.scala:403)
at org.apache.spark.sql.execution.aggregate.ScalaUDAF.eval(udaf.scala:440)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificMutableProjection.apply(Unknown Source)
at org.apache.spark.sql.execution.AggregateProcessor.evaluate(WindowExec.scala:1029)
at org.apache.spark.sql.execution.UnboundedPrecedingWindowFunctionFrame.write(WindowExec.scala:822)
at org.apache.spark.sql.execution.WindowExec$$anonfun$15$$anon$1.next(WindowExec.scala:398)
at org.apache.spark.sql.execution.WindowExec$$anonfun$15$$anon$1.next(WindowExec.scala:289)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at ...
package me.nevi
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, Window}
import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
/**
 * Example program: computes a running amortisation balance (opening and
 * closing balance per period) with a Spark `UserDefinedAggregateFunction`
 * evaluated over a window ordered by `Period`.
 */
object AggregationTest {

  /**
   * Running amortisation balance.
   *
   * Inputs (all `DoubleType`): `Capital`, `InterestRate`, `Repayment`.
   * Result: a struct `(Opening, Closing)` for the current row.
   *
   * For every row the previous closing balance becomes the new opening
   * balance, and the new closing balance is
   * `capital + opening + repayment + (opening + capital) * (interestRate / 12)`.
   * Because `Repayment` is added, repayments are expected to be supplied as
   * negative amounts.
   *
   * The result depends on the order in which rows are fed to `update`, so the
   * function is only meaningful over an ordered window.
   */
  object amortisedClosingBalance extends UserDefinedAggregateFunction {

    private val OpeningIdx = 0
    private val ClosingIdx = 1
    // The annual rate is divided by this to obtain the per-period rate.
    private val PeriodsPerYear = 12.0

    // Shared by bufferSchema and dataType: `evaluate` returns a Row with
    // exactly this shape, which is what Spark requires for a struct result.
    private val balanceSchema: StructType =
      new StructType().add("Opening", DoubleType).add("Closing", DoubleType)

    override def inputSchema: StructType =
      new StructType()
        .add("Capital", DoubleType)
        .add("InterestRate", DoubleType)
        .add("Repayment", DoubleType)

    override def bufferSchema: StructType = balanceSchema

    override def dataType: DataType = balanceSchema

    override def deterministic: Boolean = true

    /** Starts with opening and closing balances of 0.0. */
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer.update(OpeningIdx, 0.0)
      buffer.update(ClosingIdx, 0.0)
    }

    /**
     * Rolls the balance forward by one input row.
     *
     * Null inputs are treated as 0.0. Previously a null `Capital` entered a
     * branch that called `getDouble` on the null column, and a null
     * `InterestRate` or `Repayment` was read unguarded; both cases threw a
     * NullPointerException.
     */
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      val capital = doubleOrZero(input, 0)
      val annualRate = doubleOrZero(input, 1)
      val repayment = doubleOrZero(input, 2)

      // The previous row's closing balance is this row's opening balance.
      val opening = buffer.getDouble(ClosingIdx)
      buffer.update(OpeningIdx, opening)
      buffer.update(
        ClosingIdx,
        capital + opening + repayment + (opening + capital) * (annualRate / PeriodsPerYear)
      )
    }

    /**
     * Intentionally leaves `buffer1` unchanged and ignores `buffer2`.
     *
     * NOTE(review): the (opening, closing) buffer does not carry enough
     * information to combine two partial results of this order-dependent
     * recurrence, so this function must not be used where Spark merges partial
     * aggregates (e.g. a `groupBy` aggregation) - the merged result would be
     * wrong. Window evaluation as used in `main` is presumed to go through
     * `update` only; confirm before reusing this elsewhere.
     */
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = ()

    /** Returns the current balances as a Row matching `dataType`. */
    override def evaluate(buffer: Row): Any =
      Row(buffer.getDouble(OpeningIdx), buffer.getDouble(ClosingIdx))

    /** Reads column `i` as a Double, mapping SQL NULL to 0.0. */
    private def doubleOrZero(row: Row, i: Int): Double =
      if (row.isNullAt(i)) 0.0 else row.getDouble(i)
  }

  /**
   * Reads the amortisation CSV, computes opening/closing balances per period
   * and prints the input and the result.
   */
  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "C:/spark")
    System.setProperty("spark.sql.warehouse.dir", "file:///tmp/spark-warehouse")

    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("Aggregation Test")
      .getOrCreate()

    try {
      import spark.implicits._

      // No schema is supplied, so the CSV columns are read as strings; the
      // UDAF's declared DoubleType inputs rely on Spark casting them.
      val df = spark.read.option("header", true).csv("file:///d:/interest_calc.csv")
      df.show()

      // No partitionBy: the whole data set is processed as a single ordered window.
      val windowSpec = Window.orderBy(df.col("Period").asc)
      val balances =
        amortisedClosingBalance($"Capital", $"InterestRate", $"Repayment").over(windowSpec)

      val calc = df
        .withColumn("Calcs", balances)
        .withColumn("Opening", round($"Calcs".getField("Opening"), 2))
        .withColumn("Closing", round($"Calcs".getField("Closing"), 2))
        .drop("Calcs")
      calc.show()
    } finally {
      // Release the local Spark context even if the job fails.
      spark.stop()
    }
  }
}
StructType(StructField(Closing,DoubleType,true))
// Alternative declaration: the aggregate's result type is a scalar Double
// instead of a struct.
// NOTE(review): with this declaration `evaluate` presumably has to return a
// plain Double rather than a Row - confirm against the `evaluate` in use.
override def dataType: DataType = DoubleType
/**
 * Produces the aggregate's result as a single-field Row holding the Double
 * stored at buffer index 1.
 */
override def evaluate(buffer: Row): Any =
  Row(buffer.getDouble(1))