
Binding attribute, tree: dayofmonth(cast(timestamp#122 as date)) in Scala
Tags: scala, apache-spark, dataframe, group-by, aggregate


I have a DataFrame df = [id: String, value: Int, type: String, timestamp: java.sql.Date], and I need the result described below.

My DataFrame:

+----+------+-------+----------------------+
| id | type | value | timestamp            |
+----+------+-------+----------------------+
|  1 | rent |    12 | 2016-09-19T00:00:00Z |
|  1 | rent |    12 | 2016-09-19T00:00:00Z |
|  1 | buy  |    12 | 2016-09-20T00:00:00Z |
|  1 | rent |    12 | 2016-09-20T00:00:00Z |
|  1 | buy  |    12 | 2016-09-18T00:00:00Z |
|  1 | buy  |    12 | 2016-09-18T00:00:00Z |
+----+------+-------+----------------------+
I need this result:

id : 1
totalValue : 72
typeForDay : {"rent": 2, "buy": 2}   --- grouped by id and dayofmonth(col("timestamp")), at most one type per day
I tried:

val ddf = df
  .groupBy("id", )
  .agg(collect_list("type"),
    sum("value") as "totalValue")

val count_by_value = udf { (gti: scala.collection.mutable.WrappedArray[String]) =>
  if (gti == null) null else gti.groupBy(identity).mapValues(_.size)
}

val result = ddf
  .withColumn("totalValue", count_by_value($"collect_list(type)"))
  .drop("collect_list(type)")
This gives me the error:

org.apache.spark.SparkException: Job aborted due to stage failure: Task 115 in stage 15.0 failed 4 times, most recent failure: Lost task 115.3 in stage 15.0 (TID 1357, ip-172-31-9-47.ec2.internal): org.apache.spark.sql.catalyst.errors.package$TreeNodeException: Binding attribute, tree: dayofmonth(cast(timestamp#122 as date))#137 
  at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49) 
  at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:86) 
  at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:85) 
  at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:243) 
  at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:243) 
  at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:53) 
  at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:242) 
  at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:233) 
  at org.apache.spark.sql.catalyst.expressions.BindReferences$.bindReference(BoundAttribute.scala:85)
  at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection$$anonfun$$init$$2.apply(Projection.scala:62)
  at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection$$anonfun$$init$$2.apply(Projection.scala:62)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
  at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
  at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
  at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
  at scala.collection.AbstractTraversable.map(Traversable.scala:105)
  at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.<init>(Projection.scala:62)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$newMutableProjection$1.apply(SparkPlan.scala:234)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$newMutableProjection$1.apply(SparkPlan.scala:234)
  at org.apache.spark.sql.execution.Exchange.org$apache$spark$sql$execution$Exchange$$getPartitionKeyExtractor$1(Exchange.scala:197)
  at org.apache.spark.sql.execution.Exchange$$anonfun$3.apply(Exchange.scala:209)
  at org.apache.spark.sql.execution.Exchange$$anonfun$3.apply(Exchange.scala:208)
  at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21.apply(RDD.scala:728)
  at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21.apply(RDD.scala:728)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
  at org.apache.spark.scheduler.Task.run(Task.scala:89)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
  at java.lang.Thread.run(Thread.java:745) Caused by: java.lang.RuntimeException: Couldn't find dayofmonth(cast(timestamp#122 as date))#137 in [customerId#81,timestamp#122,benefit#111]
  at scala.sys.package$.error(package.scala:27)
  at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:92)
  at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:86)
  at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48) ... 34 more 
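A workaround that is often suggested for this kind of "Binding attribute" error in older Spark versions (an assumption on my part, not confirmed by the answer below) is to materialize the derived expression as a named column before grouping, so the grouping key exists as a real attribute in the child plan. A minimal sketch, assuming the df from the question; the column name day is illustrative:

import org.apache.spark.sql.functions._

// Materialize the day-of-month as a real column first, then group by column names;
// this avoids referencing a derived expression that the plan may fail to bind later.
val withDay = df.withColumn("day", dayofmonth(col("timestamp")))

val ddf = withDay
  .groupBy("id", "day")
  .agg(collect_list("type") as "types", sum("value") as "totalValue")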
Running your code (after a couple of fixes to make it compile…) does not produce the exception you describe in my environment (using Spark 1.6.2), but it also does not produce the desired result: you can't compute the days per type on ddf, because ddf has already been grouped by id only and the timestamp data is lost.

Below is an alternative implementation, using a UDAF (User Defined Aggregation Function) to "merge" the values of a MapType column into a single map:

import org.apache.spark.sql.functions._

val toMap = udf { (typ: String, count: Int) => Map(typ -> count) }

val result = df
  // First: group by id AND type, count distinct days and sum value:
  .groupBy("id", "type").agg(countDistinct(dayofmonth(col("timestamp"))) as "daysPerType", sum("value") as "valPerType")
  // Then: convert type and count into a single Map column
  .withColumn("typeForDay", toMap(col("type"), col("daysPerType")))
  // Lastly: use a custom aggregation function to "merge" the maps (assuming keys are unique to begin with!)
  .groupBy("id").agg(sum("valPerType") as "totalValue", CombineMaps(col("typeForDay")) as "typeForDay")

result.show() 
// prints:
// +---+----------+------------------------+
// | id|totalValue|              typeForDay|
// +---+----------+------------------------+
// |  1|        72|Map(buy -> 2, rent -> 2)|
// +---+----------+------------------------+
And the implementation of CombineMaps:

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

object CombineMaps extends UserDefinedAggregateFunction {
  override def inputSchema: StructType = new StructType().add("map", dataType)
  override def bufferSchema: StructType = inputSchema
  override def dataType: DataType = MapType(StringType, IntegerType)
  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit = buffer.update(0 , Map[String, Int]())

  // naive implementation - assuming keys won't repeat, otherwise later value for key overrides earlier one
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    val before = buffer.getAs[Map[String, Int]](0)
    val toAdd = input.getAs[Map[String, Int]](0)
    val result = before ++ toAdd
    buffer.update(0, result)
  }

  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = update(buffer1, buffer2)

  override def evaluate(buffer: Row): Any = buffer.getAs[Map[String, Int]](0)
}
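The update above is deliberately naive: a later value for a key simply overrides an earlier one. If the same type could appear in more than one input map, a summing merge is a small variation — a sketch only, not part of the original answer:

// Sums the counts for duplicate keys instead of overriding them; this could
// replace the "before ++ toAdd" merge in update/merge above if keys may repeat.
def mergeCounts(before: Map[String, Int], toAdd: Map[String, Int]): Map[String, Int] =
  toAdd.foldLeft(before) { case (acc, (k, v)) =>
    acc + (k -> (acc.getOrElse(k, 0) + v))
  }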

I followed your idea, but I wonder: what if I had another column similar to type (with String values), but one that should be grouped by id only? Would I need to do that separately and then merge the result with this one?
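One way that could look — a sketch only, assuming a hypothetical extra String column named category with a single (or acceptable first) value per id, and reusing the toMap and CombineMaps defined above — is to carry the column through the same two aggregations instead of computing it separately and joining:

import org.apache.spark.sql.functions._

val resultWithCategory = df
  .groupBy("id", "type")
  .agg(
    countDistinct(dayofmonth(col("timestamp"))) as "daysPerType",
    sum("value") as "valPerType",
    first("category") as "category")          // keep one value of the extra column per group
  .withColumn("typeForDay", toMap(col("type"), col("daysPerType")))
  .groupBy("id")
  .agg(
    sum("valPerType") as "totalValue",
    CombineMaps(col("typeForDay")) as "typeForDay",
    first("category") as "category")          // still one value per id

If several distinct values per id are possible, collect_set("category") could be used instead of first.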