
combineByKey transformation on an RDD throws the exception below (Java, Scala, Apache Spark)


I am trying to generate customer statistics with the code below, which uses a combineByKey transformation. I am getting an ArrayIndexOutOfBoundsException and have no hint as to why. Could anyone clarify why I am getting this exception? Many thanks.

def createComb = (t: Array[String]) => {
  val total = t(5).toDouble
  val q = t(4).toInt
  (total / q, total / q, q, total)
}

def mergeValues: ((Double, Double, Int, Double), Array[String]) => (Double, Double, Int, Double) = {
  case ((mx, mn, q, tot), t) =>
    val total = t(5).toDouble
    val quan = t(4).toInt
    val mxx = scala.math.max(mx, total / q)
    val minn = scala.math.min(mn, total / q)
    (mxx, minn, quan + q, total + tot)
}

def mergeComb: ((Double, Double, Int, Double), (Double, Double, Int, Double)) => (Double, Double, Int, Double) = {
  case ((mx1, mn1, q1, tot1), (mx2, mn2, q2, tot2)) =>
    (scala.math.max(mx1, mx2), scala.math.min(mn1, mn2), q1 + q2, tot1 + tot2)
}

val statsOfCust = productsTotalByKey.combineByKey(
  createComb, mergeValues, mergeComb,
  new org.apache.spark.HashPartitioner(productsTotalByKey.partitions.size))
This is the output I get when I run an action on the resulting RDD after executing the code above on the Spark cluster:

scala> statsOfCust.first
[Stage 22:>                                                         (0 + 1) / 2]18/11/17 21:26:31 WARN TaskSetManager: Lost task 0.0 in stage 22.0 (TID 26, wn01.itversity.com, executor 9): java.lang.ArrayIndexOutOfBoundsException: 5
    at $line80.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$createComb$1.apply(<console>:24)
    at $line80.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$createComb$1.apply(<console>:23)
    at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:189)
    at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:188)
    at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:144)
    at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
    at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
    at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
  at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
  at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
  at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
  at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
  at scala.Option.foreach(Option.scala:257)
  at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
  at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
  at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
  at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1358)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
  at org.apache.spark.rdd.RDD.take(RDD.scala:1331)
  at org.apache.spark.rdd.RDD$$anonfun$first$1.apply(RDD.scala:1372)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
  at org.apache.spark.rdd.RDD.first(RDD.scala:1371)
  ... 49 elided
Caused by: java.lang.ArrayIndexOutOfBoundsException: 5
  at $anonfun$createComb$1.apply(<console>:24)
  at $anonfun$createComb$1.apply(<console>:23)
  at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:189)
  at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:188)
  at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:144)
  at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
  at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
  at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
  at org.apache.spark.scheduler.Task.run(Task.scala:109)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
  at java.lang.Thread.run(Thread.java:745)

The createComb method looks problematic: it assumes the t array always has at least six elements, so any record with fewer fields will throw an ArrayIndexOutOfBoundsException at t(5) or t(4). One way to check for such records is sketched below.
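
A quick sketch of my own (not from the original post), assuming productsTotalByKey is an RDD of (key, Array[String]) pairs: it separates well-formed records from the ones createComb would fail on, so you can inspect what is actually in the RDD.

// Hypothetical check: createComb reads t(4) and t(5), so a valid
// value array needs at least six fields.
val wellFormed = productsTotalByKey.filter { case (_, t) => t.length >= 6 }
val malformed  = productsTotalByKey.filter { case (_, t) => t.length < 6 }

// Print a few offending records to see what the RDD really contains.
malformed.take(5).foreach { case (k, t) => println(s"$k -> ${t.mkString("|")}") }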


That was just a quick pass, so let me know if it helps. If not, I will try to dig into it further :)

Yes, it turned out I was passing the wrong RDD to the combineByKey transformation. Thank you!
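
For reference, a hypothetical sketch of how an RDD in the shape this combineByKey expects might be built; the file path, the key position t(2), and the field layout (quantity at index 4, line total at index 5) are assumptions, not details from the original thread.

// Hypothetical input: comma-separated records with at least six fields,
// where t(4) is the quantity and t(5) the line total (as createComb assumes).
val rawLines = sc.textFile("/path/to/order_items")  // placeholder path
val productsTotalByKey = rawLines
  .map(_.split(","))
  .filter(_.length >= 6)              // drop malformed rows before combining
  .map(t => (t(2).toInt, t))          // assumed position of the customer id

val statsOfCust = productsTotalByKey.combineByKey(
  createComb, mergeValues, mergeComb,
  new org.apache.spark.HashPartitioner(productsTotalByKey.partitions.size))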