Apache Spark: spark-jobserver job crashes

Tags: apache-spark, spark-jobserver

I'm running a job with spark-jobserver (it takes about 10 minutes). The job crashes randomly during execution (roughly 1 run out of 2), with the following exception on the executor:

ERROR 2016-10-13 19:22:58,617 Logging.scala:95 - org.apache.spark.executor.Executor: Exception in task 24.0 in stage 1.0 (TID 25)
org.apache.spark.TaskKilledException: null
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:217) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [na:1.8.0_101]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [na:1.8.0_101]
    at java.lang.Thread.run(Thread.java:745) [na:1.8.0_101]
ERROR 2016-10-13 19:22:58,617 Logging.scala:95 - org.apache.spark.storage.DiskBlockObjectWriter: Uncaught exception while reverting partial writes to file /var/lib/spark/rdd/spark-4e4b3899-3ba4-47ad-b3af-5a4431321c5a/executor-0e09bdd5-44ef-4cb1-9c18-7659de428f6b/blockmgr-cce06e16-035b-4536-b8be-99c14e89757e/23/temp_shuffle_3c6ba81d-f8bd-4479-9bc8-c65a4f784613
java.nio.channels.ClosedByInterruptException: null
    at java.nio.channels.spi.AbstractInterruptibleChannel.end(AbstractInterruptibleChannel.java:202) ~[na:1.8.0_101]
    at sun.nio.ch.FileChannelImpl.truncate(FileChannelImpl.java:372) ~[na:1.8.0_101]
    at org.apache.spark.storage.DiskBlockObjectWriter.revertPartialWritesAndClose(DiskBlockObjectWriter.scala:164) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.stop(BypassMergeSortShuffleWriter.java:226) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
ERROR 2016-10-13 19:22:58,617 Logging.scala:95 - org.apache.spark.storage.DiskBlockObjectWriter: Uncaught exception while reverting partial writes to file /var/lib/spark/rdd/spark-4e4b3899-3ba4-47ad-b3af-5a4431321c5a/executor-0e09bdd5-44ef-4cb1-9c18-7659de428f6b/blockmgr-cce06e16-035b-4536-b8be-99c14e89757e/2b/temp_shuffle_ba04b671-f5a3-49c4-901e-879ead59f48a
java.nio.channels.ClosedByInterruptException: null
    at java.nio.channels.spi.AbstractInterruptibleChannel.end(AbstractInterruptibleChannel.java:202) ~[na:1.8.0_101]
    at sun.nio.ch.FileChannelImpl.truncate(FileChannelImpl.java:372) ~[na:1.8.0_101]
    at org.apache.spark.storage.DiskBlockObjectWriter.revertPartialWritesAndClose(DiskBlockObjectWriter.scala:164) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.stop(BypassMergeSortShuffleWriter.java:226) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.Task.run(Task.scala:89) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [na:1.8.0_101]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [na:1.8.0_101]
    at java.lang.Thread.run(Thread.java:745) [na:1.8.0_101]
    at org.apache.spark.scheduler.Task.run(Task.scala:89) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [na:1.8.0_101]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [na:1.8.0_101]
    at java.lang.Thread.run(Thread.java:745) [na:1.8.0_101]
ERROR 2016-10-13 19:22:58,617 Logging.scala:95 - org.apache.spark.storage.DiskBlockObjectWriter: Uncaught exception while reverting partial writes to file /var/lib/spark/rdd/spark-4e4b3899-3ba4-47ad-b3af-5a4431321c5a/executor-0e09bdd5-44ef-4cb1-9c18-7659de428f6b/blockmgr-cce06e16-035b-4536-b8be-99c14e89757e/2b/temp_shuffle_ba04b671-f5a3-49c4-901e-879ead59f48a
java.nio.channels.ClosedByInterruptException: null
    at java.nio.channels.spi.AbstractInterruptibleChannel.end(AbstractInterruptibleChannel.java:202) ~[na:1.8.0_101]
    at sun.nio.ch.FileChannelImpl.truncate(FileChannelImpl.java:372) ~[na:1.8.0_101]
    at org.apache.spark.storage.DiskBlockObjectWriter.revertPartialWritesAndClose(DiskBlockObjectWriter.scala:164) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.stop(BypassMergeSortShuffleWriter.java:226) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.Task.run(Task.scala:89) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227) [spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [na:1.8.0_101]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [na:1.8.0_101]
    at java.lang.Thread.run(Thread.java:745) [na:1.8.0_101]
....
Driver logs:

[2016-10-14 10:07:22,701] INFO o.a.s.s.DAGScheduler [1efb38e8-eaa4-4bdc-8df7-b5047f713c02] [akka://JobServer/user/context-supervisor/9027f2fd-com.my.sparkJob] - Job 0 failed: runJob at RDDFunctions.scala:37, took 94.010942 s
[2016-10-14 10:07:22,704] INFO s.j.JobStatusActor [] [akka://JobServer/user/context-supervisor/9027f2fd-com.my.sparkJob/status-actor] - Job 1efb38e8-eaa4-4bdc-8df7-b5047f713c02 finished with an error
[2016-10-14 10:07:22,705] INFO a.a.DeadLetterActorRef [] [akka://JobServer/deadLetters] - Message [spark.jobserver.CommonMessages$JobErroredOut] from Actor[akka://JobServer/user/context-supervisor/9027f2fd-com.my.sparkJob/status-actor#-2093845382] to Actor[akka://JobServer/deadLetters] was not delivered. [10] dead letters encountered, no more dead letters will be logged. This logging can be turned off or adjusted with configuration settings 'akka.log-dead-letters' and 'akka.log-dead-letters-during-shutdown'.
[2016-10-14 10:07:22,705] WARN s.j.JobManagerActor [] [akka://JobServer/user/context-supervisor/9027f2fd-com.my.sparkJob] - Exception from job 1efb38e8-eaa4-4bdc-8df7-b5047f713c02:
org.apache.spark.SparkException: Job aborted due to stage failure: Exception while getting task result: java.lang.NullPointerException
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) ~[scala-library-2.10.6.jar:na]
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) ~[scala-library-2.10.6.jar:na]
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at scala.Option.foreach(Option.scala:236) ~[scala-library-2.10.6.jar:na]
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1922) ~[spark-core_2.10-1.6.2.1.jar:1.6.2.1]
    at com.datastax.spark.connector.RDDFunctions.saveToCassandra(RDDFunctions.scala:37) ~[spark-cassandra-connector_2.10-1.6.0.jar:1.6.0]
    at com.my.sparkJob.init(sparkJob.scala:228) ~[csm-spark-2016-10-14T10_04_36.212+02_00.jar:na]
    at com.my.sparkJob$.runJob(sparkJob.scala:166) ~[csm-spark-2016-10-14T10_04_36.212+02_00.jar:na]
    at com.my.sparkJob$.runJob(sparkJob.scala:122) ~[csm-spark-2016-10-14T10_04_36.212+02_00.jar:na]
    at com.my.sparkJob$.runJob(sparkJob.scala:119) ~[csm-spark-2016-10-14T10_04_36.212+02_00.jar:na]
    at spark.jobserver.JobManagerActor$$anonfun$spark$jobserver$JobManagerActor$$getJobFuture$4.apply(JobManagerActor.scala:235) ~[spark-job-server.jar:0.5.2.501]
    at scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24) ~[scala-library-2.10.6.jar:na]
    at scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24) ~[scala-library-2.10.6.jar:na]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [na:1.8.0_101]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [na:1.8.0_101]
    at java.lang.Thread.run(Thread.java:745) [na:1.8.0_101]
[2016-10-14 10:07:22,708] INFO s.j.LocalContextSupervisorActor [] [akka://JobServer/user/context-supervisor] - Shutting down context 9027f2fd-com.my.sparkJob
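
For context, the driver trace runs from com.my.sparkJob$.runJob through com.my.sparkJob.init into RDDFunctions.saveToCassandra (spark-cassandra-connector 1.6.0), so the failing stage is a Cassandra write. The job source isn't part of the post, but a spark-jobserver 0.5.x job that ends in such a write is shaped roughly like this minimal sketch; the keyspace, table, columns, and data are placeholders, not details from the actual job:

    import com.datastax.spark.connector._  // adds saveToCassandra to RDDs
    import com.typesafe.config.Config
    import org.apache.spark.SparkContext
    import spark.jobserver.{SparkJob, SparkJobValid, SparkJobValidation}

    object MySparkJob extends SparkJob {
      // spark-jobserver calls validate before runJob; this sketch accepts any config.
      override def validate(sc: SparkContext, config: Config): SparkJobValidation = SparkJobValid

      override def runJob(sc: SparkContext, config: Config): Any = {
        // Placeholder data; the real job presumably builds its RDD from actual input.
        val rows = sc.parallelize(1 to 1000000).map(i => (i, s"value-$i"))
        // The call that appears in the driver trace:
        // com.datastax.spark.connector.RDDFunctions.saveToCassandra (RDDFunctions.scala:37).
        // Assumes spark.cassandra.connection.host is set in the context config.
        rows.saveToCassandra("my_keyspace", "my_table", SomeColumns("id", "value"))
        "done"
      }
    }

Reading the two logs together: the driver aborts the stage with "Exception while getting task result: java.lang.NullPointerException", and the TaskKilledException / ClosedByInterruptException entries on the executor are consistent with the scheduler cancelling the remaining shuffle tasks after that abort, rather than being the root failure themselves.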