Apache Spark Streaming - restarting from a checkpoint
We are building a fault-tolerant system that reads from Kafka and writes to HBase and HDFS. Batches run every 5 seconds. Here is the scenario we want to support:

Start a new Spark Streaming job with checkpointing enabled; read from Kafka, process the data, and store it to HDFS and HBase.
Kill the Spark Streaming job while messages continue to flow into Kafka.
Restart the Spark Streaming job. What we would really like to happen: Spark Streaming reads the checkpoint data and resumes from the correct Kafka offsets, so that no messages are lost even though the job was killed and restarted. Instead, the restart fails with the NullPointerException shown in the stack trace at the bottom.

1) Here is how we create the StreamingContext with getOrCreate:
val ddqSsc = StreamingContext.getOrCreate(checkpointDir, () =>
  createDDQStreamingContext(slideInterval.toLong, inputKafka, outputKafka,
    hbaseVerTableName, checkpointDir, baseRawHdfs, securityProtocol, groupID,
    zooKeeper, kafkaBrokers, hiveDBToLoad, hiveTableToLoad))
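For context on what we expect getOrCreate to do: the factory function runs only when checkpointDir contains no checkpoint; on a restart, the whole DStream graph, including the Kafka offsets and every closure, is instead deserialized from the checkpoint. A minimal sketch of that contract, with illustrative names and paths that are not from our actual job:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Illustrative factory: invoked only when `dir` holds no prior checkpoint.
def createContext(dir: String): StreamingContext = {
  val conf = new SparkConf().setAppName("checkpoint-demo") // hypothetical app name
  val ssc = new StreamingContext(conf, Seconds(5))
  ssc.checkpoint(dir) // DStream graph + Kafka offsets get persisted here
  // ... define the DStream pipeline here; it is serialized into the checkpoint
  ssc
}

val dir = "hdfs:///tmp/checkpoint-demo" // hypothetical checkpoint directory
// First run: calls createContext. Restart: deserializes the saved graph instead,
// so everything the pipeline's closures capture must survive serialization.
val ssc = StreamingContext.getOrCreate(dir, () => createContext(dir))
ssc.start()
ssc.awaitTermination()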
2) Here is the initial part of the function that getOrCreate calls:
def createDDQStreamingContext(slideInterval: Long, inputKafka: String, outputKafka: String,
    hbaseVerTableName: String, checkpointDir: String, baseRawHdfs: String,
    securityProtocol: String, groupID: String, zooKeeper: String, kafkaBrokers: String,
    hiveDBToLoad: String, hiveTableToLoad: String): StreamingContext = {

  val sparkConf = new SparkConf()
  val ssc = new StreamingContext(sparkConf, Seconds(slideInterval))

  //val sqlContext = new SQLContext(sc)
  val sqlContext = new HiveContext(ssc.sparkContext)
  import sqlContext.implicits._

  ssc.checkpoint(checkpointDir)

  val kafkaTopics = Set(inputKafka)

  //Kafka parameters
  var kafkaParams = Map[String, String]()
  kafkaParams += ("bootstrap.servers" -> kafkaBrokers)
  kafkaParams += ("zookeeper.connect" -> zooKeeper)
  //Need this in a kerberos environment
  kafkaParams += ("security.protocol" -> securityProtocol)
  kafkaParams += ("sasl.kerberos.service.name" -> "kafka")
  //WHAT IS THIS!!??
  kafkaParams += ("group.id" -> groupID)
  kafkaParams += ("key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer")
  kafkaParams += ("value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer")

  val inputDataDstream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
    ssc, kafkaParams, kafkaTopics)
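A hedged suspicion about where this breaks: the HiveContext above, together with the implicits imported from it, is captured by the DStream closures and serialized into the checkpoint; after recovery, the deserialized context's internal SQLConf is null, which matches the NPE at SQLConf.getConf in the trace below. The Spark 1.x streaming programming guide recommends resolving a lazily created singleton context inside each batch rather than capturing a driver-time one. A sketch using SQLContext.getOrCreate (the column name is made up):

import org.apache.spark.sql.SQLContext

inputDataDstream.foreachRDD { rdd =>
  // Resolve the context at batch time from the RDD's SparkContext; after a
  // checkpoint restart this returns a live context, not a stale deserialized one.
  val sqlContext = SQLContext.getOrCreate(rdd.sparkContext)
  import sqlContext.implicits._
  val df = rdd.map(_._2).toDF("value") // "value" is an illustrative column name
  // ... write df to HDFS / HBase as in the original job
}

If Hive support is required for the hiveDBToLoad/hiveTableToLoad writes, the same singleton pattern from the guide applies with a HiveContext rebuilt on demand instead of checkpointed.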
========================= Stack trace ====================
2017-04-03 11:27:27,047 ERROR [Driver] yarn.ApplicationMaster: User class threw exception: java.lang.NullPointerException
java.lang.NullPointerException
    at org.apache.spark.sql.SQLConf.getConf(SQLConf.scala:638)
    at org.apache.spark.sql.SQLConf.dataFrameEagerAnalysis(SQLConf.scala:573)
    at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:132)
    at org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:52)
    at org.apache.spark.sql.SQLContext.createDataFrame(SQLContext.scala:417)
    at org.apache.spark.sql.SQLImplicits.rddToDataFrameHolder(SQLImplicits.scala:155)
    at com.wellsfargo.eda.bigdata.dced.dataprocessor.ddqKafkaDataProcessor$$anonfun$createDDQStreamingContext$1.apply(ddqKafkaDataProcessor.scala:97)
    at com.wellsfargo.eda.bigdata.dced.dataprocessor.ddqKafkaDataProcessor$$anonfun$createDDQStreamingContext$1.apply(ddqKafkaDataProcessor.scala:73)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$transform$1$$anonfun$apply$21.apply(DStream.scala:700)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$transform$1$$anonfun$apply$21.apply(DStream.scala:700)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$transform$2$$anonfun$5.apply(DStream.scala:714)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$transform$2$$anonfun$5.apply(DStream.scala:712)
    at org.apache.spark.streaming.dstream.TransformedDStream.compute(TransformedDStream.scala:46)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$getOrCompute$1$$anonfun$1$$anonfun$apply$7.apply(DStream.scala:352)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$getOrCompute$1$$anonfun$1$$anonfun$apply$7.apply(DStream.scala:352)
    at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$getOrCompute$1$$anonfun$1.apply(DStream.scala:351)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$getOrCompute$1$$anonfun$1.apply(DStream.scala:351)
    at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
    at org.apache.spark.streaming.dstream.TransformedDStream.createRDDWithLocalProperties(TransformedDStream.scala:65)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$getOrCompute$1.apply(DStream.scala:346)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$getOrCompute$1.apply(DStream.scala:344)
    at scala.Option.orElse(Option.scala:257)
    at org.apache.spark.streaming.dstream.DStream.getOrCompute(DStream.scala:341)
    at org.apache.spark.streaming.dstream.MappedDStream.compute(MappedDStream.scala:35)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$getOrCompute$1$$anonfun$1$$anonfun$apply$7.apply(DStream.scala:352)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$getOrCompute$1$$anonfun$1$$anonfun$apply$7.apply(DStream.scala:352)
    at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$getOrCompute$1$$anonfun$1.apply(DStream.scala:351)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$getOrCompute$1$$anonfun$1.apply(DStream.scala:351)
    at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$getOrCompute$1.apply(DStream.scala:346)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$getOrCompute$1.apply(DStream.scala:344)
    at scala.Option.orElse(Option.scala:257)
    at org.apache.spark.streaming.dstream.DStream.getOrCompute(DStream.scala:341)
    at org.apache.spark.streaming.dstream.ForEachDStream.generateJob(ForEachDStream.scala:47)
    at org.apache.spark.streaming.DStreamGraph$$anonfun$1.apply(DStreamGraph.scala:115)
    at org.apache.spark.streaming.DStreamGraph$$anonfun$1.apply(DStreamGraph.scala:114)
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:251)
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:251)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:251)
    at scala.collection.AbstractTraversable.flatMap(Traversable.scala:105)
    at org.apache
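Reading the trace: the NPE fires in SQLImplicits.rddToDataFrameHolder, invoked from a closure in createDDQStreamingContext (ddqKafkaDataProcessor.scala lines 73/97) that runs through DStream.transform. So a toDF conversion inside a transform closure is still bound to the HiveContext that went into the checkpoint. A hypothetical reconstruction of that failing shape, not our actual code:

import org.apache.spark.sql.hive.HiveContext

val hiveContext = new HiveContext(ssc.sparkContext) // created once at driver setup
import hiveContext.implicits._ // these implicits close over hiveContext

val transformed = inputDataDstream.transform { rdd =>
  // This closure (and hiveContext with it) is serialized into the checkpoint.
  // After recovery, toDF resolves against the stale deserialized context and
  // fails exactly as above: rddToDataFrameHolder -> SQLConf.getConf -> NPE.
  rdd.map(_._2).toDF("value").rdd // "value" is an illustrative column name
}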