
Scala Spark Streaming cannot execute the toDF function

Tags: Scala, Hadoop, Apache Spark, Spark Streaming

I am trying to build a Spark Streaming application that connects to Flume.

I managed to save the data while it was still an RDD, but when I try to convert it to a DataFrame with the toDF function, it fails. I am running this in the shell, so I cannot see what the error is.

Here is the code I am running:

//importing the relevant libraries
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.flume._
import org.apache.spark.util.IntParam
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SQLContext

//creating the spark streaming configuration
val ssc = new StreamingContext(sc, Seconds(5))
val stream = FlumeUtils.createStream(ssc, "0.0.0.0", 44444, StorageLevel.MEMORY_ONLY_SER_2)

//starting the streaming job
val textStream = stream.map(e => new String(e.event.getBody.array) )
val numlines = textStream.count()
numlines.print()
textStream.foreachRDD { rdd =>
  //some stuff that needs to be created
  import java.util.Date
  val d = new Date
  //delimeter of '&'
  val rdd_s = rdd.map(line => line.split("&"))
  val rdd_split = rdd_s.map(line => (d.getTime.toString, line(2), line(3).toInt))
  //the data is only saved if the toDF call below is commented out
  rdd_split.saveAsTextFile("/flume/text/final/")

  //creating the data-frame - if commented out, the data will be saved to file
  val sqlContext = SQLContext.getOrCreate(rdd.sparkContext)
  import sqlContext.implicits._  
  val df = rdd_split.toDF("moment", "ID","amount")
  df.saveAsTextFile("/idan/streaming/flume/text/final/withtime")
}

ssc.start()
I found the answer to this problem. What I had to do was create a lazily instantiated singleton of the SQLContext and use it to create the DataFrame. Here is the final version of the singleton code:

//case class describing the schema of the DataFrame
case class Record(moment: String, name: String, id: String, amount: Int)

//imports needed by the singleton
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext

/** Lazily instantiated singleton instance of SQLContext */
object SQLContextSingleton {
  @transient private var instance: SQLContext = null
  // Instantiate SQLContext on demand
  def getInstance(sc: SparkContext): SQLContext = synchronized {
    if (instance == null) {
      instance = new SQLContext(sc)
    }
    instance
  }
}
To create the DataFrame, I get the singleton instance inside foreachRDD:

val sqlContext = SQLContextSingleton.getInstance(rdd_s.sparkContext)
import sqlContext.implicits._
val df = sqlContext.createDataFrame(rdd_s.map(line => (d.getTime, line(0), line(1), line(2), line(3))))
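Below is a minimal sketch of how the Record case class and toDF could fit together with the singleton, assuming Spark 1.x and the same '&'-delimited input; the field indices and the output path/format are illustrative assumptions, not taken from the original job:

//hedged sketch: toDF with named columns via the Record case class and the singleton SQLContext
textStream.foreachRDD { rdd =>
  val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
  import sqlContext.implicits._

  val d = new java.util.Date
  //assumed field layout: name at index 1, id at index 2, amount at index 3
  val records = rdd
    .map(_.split("&"))
    .map(f => Record(d.getTime.toString, f(1), f(2), f(3).toInt))

  //toDF works here because the implicits come from the lazily created singleton
  val df = records.toDF()
  //hypothetical output location and format
  df.write.mode("append").json("/flume/text/final/withtime")
}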
Edit: it gives me this error:

16/08/09 13:16:05 ERROR scheduler.JobScheduler: Error running job streaming job 1470748565000 ms.1
java.lang.NullPointerException
    at org.apache.spark.sql.hive.client.ClientWrapper.conf(ClientWrapper.scala:205)
    at org.apache.spark.sql.hive.HiveContext.hiveconf$lzycompute(HiveContext.scala:554)
    at org.apache.spark.sql.hive.HiveContext.hiveconf(HiveContext.scala:553)
    at org.apache.spark.sql.hive.HiveContext$$anonfun$configure$1.apply(HiveContext.scala:540)
    at org.apache.spark.sql.hive.HiveContext$$anonfun$configure$1.apply(HiveContext.scala:539)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.AbstractTraversable.map(Traversable.scala:105)
    at org.apache.spark.sql.hive.HiveContext.configure(HiveContext.scala:539)
    at org.apache.spark.sql.hive.HiveContext.metadataHive$lzycompute(HiveContext.scala:252)
    at org.apache.spark.sql.hive.HiveContext.metadataHive(HiveContext.scala:239)
    at org.apache.spark.sql.hive.HiveContext$$anon$2.<init>(HiveContext.scala:459)
    at org.apache.spark.sql.hive.HiveContext.catalog$lzycompute(HiveContext.scala:459)
    at org.apache.spark.sql.hive.HiveContext.catalog(HiveContext.scala:458)
    at org.apache.spark.sql.hive.HiveContext$$anon$3.<init>(HiveContext.scala:475)
    at org.apache.spark.sql.hive.HiveContext.analyzer$lzycompute(HiveContext.scala:475)
    at org.apache.spark.sql.hive.HiveContext.analyzer(HiveContext.scala:474)
    at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:34)
    at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:133)
    at org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:52)
    at org.apache.spark.sql.SQLContext.createDataFrame(SQLContext.scala:417)
    at org.apache.spark.sql.SQLImplicits.rddToDataFrameHolder(SQLImplicits.scala:155)
    at $line46.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:58)
    at $line46.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:48)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
    at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
    at scala.util.Try$.apply(Try.scala:161)
    at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:224)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
    at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:223)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
16/08/09 13:16:08 ERROR scheduler.ReceiverTracker: Deregistered receiver for stream 0: Stopped by driver
Has anyone run into this kind of problem and knows how to help?
Thanks :)

Comments: Please see how to help others reproduce the problem. Can't you see the error in the Spark UI logs? The question was long, so I edited it. Which line is ClientWrapper.scala:205?