Scala spark streaming无法执行toDF功能
我正在尝试编写一个连接到 Flume 的 Spark Streaming 应用程序。当数据还是 RDD 时我可以成功保存它,但一旦尝试用 toDF 函数将其转换为 DataFrame 就会出错。由于我是在 spark-shell 中运行的,看不出具体是什么错误。下面是我正在执行的代码:
//importing relevant libraries
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.flume._
import org.apache.spark.util.IntParam
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SQLContext
//creating the spark streaming configuration
// Streaming context with a 5-second batch interval, built on the shell's SparkContext.
val ssc = new StreamingContext(sc, Seconds(5))
// Push-based Flume receiver listening on all interfaces, port 44444.
val stream = FlumeUtils.createStream(ssc, "0.0.0.0", 44444, StorageLevel.MEMORY_ONLY_SER_2)
//starting the streaming job
// Decode each Flume event body into a text line.
val textStream = stream.map(e => new String(e.event.getBody.array))
// Debug aid: print the number of lines received in each batch.
val numlines = textStream.count()
numlines.print()
textStream.foreachRDD { rdd =>
  //some stuff that needs to be created
  import java.util.Date
  // Batch timestamp, captured once per batch on the driver.
  val d = new Date
  //delimeter of '&'
  val rdd_s = rdd.map(line => line.split("&"))
  // Keep only records with enough fields and a purely numeric amount, so
  // line(2) / line(3).toInt cannot throw on malformed input.
  // NOTE(review): assumes amount is a non-negative integer — confirm format.
  val rdd_split = rdd_s
    .filter(line => line.length > 3 && line(3).nonEmpty && line(3).forall(_.isDigit))
    .map(line => (d.getTime.toString, line(2), line(3).toInt))
  // BUG FIX: saveAsTextFile fails if the target directory already exists, so a
  // fixed path only works for the first batch — suffix the path with the batch time.
  rdd_split.saveAsTextFile(s"/flume/text/final/${d.getTime}")
  //creating the data-frame
  // Reuse (or lazily create) a single SQLContext per JVM; constructing a fresh
  // context inside every batch is what triggers the NullPointerException.
  val sqlContext = SQLContext.getOrCreate(rdd.sparkContext)
  import sqlContext.implicits._
  val df = rdd_split.toDF("moment", "ID", "amount")
  // BUG FIX: DataFrame has no saveAsTextFile in Spark 1.x — save via the
  // underlying RDD (or use df.write for a structured format).
  df.rdd.saveAsTextFile(s"/idan/streaming/flume/text/final/withtime/${d.getTime}")
}
ssc.start()
找到了这个问题的答案
我需要做的是创建sqlContext的惰性单例来创建DataFrame。
以下是singleton代码的最终版本:
//creating the case class for the DataFrame
// Schema for one parsed Flume record: the batch timestamp, two string fields,
// and the numeric amount. toDF/createDataFrame derive column names and types
// from these fields via reflection.
case class Record(moment:String, name:String, id:String, amount:Int)
/** Lazily instantiated, JVM-wide singleton SQLContext.
 *
 *  Spark Streaming jobs must not build a fresh SQLContext inside every
 *  foreachRDD batch; holding a single instance per JVM (marked @transient so
 *  it is never serialized into task closures) avoids the NPE shown above.
 */
object SQLContextSingleton {
  // None until first use — Option instead of a null-initialised var.
  @transient private var instance: Option[SQLContext] = None
  /** Returns the shared SQLContext, creating it on first call.
   *  Synchronized so concurrent streaming-job threads cannot race and
   *  create two instances.
   */
  def getInstance(sc: SparkContext): SQLContext = synchronized {
    instance.getOrElse {
      val created = new SQLContext(sc)
      instance = Some(created)
      created
    }
  }
}
要创建数据帧,我需要为其创建单例:
// Obtain the shared SQLContext (lazy singleton) from the RDD's SparkContext.
val sqlContext = SQLContextSingleton.getInstance(rdd_s.sparkContext)
import sqlContext.implicits._
// CONSISTENCY FIX: the original built a 5-element tuple that did not match
// the 4-field Record case class declared for exactly this purpose. Map each
// split line onto Record so the DataFrame gets named, typed columns.
// NOTE(review): assumes line(0)=name, line(1)=id, line(2)=amount — confirm
// against the actual '&'-delimited feed layout.
val df = sqlContext.createDataFrame(
  rdd_s.map(line => Record(d.getTime.toString, line(0), line(1), line(2).toInt))
)
编辑:
它给了我这个错误:
16/08/09 13:16:05 ERROR scheduler.JobScheduler: Error running job streaming job 1470748565000 ms.1
java.lang.NullPointerException
at org.apache.spark.sql.hive.client.ClientWrapper.conf(ClientWrapper.scala:205)
at org.apache.spark.sql.hive.HiveContext.hiveconf$lzycompute(HiveContext.scala:554)
at org.apache.spark.sql.hive.HiveContext.hiveconf(HiveContext.scala:553)
at org.apache.spark.sql.hive.HiveContext$$anonfun$configure$1.apply(HiveContext.scala:540)
at org.apache.spark.sql.hive.HiveContext$$anonfun$configure$1.apply(HiveContext.scala:539)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.immutable.List.foreach(List.scala:318)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.AbstractTraversable.map(Traversable.scala:105)
at org.apache.spark.sql.hive.HiveContext.configure(HiveContext.scala:539)
at org.apache.spark.sql.hive.HiveContext.metadataHive$lzycompute(HiveContext.scala:252)
at org.apache.spark.sql.hive.HiveContext.metadataHive(HiveContext.scala:239)
at org.apache.spark.sql.hive.HiveContext$$anon$2.<init>(HiveContext.scala:459)
at org.apache.spark.sql.hive.HiveContext.catalog$lzycompute(HiveContext.scala:459)
at org.apache.spark.sql.hive.HiveContext.catalog(HiveContext.scala:458)
at org.apache.spark.sql.hive.HiveContext$$anon$3.<init>(HiveContext.scala:475)
at org.apache.spark.sql.hive.HiveContext.analyzer$lzycompute(HiveContext.scala:475)
at org.apache.spark.sql.hive.HiveContext.analyzer(HiveContext.scala:474)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:34)
at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:133)
at org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:52)
at org.apache.spark.sql.SQLContext.createDataFrame(SQLContext.scala:417)
at org.apache.spark.sql.SQLImplicits.rddToDataFrameHolder(SQLImplicits.scala:155)
at $line46.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:58)
at $line46.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:48)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:223)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
16/08/09 13:16:08 ERROR scheduler.ReceiverTracker: Deregistered receiver for stream 0: Stopped by driver
16/08/09 13:16:05 错误 scheduler.JobScheduler: 运行流作业 1470748565000 ms.1 时出错
java.lang.NullPointerException
位于org.apache.spark.sql.hive.client.ClientWrapper.conf(ClientWrapper.scala:205)
位于org.apache.spark.sql.hive.HiveContext.hiveconf$lzycompute(HiveContext.scala:554)
位于org.apache.spark.sql.hive.HiveContext.hiveconf(HiveContext.scala:553)
位于org.apache.spark.sql.hive.HiveContext$$anonfun$configure$1.apply(HiveContext.scala:540)
位于org.apache.spark.sql.hive.HiveContext$$anonfun$configure$1.apply(HiveContext.scala:539)
位于scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
位于scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
位于scala.collection.immutable.List.foreach(List.scala:318)
位于scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
位于scala.collection.AbstractTraversable.map(Traversable.scala:105)
位于org.apache.spark.sql.hive.HiveContext.configure(HiveContext.scala:539)
位于org.apache.spark.sql.hive.HiveContext.metadataHive$lzycompute(HiveContext.scala:252)
位于org.apache.spark.sql.hive.HiveContext.metadataHive(HiveContext.scala:239)
位于org.apache.spark.sql.hive.HiveContext$$anon$2。(HiveContext.scala:459)
位于org.apache.spark.sql.hive.HiveContext.catalog$lzycompute(HiveContext.scala:459)
位于org.apache.spark.sql.hive.HiveContext.catalog(HiveContext.scala:458)
位于org.apache.spark.sql.hive.HiveContext$$anon$3。(HiveContext.scala:475)
位于org.apache.spark.sql.hive.HiveContext.analyzer$lzycompute(HiveContext.scala:475)
位于org.apache.spark.sql.hive.HiveContext.analyzer(HiveContext.scala:474)
位于org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:34)
位于org.apache.spark.sql.DataFrame(DataFrame.scala:133)
位于org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:52)
位于org.apache.spark.sql.SQLContext.createDataFrame(SQLContext.scala:417)
位于org.apache.spark.sql.SQLImplicits.rddToDataFrameHolder(SQLImplicits.scala:155)
在$line46.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.应用(:58)
在$line46.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.应用(:48)
在org.apache.spark.streaming.dstream.dstream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(dstream.scala:661)
在org.apache.spark.streaming.dstream.dstream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(dstream.scala:661)
在org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
在org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
在org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
位于org.apache.spark.streaming.dstream.dstream.createRDDWithLocalProperties(dstream.scala:426)
在org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
在org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply上(ForEachDStream.scala:49)
在org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply上(ForEachDStream.scala:49)
在scala.util.Try$.apply处(Try.scala:161)
位于org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
在org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:224)
位于org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
位于org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
在scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)中
位于org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:223)
位于java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
位于java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
运行(Thread.java:745)
16/08/09 13:16:08 错误 scheduler.ReceiverTracker: 流 0 的接收器已注销:已被驱动程序停止
有没有人碰巧遇到这种问题并且知道如何帮助?
谢谢 :)(评论:请参阅如何帮助他人重现问题。你无法在 Spark UI 的日志中看到错误吗?——问题太长,因此我编辑了该问题。ClientWrapper.scala:205 对应的是哪一行代码?)