Apache Spark: When I create a DataFrame I get a "NullPointerException"

Tags: apache-spark, apache-spark-sql, spark-streaming

I am creating DataFrames from the rows of another DataFrame, called file_details here. Each row of that DataFrame has the format ("srcpath", "table_name", "primary_key", "type_of_file", "separator"). I wrote a function that takes the rows one by one and checks the type of the file. If it is csv, I create a DataFrame with spark.read, using the src_path and separator taken from that row, and then write that DataFrame to my SQL server. The target table name and primary key also come from the same row.
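For illustration, one row of file_details might look like this (hypothetical values):

    srcpath=/data/input/customers.csv, table_name=customers, primary_key=cust_id, type_of_file=csv, separator=,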

Here is the code I wrote:

import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row

object fixedlength {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder()
      .appName("fixedlength")
      .master("local")
      .getOrCreate()

    val file_details = spark.read.option("header", "true").csv("////")

    // Each Row looks like ("srcpath", "table_name", "primary_key", "type_of_file", "separator")
    def fun(ro: Row): Unit = {
      val url = "jdbc:mysql://*****:3306/mydb1"
      val typ = ro.getString(3)
      if (typ == "csv") {
        val sep = ro.getString(4)
        val src_path = ro.getString(0)
        val tgt_table = ro.getString(1)
        // Read the source csv and write it to the JDBC target table
        val df1 = spark.read.option("header", "true").option("delimiter", sep).csv(src_path)
        df1.write.format("jdbc")
          .mode("overwrite")
          .option("url", url)
          .option("driver", "com.mysql.jdbc.Driver")
          .option("dbtable", tgt_table)
          .option("user", "username")
          .option("password", "password")
          .save()
      }
    }

    val ne = file_details.select("*").rdd.map(x => fun(x)).collect()
  }
}
And this is the error I get:

    ERROR Executor: Exception in task 0.0 in stage 7.0 (TID 7)
java.lang.NullPointerException
    at fixedlength$.fixedlength$$fun$1(fixedlength.scala:66)
    at fixedlength$$anonfun$10.apply(fixedlength.scala:106)
    at fixedlength$$anonfun$10.apply(fixedlength.scala:106)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
    at scala.collection.Iterator$class.foreach(Iterator.scala:891)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    at scala.collection.AbstractIterator.to(Iterator.scala:1334)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1334)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1334)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
21/04/12 11:11:09 WARN TaskSetManager: Lost task 0.0 in stage 7.0 (TID 7, localhost, executor driver): java.lang.NullPointerException
    ... (same stack trace as above)

21/04/12 11:11:09 ERROR TaskSetManager: Task 0 in stage 7.0 failed 1 times; aborting job
21/04/12 11:11:09 INFO TaskSchedulerImpl: Removed TaskSet 7.0, whose tasks have all completed, from pool 
21/04/12 11:11:09 INFO TaskSchedulerImpl: Cancelling stage 7
21/04/12 11:11:09 INFO TaskSchedulerImpl: Killing all running tasks in stage 7: Stage cancelled
21/04/12 11:11:09 INFO DAGScheduler: ResultStage 7 (collect at fixedlength.scala:106) failed in 0.156 s due to Job aborted due to stage failure: Task 0 in stage 7.0 failed 1 times, most recent failure: Lost task 0.0 in stage 7.0 (TID 7, localhost, executor driver): java.lang.NullPointerException
    ... (same stack trace as above)

Driver stacktrace:
21/04/12 11:11:09 INFO DAGScheduler: Job 7 failed: collect at fixedlength.scala:106, took 0.164149 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 7.0 failed 1 times, most recent failure: Lost task 0.0 in stage 7.0 (TID 7, localhost, executor driver): java.lang.NullPointerException
    at fixedlength$.fixedlength$$fun$1(fixedlength.scala:66)
    at fixedlength$$anonfun$10.apply(fixedlength.scala:106)
    at fixedlength$$anonfun$10.apply(fixedlength.scala:106)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
    at scala.collection.Iterator$class.foreach(Iterator.scala:891)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    at scala.collection.AbstractIterator.to(Iterator.scala:1334)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1334)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1334)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
    at fixedlength$.main(fixedlength.scala:106)
    at fixedlength.main(fixedlength.scala)
Caused by: java.lang.NullPointerException
    ... (same stack trace as above)
21/04/12 11:11:09 INFO SparkContext: Invoking stop() from shutdown hook
21/04/12 11:11:09 INFO SparkUI: Stopped Spark web UI at http://LTIN236281.cts.com:4040
21/04/12 11:11:09 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
21/04/12 11:11:10 INFO MemoryStore: MemoryStore cleared
21/04/12 11:11:10 INFO BlockManager: BlockManager stopped
21/04/12 11:11:10 INFO BlockManagerMaster: BlockManagerMaster stopped
21/04/12 11:11:10 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
21/04/12 11:11:10 INFO SparkContext: Successfully stopped SparkContext
21/04/12 11:11:10 INFO ShutdownHookManager: Shutdown hook called
21/04/12 11:11:10 INFO ShutdownHookManager: Deleting directory C:\Users\844605\AppData\Local\Temp\spark-cae157ad-b63e-43b4-831e-9cc61094b4db
I searched the internet for this "NullPointerException" error. Somewhere I found that it is thrown when a worker node tries to access the SparkContext object.
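Distilled, that is the failing pattern in the code above (a sketch):

    // Throws a NullPointerException: the closure passed to map() is shipped
    // to the executors, where the driver-side SparkSession ("spark") is not available.
    file_details.rdd.map(row => spark.read.csv(row.getString(0))).collect()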

Is there a different way to achieve this goal?

Can anyone please help me get rid of this error?


My case study is: I have a DataFrame that contains the src_path of a file, the type of the file, the separator if that file is a csv, and the target table in SQL. I pass those rows into a function I created; it takes those parameters, creates a DataFrame, and pushes that DataFrame into the SQL target table. I wrote that code inside the function, and when I run it I get the "NullPointerException".

I don't understand your point. Which DataFrame needs to be changed to a list?
If the file_details DataFrame is not too big, you can convert it to a list and create a DataFrame from each element of the list.
But the problem is inside the function, where I create the new DataFrame df1. That is the only place I access the spark object, isn't it? Right here:
file_details.select("*").rdd.map(x => fun(x))
The map function runs on the executors, while the Spark session is initialized on the driver. That is why you cannot access it from inside map.
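A minimal sketch of that fix, based on the code in the question: if file_details holds only a small amount of control metadata, collect its rows to the driver and loop over them there, so that both spark.read and the JDBC write run where the SparkSession lives (host, user and password are placeholders, as in the question):

    // Runs entirely on the driver: collect the small file_details rows,
    // then read and write each file using the driver's SparkSession.
    val url = "jdbc:mysql://*****:3306/mydb1"
    file_details.collect().foreach { ro =>
      if (ro.getString(3) == "csv") {
        val df1 = spark.read
          .option("header", "true")
          .option("delimiter", ro.getString(4)) // separator
          .csv(ro.getString(0))                 // srcpath
        df1.write.format("jdbc")
          .mode("overwrite")
          .option("url", url)
          .option("driver", "com.mysql.jdbc.Driver")
          .option("dbtable", ro.getString(1))   // table_name
          .option("user", "username")
          .option("password", "password")
          .save()
      }
    }

Only the control rows travel to the driver; the file contents themselves are still read and written by Spark in a distributed fashion.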