I am trying to export a Spark SQL Dataset to a CSV file from a Spring Boot application. The job aborts with a java.lang.NullPointerException thrown from java.lang.ProcessBuilder.start while Hadoop tries to set file permissions (the full stack trace is at the end of this question).

I tried with a smaller dataset (this one contains 57,548 rows), but the same error occurs.

Here is the pom file:


<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.example</groupId>
    <artifactId>demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>demo</name>
    <description>Demo project for Spring Boot</description>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.0.RELEASE</version>
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.0</version>
            <exclusions>
                <exclusion>
                    <groupId>org.codehaus.janino</groupId>
                    <artifactId>janino</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.codehaus.janino</groupId>
                    <artifactId>commons-compiler</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.codehaus.janino</groupId>
            <artifactId>commons-compiler</artifactId>
            <version>3.0.6</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.janino</groupId>
            <artifactId>janino</artifactId>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>

I am still new to Spark, so this could well be just the result of a misunderstanding on my part.


Thanks for any help.

Thanks to @user6910411. All I did was update the version to 2.4.0, which produced a clear error in the logs: an IOException complaining about a (null) string in the command. So I simply copied the winutils.exe file into the Hadoop root folder and made sure the environment variable points to that root folder rather than to some sub-directory.
Thanks.
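For anyone hitting the same trace: on Windows, Hadoop's RawLocalFileSystem shells out to winutils.exe to set file permissions, and if Hadoop cannot locate it the command path comes back null, which is exactly what ProcessBuilder.start rejects. A minimal sketch of the setup, assuming winutils.exe has been placed at C:\hadoop\bin\winutils.exe (the path is illustrative, not from the original post):

import org.apache.spark.sql.SparkSession;

public class WindowsHadoopSetup {
    public static void main(String[] args) {
        // Illustrative layout: winutils.exe is assumed to live at C:\hadoop\bin\winutils.exe.
        // hadoop.home.dir must point at the Hadoop root folder, not at the bin directory itself,
        // and it must be set before the SparkSession (and its Hadoop classes) is created.
        System.setProperty("hadoop.home.dir", "C:\\hadoop");

        SparkSession spark = SparkSession.builder()
                .appName("demo")
                .master("local[*]")
                .getOrCreate();

        // ... reads and writes work as usual from here ...
        spark.stop();
    }
}

Setting the HADOOP_HOME environment variable to the same root folder works equally well; the key point, as noted above, is that it must point at the Hadoop root, not at the bin directory.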

After reading the CSV file and performing some transformations, we can write the result back out to CSV:

// Read the input CSV with an explicit schema, dropping rows that fail to parse
Dataset<Row> df = spark.read().option("mode", "DROPMALFORMED").schema(schema).csv("C:\\In\\input.csv");
// Write the result back out; save() produces a directory of part files at this path
df.write().format("csv").save("C:\\Out\\output.csv");

Comments:

As far as I can tell, there is no obvious error in this code. Could you try to reduce it to a minimal reproducible example, and possibly add the relevant context? On a side note - 2.1 is a very old release (2.4 is already out), and it is outdated even within the 2.1 branch.

Thank you very much for your comment - it turns out I hadn't put the Hadoop winutils.exe file in the right place.

Glad you solved it. For future reference - if you develop on MS Windows, it is best to mention that in the question. See: Spark and Windows.

The full stack trace from the question:
Exception in thread "main" org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:147)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:121)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:121)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:121)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:101)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:87)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:87)
at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:492)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:215)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:198)
at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:579)
at com.example.demo.DemoApplication.deleteCleanedAndUnsubscribedFromCustomers(DemoApplication.java:114)
at com.example.demo.DemoApplication.main(DemoApplication.java:124)

Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 8.0 failed 1 times, most recent failure: Lost task 0.0 in stage 8.0 (TID 8, localhost, executor driver): java.lang.NullPointerException
at java.lang.ProcessBuilder.start(Unknown Source)
at org.apache.hadoop.util.Shell.runCommand(Shell.java:404)
at org.apache.hadoop.util.Shell.run(Shell.java:379)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:589)
at org.apache.hadoop.util.Shell.execCommand(Shell.java:678)
at org.apache.hadoop.util.Shell.execCommand(Shell.java:661)
at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:639)
at org.apache.hadoop.fs.FilterFileSystem.setPermission(FilterFileSystem.java:468)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:456)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:424)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:905)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:886)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:783)
at org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.getRecordWriter(TextOutputFormat.java:132)
at org.apache.spark.sql.execution.datasources.csv.CsvOutputWriter.<init>(CSVRelation.scala:208)
at org.apache.spark.sql.execution.datasources.csv.CSVOutputWriterFactory.newInstance(CSVRelation.scala:178)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$SingleDirectoryWriteTask.<init>(FileFormatWriter.scala:234)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:182)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$3.apply(FileFormatWriter.scala:129)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$3.apply(FileFormatWriter.scala:128)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:127)
... 22 more

Caused by: java.lang.NullPointerException
at java.lang.ProcessBuilder.start(Unknown Source)
at org.apache.hadoop.util.Shell.runCommand(Shell.java:404)
at org.apache.hadoop.util.Shell.run(Shell.java:379)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:589)
at org.apache.hadoop.util.Shell.execCommand(Shell.java:678)
at org.apache.hadoop.util.Shell.execCommand(Shell.java:661)
at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:639)
at org.apache.hadoop.fs.FilterFileSystem.setPermission(FilterFileSystem.java:468)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:456)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:424)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:905)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:886)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:783)
at org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.getRecordWriter(TextOutputFormat.java:132)
at org.apache.spark.sql.execution.datasources.csv.CsvOutputWriter.<init>(CSVRelation.scala:208)
at org.apache.spark.sql.execution.datasources.csv.CSVOutputWriterFactory.newInstance(CSVRelation.scala:178)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$SingleDirectoryWriteTask.<init>(FileFormatWriter.scala:234)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:182)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$3.apply(FileFormatWriter.scala:129)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$3.apply(FileFormatWriter.scala:128)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)