Scala:处理文件名中包含冒号 ':' 的文件

Scala 与冒号抗争';:';以文件名,scala,hadoop,apache-spark,Scala,Hadoop,Apache Spark,我有以下代码,用于加载大量“csv.gz”,并将它们转储到其他文件夹中,源文件名作为列 object DailyMerger extends App { def allFiles(path:File):List[File]= { val parts=path.listFiles.toList.partition(_.isDirectory) parts._2 ::: parts._1.flatMap(allFiles) } val sqlContext = Spar

我有以下代码,用于加载大量“csv.gz”,并将它们转储到其他文件夹中,源文件名作为列

/**
 * Loads every ".csv.gz" file found under /Logs/, tags each row with the
 * name of its source file (column SENSOR) and writes the merged result,
 * gzip-compressed, to /tmp/out.
 *
 * NOTE(review): Hadoop's Path cannot represent ':' in file names
 * (see the URISyntaxException in the question), so input files whose
 * names contain colons still need to be renamed or served from a
 * non-HDFS filesystem before this job can read them.
 */
object DailyMerger {

  /**
   * Recursively lists all regular files under `path`, depth-first.
   *
   * @param path root directory to scan
   * @return every non-directory file below `path`; empty if `path` does
   *         not exist or is not a readable directory (in that case
   *         `File.listFiles` returns null, which we absorb via Option)
   */
  def allFiles(path: File): List[File] = {
    val entries = Option(path.listFiles).map(_.toList).getOrElse(Nil)
    val (dirs, files) = entries.partition(_.isDirectory)
    files ::: dirs.flatMap(allFiles)
  }

  // Explicit main instead of `extends App`: the App trait runs the body in
  // a delayed initializer, which has well-known pitfalls with field
  // initialization order and Spark closure serialization.
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DailyMerger").master("local").getOrCreate()

    try {
      val files = allFiles(new File("/Logs/"))
        .map(_.getAbsolutePath)
        .filter(_.endsWith(".csv.gz"))

      spark
        .read
        // Built-in CSV source (Spark 2.0+) — replaces the external
        // "com.databricks.spark.csv" package, which is unnecessary here
        // since the file already uses SparkSession (Spark 2.x API).
        .format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(files: _*)
        .withColumn("SENSOR", input_file_name())
        .write
        .option("header", "true")
        .option("compression", "gzip")
        .csv("/tmp/out")
    } finally {
      spark.stop() // release local Spark resources even when the job fails
    }
  }
}
它在我的测试数据上运行得很好。但在我的“真实”数据中,我有很多文件,文件名中包含“:”

当Hadoop尝试生成关联的crc文件时,会导致以下异常:

java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: .ac:a3:1e:c6:5c:7c.csv.gz.crc
    at org.apache.hadoop.fs.Path.initialize(Path.java:206)
    at org.apache.hadoop.fs.Path.<init>(Path.java:172)
    at org.apache.hadoop.fs.Path.<init>(Path.java:94)
    at org.apache.hadoop.fs.ChecksumFileSystem.getChecksumFile(ChecksumFileSystem.java:90)
    at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:145)
    at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:346)
    at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:766)
    at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.initialize(LineRecordReader.java:85)
    at org.apache.spark.sql.execution.datasources.HadoopFileLinesReader.<init>(HadoopFileLinesReader.scala:46)
    at org.apache.spark.sql.execution.datasources.text.TextFileFormat$$anonfun$buildReader$2.apply(TextFileFormat.scala:105)
    at org.apache.spark.sql.execution.datasources.text.TextFileFormat$$anonfun$buildReader$2.apply(TextFileFormat.scala:104)
    at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:136)
    at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:120)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:124)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:174)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:105)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:234)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:228)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:108)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.URISyntaxException: Relative path in absolute URI: .ac:a3:1e:c6:5c:7c.csv.gz.crc
    at java.net.URI.checkPath(URI.java:1823)
    at java.net.URI.<init>(URI.java:745)
    at org.apache.hadoop.fs.Path.initialize(Path.java:203)
java.lang.IllegalArgumentException:java.net.URISyntaxException:绝对URI中的相对路径:.ac:a3:1e:c6:5c:7c.csv.gz.crc
位于org.apache.hadoop.fs.Path.initialize(Path.java:206)
位于org.apache.hadoop.fs.Path(Path.java:172)
位于org.apache.hadoop.fs.Path(Path.java:94)
位于org.apache.hadoop.fs.ChecksumFileSystem.getChecksumFile(ChecksumFileSystem.java:90)
位于org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker。(ChecksumFileSystem.java:145)
位于org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:346)
位于org.apache.hadoop.fs.FileSystem.open(FileSystem.java:766)
位于org.apache.hadoop.mapreduce.lib.input.LineRecordReader.initialize(LineRecordReader.java:85)
位于org.apache.spark.sql.execution.datasources.HadoopFileLinesReader。(HadoopFileLinesReader.scala:46)
位于org.apache.spark.sql.execution.datasources.text.TextFileFormat$$anonfun$buildReader$2.apply(TextFileFormat.scala:105)
位于org.apache.spark.sql.execution.datasources.text.TextFileFormat$$anonfun$buildReader$2.apply(TextFileFormat.scala:104)
位于org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:136)
位于org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:120)
位于org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:124)
位于org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:174)
位于org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:105)
位于org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(未知源)
位于org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
位于org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
位于org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:234)
位于org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:228)
位于org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
位于org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
位于org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
位于org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
位于org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
位于org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
位于org.apache.spark.scheduler.Task.run(Task.scala:108)
位于org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
位于java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
位于java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
位于java.lang.Thread.run(Thread.java:748)
原因:java.net.URISyntaxException:绝对URI中的相对路径:.ac:a3:1e:c6:5c:7c.csv.gz.crc
位于java.net.URI.checkPath(URI.java:1823)
位于java.net.URI。(URI.java:745)
位于org.apache.hadoop.fs.Path.initialize(Path.java:203)

重命名输入文件不是一个选项,我剩下的是什么?

正如 @tzach zohar 所说,能做的不多。解决这个问题的尝试由来已久,但这是一个相当棘手的问题。

相关的JIRA包括:

由于所有JIRAs仍然打开,我想说重命名文件或使用HDFS以外的其他工具是唯一的选择。

这表明,如果不更改文件名,您几乎无能为力——HDFS 无法处理这类文件名;但也许其他回答者有解决的办法。