Java 使用azure blob存储进行spark检查点
我想在 Spark Structured Streaming(结构化流)应用程序中使用 Azure 存储作为检查点位置。我看过一些关于读写 Azure 存储的文章,但没有看到有人解释如何把 Azure 存储用作检查点位置。下面是我的简单代码:从一个 Kafka 主题读取数据,再写回另一个主题,并添加了检查点位置。
标签:java, apache-spark, azure-storage-blobs, spark-structured-streaming
// Spark configuration: local master, the storage-account access key, and the
// implementation class registered under fs.wasbs.impl for wasbs:// URIs.
SparkConf conf = new SparkConf()
    .setMaster("local[*]")
    .set("fs.azure.account.key.<storage-name>.blob.core.windows.net", "<storage-key>")
    .set("fs.wasbs.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem");

SparkSession spark = SparkSession.builder()
    .appName("app-name")
    .config(conf)
    .getOrCreate();

// Streaming source: subscribe to the Kafka topic "input" on localhost:9092.
Dataset<Row> kafkaInput = spark.readStream()
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "input")
    .load();

// Cast the key and value columns to strings before writing them back out.
Dataset<Row> stringRecords =
    kafkaInput.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)");

// Streaming sink: publish to the Kafka topic "output", with the checkpoint
// location pointing at a wasbs:// path in the Azure Blob Storage container.
StreamingQuery query = stringRecords.writeStream()
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "output")
    .option("checkpointLocation", "wasbs://<container-name>@<storage-account-name>.blob.core.windows.net/<directory-name>")
    .start();

// Block the calling thread until the streaming query terminates.
query.awaitTermination();
运行上述代码时抛出以下异常:
Exception in thread "main" java.lang.IllegalArgumentException: Self-suppression not permitted
at java.lang.Throwable.addSuppressed(Throwable.java:1043)
at java.io.FilterOutputStream.close(FilterOutputStream.java:159)
at org.apache.hadoop.fs.azure.NativeAzureFileSystem$NativeAzureFsOutputStream.close(NativeAzureFileSystem.java:818)
at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.close(FSDataOutputStream.java:72)
at org.apache.hadoop.fs.FSDataOutputStream.close(FSDataOutputStream.java:106)
at org.apache.commons.io.IOUtils.closeQuietly(IOUtils.java:339)
at org.apache.commons.io.IOUtils.closeQuietly(IOUtils.java:298)
at org.apache.spark.sql.execution.streaming.StreamMetadata$.write(StreamMetadata.scala:85)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$2.apply(StreamExecution.scala:124)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$2.apply(StreamExecution.scala:122)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.streaming.StreamExecution.<init>(StreamExecution.scala:122)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.<init>(MicroBatchExecution.scala:49)
at org.apache.spark.sql.streaming.StreamingQueryManager.createQuery(StreamingQueryManager.scala:258)
at org.apache.spark.sql.streaming.StreamingQueryManager.startQuery(StreamingQueryManager.scala:299)
at org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:296)
at com.test.Test.main(Test.java:73)
Caused by: java.io.IOException: Stream is already closed.
at com.microsoft.azure.storage.blob.BlobOutputStreamInternal.close(BlobOutputStreamInternal.java:332)
at java.io.FilterOutputStream.close(FilterOutputStream.java:159)
at org.apache.hadoop.fs.azure.NativeAzureFileSystem$NativeAzureFsOutputStream.close(NativeAzureFileSystem.java:818)
at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.close(FSDataOutputStream.java:72)
at org.apache.hadoop.fs.FSDataOutputStream.close(FSDataOutputStream.java:106)
at sun.nio.cs.StreamEncoder.implClose(StreamEncoder.java:320)
at sun.nio.cs.StreamEncoder.close(StreamEncoder.java:149)
at java.io.OutputStreamWriter.close(OutputStreamWriter.java:233)
at com.fasterxml.jackson.core.json.WriterBasedJsonGenerator.close(WriterBasedJsonGenerator.java:883)
at com.fasterxml.jackson.databind.ObjectMapper._configAndWriteValue(ObjectMapper.java:3561)
at com.fasterxml.jackson.databind.ObjectMapper.writeValue(ObjectMapper.java:2909)
at org.json4s.jackson.Serialization$.write(Serialization.scala:27)
at org.apache.spark.sql.execution.streaming.StreamMetadata$.write(StreamMetadata.scala:78)
... 9 more
使用的版本:
Spark: 2.3.0
hadoop-azure: 2.7
azure-storage: 8.0