Apache Spark Structured Streaming: batch DataFrame refresh issue (partitionBy clause)
I am running into a problem when joining a Spark Structured Streaming DataFrame with a batch DataFrame. In my scenario I have an S3 stream that needs a left-anti join against history data, returning the records that do not exist in history (i.e. finding the new records). I write these records to history as a new append, partitioned by column (disk data partitioning, not in-memory partitioning).

When I refresh the partitioned history DataFrame, it does not pick up the newly appended data.

Below are two pieces of code, one that works and one that does not. The only difference between the working and the non-working code is the partitionBy clause.

Working code (refreshes history): first snippet below.
Not working (history is not refreshed): second snippet below.

Thanks,
Sri
I solved this by using the unionByName function instead of reading the refreshed data back from disk.

Step 1: read the history from S3.
Step 2: read from Kafka and look up against the history.
Step 3: write the processed data to disk, and append it to the DataFrame created in step 1 with Spark's unionByName function.

The code for step 1 (reading the history DataFrame) and step 2 (refreshing the history DataFrame with streaming data) appears after the two question snippets below.

Thanks
Comments:
— Sri, when you refresh your history DataFrame, how exactly do you do it? That is the key here (and perhaps the reason Delta Lake exists).
— If you look at my code, I write the history inside foreachBatch and update it there. By "update" I mean refreshing the DataFrame: after unpersisting and persisting it again, the newly appended data shows up in the Spark DataFrame. We partition the underlying folder; when the folder is not partitioned, the refresh works fine.
Working code (refreshes history; note the partitionBy line is commented out):

import org.apache.spark.sql.{DataFrame, SaveMode}
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import spark.implicits._
val inputSchema = StructType(
Array(
StructField("spark_id", StringType),
StructField("account_id", StringType),
StructField("run_dt", StringType),
StructField("trxn_ref_id", StringType),
StructField("trxn_dt", StringType),
StructField("trxn_amt", StringType)
)
)
val historySchema = StructType(
Array(
StructField("spark_id", StringType),
StructField("account_id", StringType),
StructField("run_dt", StringType),
StructField("trxn_ref_id", StringType),
StructField("trxn_dt", StringType),
StructField("trxn_amt", StringType)
)
)
val source = spark.readStream
.schema(inputSchema)
.option("header", "false")
.csv("src/main/resources/Input/")
val history = spark.read
.schema(historySchema)
.option("header", "true")
.csv("src/main/resources/history/")
.withColumnRenamed("spark_id", "spark_id_2")
.withColumnRenamed("account_id", "account_id_2")
.withColumnRenamed("run_dt", "run_dt_2")
.withColumnRenamed("trxn_ref_id", "trxn_ref_id_2")
.withColumnRenamed("trxn_dt", "trxn_dt_2")
.withColumnRenamed("trxn_amt", "trxn_amt_2")
val readFilePersisted = history.persist()
readFilePersisted.createOrReplaceTempView("hist")
val recordsNotPresentInHist = source
.join(
history,
source.col("account_id") === history.col("account_id_2") &&
source.col("run_dt") === history.col("run_dt_2") &&
source.col("trxn_ref_id") === history.col("trxn_ref_id_2") &&
source.col("trxn_dt") === history.col("trxn_dt_2") &&
source.col("trxn_amt") === history.col("trxn_amt_2"),
"leftanti"
)
recordsNotPresentInHist.writeStream
.foreachBatch { (batchDF: DataFrame, batchId: Long) =>
batchDF.write
.mode(SaveMode.Append)
//.partitionBy("spark_id", "account_id", "run_dt")
.csv("src/main/resources/history/")
// re-read the full history folder, in the same CSV format it is written in
val lkpCacheFileDf = spark.read
.schema(historySchema)
.csv("src/main/resources/history")
// unpersist is a no-op on a freshly read DataFrame; replacing the temp view
// with the newly persisted read is what exposes the appended data
lkpCacheFileDf.unpersist(true)
val histLkpPersist = lkpCacheFileDf.persist()
histLkpPersist.createOrReplaceTempView("hist")
}
.start()
println("This is the kafka dataset:")
source
.withColumn("Input", lit("Input-source"))
.writeStream
.format("console")
.outputMode("append")
.start()
recordsNotPresentInHist
.withColumn("reject", lit("recordsNotPresentInHist"))
.writeStream
.format("console")
.outputMode("append")
.start()
spark.streams.awaitAnyTermination()
Non-working code (history is not refreshed; the only change is the active partitionBy):

import org.apache.spark.sql.{DataFrame, SaveMode}
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import spark.implicits._
val inputSchema = StructType(
Array(
StructField("spark_id", StringType),
StructField("account_id", StringType),
StructField("run_dt", StringType),
StructField("trxn_ref_id", StringType),
StructField("trxn_dt", StringType),
StructField("trxn_amt", StringType)
)
)
val historySchema = StructType(
Array(
StructField("spark_id", StringType),
StructField("account_id", StringType),
StructField("run_dt", StringType),
StructField("trxn_ref_id", StringType),
StructField("trxn_dt", StringType),
StructField("trxn_amt", StringType)
)
)
val source = spark.readStream
.schema(inputSchema)
.option("header", "false")
.csv("src/main/resources/Input/")
val history = spark.read
.schema(historySchema)
.option("header", "true")
.csv("src/main/resources/history/")
.withColumnRenamed("spark_id", "spark_id_2")
.withColumnRenamed("account_id", "account_id_2")
.withColumnRenamed("run_dt", "run_dt_2")
.withColumnRenamed("trxn_ref_id", "trxn_ref_id_2")
.withColumnRenamed("trxn_dt", "trxn_dt_2")
.withColumnRenamed("trxn_amt", "trxn_amt_2")
val readFilePersisted = history.persist()
readFilePersisted.createOrReplaceTempView("hist")
val recordsNotPresentInHist = source
.join(
history,
source.col("account_id") === history.col("account_id_2") &&
source.col("run_dt") === history.col("run_dt_2") &&
source.col("trxn_ref_id") === history.col("trxn_ref_id_2") &&
source.col("trxn_dt") === history.col("trxn_dt_2") &&
source.col("trxn_amt") === history.col("trxn_amt_2"),
"leftanti"
)
recordsNotPresentInHist.writeStream
.foreachBatch { (batchDF: DataFrame, batchId: Long) =>
batchDF.write
.mode(SaveMode.Append)
.partitionBy("spark_id", "account_id","run_dt")
.csv("src/main/resources/history/")
// re-read the full history folder, in the same CSV format it is written in
val lkpCacheFileDf = spark.read
.schema(historySchema)
.csv("src/main/resources/history")
// unpersist is a no-op on a freshly read DataFrame; replacing the temp view
// with the newly persisted read is what exposes the appended data
lkpCacheFileDf.unpersist(true)
val histLkpPersist = lkpCacheFileDf.persist()
histLkpPersist.createOrReplaceTempView("hist")
}
.start()
println("This is the kafka dataset:")
source
.withColumn("Input", lit("Input-source"))
.writeStream
.format("console")
.outputMode("append")
.start()
recordsNotPresentInHist
.withColumn("reject", lit("recordsNotPresentInHist"))
.writeStream
.format("console")
.outputMode("append")
.start()
spark.streams.awaitAnyTermination()
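A note on what partitionBy changes on disk, since it is the only difference between the two snippets: partitionBy("spark_id", "account_id", "run_dt") moves those three columns out of the data files and into directory names (spark_id=.../account_id=.../run_dt=.../part-*.csv). When the folder is read back, the files themselves no longer contain those columns; Spark restores them through partition discovery and appends them after the file columns, so the refreshed DataFrame's layout is not necessarily identical to the flat one. A minimal sketch of reading the partitioned layout back, assuming the same local paths as in the snippets above (the schema lists only the columns that physically remain in the files):

import org.apache.spark.sql.types.{StringType, StructField, StructType}

// only these columns remain inside each CSV file after
// partitionBy("spark_id", "account_id", "run_dt")
val fileSchema = StructType(
Array(
StructField("trxn_ref_id", StringType),
StructField("trxn_dt", StringType),
StructField("trxn_amt", StringType)
)
)
val partitionedHistory = spark.read
.schema(fileSchema)
.option("basePath", "src/main/resources/history")
.csv("src/main/resources/history")
// spark_id, account_id and run_dt come back from the directory names,
// appended after the file columns (their types are inferred from the paths)
partitionedHistory.printSchema()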
Answer code — step 1 (reading the history DataFrame):

val acctHistDF = sparkSession.read
.schema(schema)
.parquet(S3path)
val acctHistDFPersisted = acctHistDF.persist()
acctHistDFPersisted.createOrReplaceTempView("acctHist")
Step 2 (refreshing the history DataFrame with streaming data):

val history = sparkSession.table("acctHist")
// unionByName returns a new DataFrame; capture it rather than discarding the result
val refreshedHistory = history.unionByName(stream)
refreshedHistory.createOrReplaceTempView("acctHist")
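Putting the answer together, a sketch of how the two pieces above can sit around foreachBatch. The names sparkSession, schema, S3path, recordsNotPresentInHist and the partition columns are carried over from the snippets above; this illustrates the unionByName approach rather than reproducing the exact job:

import org.apache.spark.sql.{DataFrame, SaveMode}

// step 1: read the history once and register it
val acctHistDF = sparkSession.read
.schema(schema)
.parquet(S3path)
acctHistDF.persist()
acctHistDF.createOrReplaceTempView("acctHist")

recordsNotPresentInHist.writeStream
.foreachBatch { (batchDF: DataFrame, batchId: Long) =>
// append the new records to the partitioned history on disk, as before
batchDF.write
.mode(SaveMode.Append)
.partitionBy("spark_id", "account_id", "run_dt")
.parquet(S3path)
// refresh the in-memory history by unioning the batch in by column name,
// instead of re-reading the partitioned folder from disk
val history = sparkSession.table("acctHist")
history.unionByName(batchDF).createOrReplaceTempView("acctHist")
}
.start()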