How to save the output of multiple queries to a single JSON file in append mode using Spark Scala


I have the following 5 queries:

select * from table1  
select * from table2  
select * from table3  
select * from table4  
select * from table5  
Now, what I want is to execute these queries one by one in sequence and keep appending their output to a single JSON file. I wrote the code below, but it stores the output of each query in a separate part file instead of a single one.
Here is my code:

def store(jobEntity: JobDetails, jobRunId: Int): Unit = {
    UDFUtil.registerUdfFunctions()
    var outputTableName: String = null
    val jobQueryMap = jobEntity.jobQueryList.map(jobQuery => (jobQuery.sequenceId, jobQuery))
    val sortedQueries = scala.collection.immutable.TreeMap(jobQueryMap.toSeq: _*).toMap
    LOGGER.debug("sortedQueries ===>" + sortedQueries)
    try {
      outputTableName = jobEntity.destinationEntity
      var resultDF: DataFrame = null
      sortedQueries.values.foreach(jobQuery => {
        LOGGER.debug(s"jobQuery.query ===> ${jobQuery.query}")
        resultDF = SparkSession.builder.getOrCreate.sqlContext.sql(jobQuery.query)

        if (jobQuery.partitionColumn != null && !jobQuery.partitionColumn.trim.isEmpty) {
          resultDF = resultDF.repartition(jobQuery.partitionColumn.split(",").map(col): _*)
        }
        if (jobQuery.isKeepInMemory) {
          resultDF = resultDF.persist(StorageLevel.MEMORY_AND_DISK_SER)
        }
        if (jobQuery.isCheckpointEnabled) {
          val checkpointDir = ApplicationConfig.getAppConfig(JobConstants.CHECKPOINT_DIR)
          val fs = FileSystem.get(new Storage(JsonUtil.toMap[String](jobEntity.sourceConnection)).asHadoopConfig())
          val path = new Path(checkpointDir)
          if (!fs.exists(path)) {
            fs.mkdirs(path)
          }
          resultDF.explain(true)
          SparkSession.builder.getOrCreate.sparkContext.setCheckpointDir(checkpointDir)
          resultDF = resultDF.checkpoint
        }
        resultDF = {
          if (jobQuery.isBroadCast) {
            import org.apache.spark.sql.functions.broadcast
            broadcast(resultDF)
          } else
            resultDF
        }
        tempViewsList.+=(jobQuery.queryAliasName)
        resultDF.createOrReplaceTempView(jobQuery.queryAliasName)
        //      resultDF.explain(true)
        val map: Map[String, String] = JsonUtil.toMap[String](jobEntity.sinkConnection)
        LOGGER.debug("sink details :: " + map)
        if (resultDF != null && !resultDF.take(1).isEmpty) {
          resultDF.show(false)
          val sinkDetails = new Storage(JsonUtil.toMap[String](jobEntity.sinkConnection))
          val path = sinkDetails.basePath + File.separator + jobEntity.destinationEntity
          println("path::: " + path)
          resultDF.repartition(1).write.mode(SaveMode.Append).json(path)
        }
      })
      // catch/finally, auditing and cleanup are omitted from this snippet
    }
}

Just ignore the other things I am doing around the read and write in this method (checkpointing, logging, auditing).
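
For context on why the code above ends up with several files: every DataFrameWriter call in append mode adds its own new part file(s) to the target directory, and repartition(1) only guarantees one part file per write, not one file in total. A minimal sketch of that behaviour (the query list and output path below are made up for illustration):

import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder.getOrCreate()

// Hypothetical queries and output path, just to illustrate the behaviour.
val queries = Seq("select * from table1", "select * from table2")
val outputPath = "/tmp/json_out"

queries.foreach { q =>
  // Each iteration appends a brand new part-xxxxx file under outputPath,
  // so after N iterations the directory holds N part files.
  spark.sql(q).repartition(1).write.mode(SaveMode.Append).json(outputPath)
}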

First, you need to merge all the schemas:

import org.apache.spark.sql.functions._

// (assumes a spark-shell style session where sc and the toDF implicits are in scope)
val df1 = sc.parallelize(List(
  (42, 11),
  (43, 21)
)).toDF("foo", "bar")

val df2 = sc.parallelize(List(
  (44, true, 1.0),
  (45, false, 3.0)
)).toDF("foo", "foo0", "foo1")

val cols1 = df1.columns.toSet
val cols2 = df2.columns.toSet
val total = cols1 ++ cols2 // union of the two column sets

// Select every column from the combined set, filling the columns a DataFrame
// is missing with null, so both sides end up with the same schema.
def expr(myCols: Set[String], allCols: Set[String]) = {
  allCols.toList.map {
    case x if myCols.contains(x) => col(x)
    case x => lit(null).as(x)
  }
}

// union (unionAll on Spark 1.x) resolves columns by position, hence the shared select list.
val merged = df1.select(expr(cols1, total): _*).union(df2.select(expr(cols2, total): _*))

merged.show()
And obviously save the result to a single JSON file:

merged.coalesce(1).write.mode("append").json("/some/path")
UPD

If you are not working with DataFrames and only have plain SQL queries, build the union inside a single SQL query; writing to a single file stays the same (coalesce(1) or repartition(1)):
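
A rough sketch of that approach, assuming the five tables from the question share a compatible schema (the output path is only an example):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder.getOrCreate()

// One UNION ALL query instead of five separate writes
// (assumes table1..table5 have compatible schemas).
val combined = spark.sql(
  """select * from table1
    |union all select * from table2
    |union all select * from table3
    |union all select * from table4
    |union all select * from table5""".stripMargin)

// A single write of a single partition yields one part file.
combined.coalesce(1).write.mode("append").json("/some/path")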


Use the following example as a reference for your question.

I have three tables containing JSON data (each with a different schema), as shown below:

  • table1 --> personal data table
  • table2 --> company data table
  • table3 --> salary data table
  • As you asked, I am reading these three tables one by one in sequential mode and doing some transformations on the data (exploding the JSON array column) with the help of the list TableColList, which holds each table name and its array column name separated by a colon (":").

    OutDFList is the list of all the transformed DataFrames.

    Finally, I reduce all the DataFrames from OutDFList into a single DataFrame and write it to one JSON file.

    Note: I have reduced all the DataFrames using join; you could also use union (if they have the same columns) or whatever suits your requirement. A sketch of the union variant and a single-file append write follows the transcript below.

    Check the code below:

    scala> spark.sql("select * from table1").printSchema
    root
     |-- Personal: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- DOB: string (nullable = true)
     |    |    |-- EmpID: string (nullable = true)
     |    |    |-- Name: string (nullable = true)
    
    
    scala> spark.sql("select * from table2").printSchema
    root
     |-- Company: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- EmpID: string (nullable = true)
     |    |    |-- JoinDate: string (nullable = true)
     |    |    |-- Project: string (nullable = true)
    
    
    scala> spark.sql("select * from table3").printSchema
    root
     |-- Salary: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- EmpID: string (nullable = true)
     |    |    |-- Monthly: string (nullable = true)
     |    |    |-- Yearly: string (nullable = true)
    
    scala> val TableColList = List("table1:Personal", "table2:Company", "table3:Salary")
    TableColList: List[String] = List(table1:Personal, table2:Company, table3:Salary)
    
    
    scala>  val OutDFList = TableColList.map{ X =>
         |  val table = X.split(":")(0)
         |  val arrayColumn = X.split(":")(1)
         |  val df = spark.sql(s"""SELECT * FROM """ + table).select(explode(col(arrayColumn)) as "data").select("data.*")
         | df}
    OutDFList: List[org.apache.spark.sql.DataFrame] = List([DOB: string, EmpID: string ... 1 more field], [EmpID: string, JoinDate: string ... 1 more field], [EmpID: string, Monthly: string ... 1 more field])
    
    scala> val FinalOutDF  = OutDFList.reduce((df1, df2) => df1.join(df2, "EmpID"))
    FinalOutDF: org.apache.spark.sql.DataFrame = [EmpID: string, DOB: string ... 5 more fields]
    
    scala> FinalOutDF.printSchema
    root
     |-- EmpID: string (nullable = true)
     |-- DOB: string (nullable = true)
     |-- Name: string (nullable = true)
     |-- JoinDate: string (nullable = true)
     |-- Project: string (nullable = true)
     |-- Monthly: string (nullable = true)
     |-- Yearly: string (nullable = true)
    
    
    scala> FinalOutDF.write.json("/FinalJsonOut")
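
    As mentioned in the note above, when the exploded DataFrames share identical columns you can reduce them with union instead of join, and to match the original requirement (one JSON file, written in append mode) you would collapse to a single partition before writing. A rough sketch, reusing the names from the transcript above:

    // Alternative reduce step, only valid when all DataFrames have identical columns
    // (here they do not, which is why the transcript joins on EmpID):
    // val FinalOutDF = OutDFList.reduce((a, b) => a.union(b))

    // coalesce(1) gives one part file per write; each appended run still adds
    // a new part file, so write the combined result once per batch.
    FinalOutDF.coalesce(1).write.mode("append").json("/FinalJsonOut")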
    

Do they all have the same schema? If so, why don't you just union all the DataFrames?
No, they have different schemas. Thanks for your help @Artem. In my question I don't have separate DFs, so how would I use union? Also, I need the data under the directory in a single file, and I don't see anything in your code that writes a single file (coalesce(1) / repartition(1)).
If you are trying to avoid DFs, just build the union in plain SQL; check the UPD part.
I tried that. In practice the 5 different queries create 5 different files, regardless of repartition or coalesce.
@Debuggerrr try running the union in a single query (see the UPD section) and write its result to disk with coalesce(1); then you are dealing with only one DF.
That is what I needed. Thanks @Nikk.