分解深度嵌套的JSON,返回Spark Scala中的重复项
我有一个实用程序,它可以很好地解析简单的 JSON;但当 JSON 中存在多个 array[struct] 字段时,代码中的 crossJoin 会产生笛卡尔积式的重复行。我也尝试过用 distinct() 或 dropDuplicates() 去掉这些重复项,但返回的 DataFrame 为空。该实用程序的代码如下:
/**
 * Recursively flattens a DataFrame whose schema contains nested structs
 * and/or arrays.
 *
 * Leaf (simple) columns are selected directly. Every complex column
 * (an exploded array, as produced by flattenSchema) is cross-joined back
 * onto the accumulated result, and the intermediate frame is flattened
 * again until no nesting remains.
 *
 * NOTE(review): `crossJoin` yields a Cartesian product, so with more than
 * one array column the output contains duplicated row combinations — this
 * is the behavior the surrounding question is about.
 *
 * @param df input DataFrame, possibly nested
 * @return a DataFrame with no struct/array/map columns at the top level
 */
def flattenDataFrame(df: DataFrame): DataFrame = {
  if (!isNested(df)) {
    df
  } else {
    // Split the flattened schema into complex (to-be-exploded) and simple columns,
    // preserving schema order on both sides.
    val (complexCols, simpleCols) =
      flattenSchema(df.schema).partition { case (_, isComplex) => isComplex }

    // Start from the projection of simple columns, then fold each complex
    // column in: cross-join its exploded projection and re-flatten the
    // intermediate frame before handling the next one.
    val base = df.select(simpleCols.map { case (c, _) => c }: _*)
    complexCols.foldLeft(base) { case (acc, (c, _)) =>
      flattenDataFrame(acc.crossJoin(df.select(c)))
    }
  }
}
/**
 * Recursively walks a schema and produces one Column per field, paired with
 * a flag telling whether the column is "complex" (needs further flattening).
 *
 * - Array fields yield `explode_outer(column)` aliased with dots replaced by
 *   underscores, flagged `true`.
 * - Struct fields are recursed into, carrying the dotted path as the prefix.
 * - All other fields yield a plain column aliased with underscores and an
 *   "encoding" -> "ZSTD" metadata entry, flagged `false`.
 *
 * Fix: the original trailing `.filter(field => field != None)` was removed —
 * a `(Column, Boolean)` tuple can never equal `None` (unrelated types), so
 * the filter was dead code that always kept every element.
 *
 * @param schema the schema (or nested struct) to flatten
 * @param prefix dotted path of the enclosing struct, or null at the top level
 * @return (column, isComplex) pairs in schema order
 */
private def flattenSchema(schema: StructType, prefix: String = null): Array[(Column, Boolean)] = {
  schema.fields.flatMap { field =>
    val columnName = if (prefix == null) field.name else s"$prefix.${field.name}"
    field.dataType match {
      case _: ArrayType =>
        // explode_outer keeps rows whose array is null/empty.
        Array((explode_outer(col(columnName)).as(columnName.replace(".", "_")), true))
      case structType: StructType =>
        // Descend into the struct; the dotted path becomes the new prefix.
        flattenSchema(structType, columnName)
      case _ =>
        val columnNameWithUnderscores = columnName.replace(".", "_")
        // NOTE(review): metadata marks the column for ZSTD encoding downstream —
        // the consumer of this metadata is not visible here; confirm it is used.
        val metadata = new MetadataBuilder().putString("encoding", "ZSTD").build()
        Array((col(columnName).as(columnNameWithUnderscores, metadata), false))
    }
  }
}
/**
 * Returns true when any top-level field of the DataFrame's schema is a
 * complex type (array, map, or struct), i.e. the frame still needs flattening.
 */
def isNested(df: DataFrame): Boolean =
  df.schema.fields.exists { field =>
    field.dataType match {
      case _: ArrayType | _: MapType | _: StructType => true
      case _                                         => false
    }
  }
我所面临问题的JSON示例:
[
{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil's Food" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
},
{
"id": "0002",
"type": "donut",
"name": "Raised",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
]
下面是一个不使用交叉连接的解决方案(抱歉,在 Stack Overflow 上排版不太好):

def flattenDataFrame(df: DataFrame): DataFrame = {
  val flattenedDf: DataFrame = df
  if (isNested(df)) {
    val flattenedSchema: Array[(Column, Boolean)] = flattenSchema(flattenedDf.schema)
    var simpleColumns: List[Column] = List.empty[Column]
    var complexColumns: List[Column] = List.empty[Column]
    flattenedSchema.foreach {
      case (col, isComplex) =>
        if (isComplex) {
          complexColumns = complexColumns :+ col
        } else {
          simpleColumns = simpleColumns :+ col
        }
    }
    // 把复杂列本身也保留在工作 DataFrame 中,这样每个数组列都可以
    // 在同一个 DataFrame 上就地 explode(每个 select 只能有一个生成器),
    // 从而避免 crossJoin 造成的笛卡尔积重复。
    val complexUnderlyingCols = complexColumns.map { column =>
      val name = column.expr.asInstanceOf[UnresolvedAttribute].name
      val unquotedColName = s"${name.replaceAll("`", "")}"
      val explodeSelectColName = s"`${name.replaceAll("`", "")}`"
      (unquotedColName, col(name).as(unquotedColName), explode_outer(col(explodeSelectColName)).as(unquotedColName))
    }
    var joinDataFrame = flattenedDf.select(simpleColumns ++ complexUnderlyingCols.map(_._2): _*)
    complexUnderlyingCols.foreach { case (name, tempCol, column) =>
      val nonTransformedColumns = joinDataFrame.schema.fieldNames
        .diff(List(name))
        .map(fieldName => s"`${fieldName.replaceAll("`", "")}`")
        .map(col)
      joinDataFrame = joinDataFrame.select(nonTransformedColumns :+ column: _*)
    }
    flattenDataFrame(joinDataFrame)
  } else {
    flattenedDf
  }
}

private def flattenSchema(schema: StructType, prefix: String = null, level: Int = 0): Array[(Column, Boolean)] = {
  val unquotedPrefix = if (prefix != null) prefix.replaceAll("`", "") else null
  schema.fields.flatMap(field => {
    val fieldName = field.name
    val columnName = if (level == 0) {
      s"`$fieldName`"
    } else {
      val fullName = s"$unquotedPrefix.$fieldName"
      // 只给当前嵌套层级及更深的路径段加反引号
      fullName.split('.').zipWithIndex.map { case (part, index) =>
        if (index >= level) s"`$part`" else part
      }.mkString(".")
    }
    val unquotedColumnName = columnName.replaceAll("`", "")
    field.dataType match {
      case _: ArrayType =>
        // 这里只返回列本身;explode 函数会在展开 DataFrame 时生成
        Array((col(columnName), true))
      case structType: StructType =>
        flattenSchema(structType, columnName, level + 1)
      case _ =>
        val metadata = new MetadataBuilder().putString("encoding", "ZSTD").build()
        Array((col(columnName).as(unquotedColumnName, metadata), false))
    }
  })
}

def isNested(df: DataFrame): Boolean = {
  df.schema.fields.exists { field =>
    field.dataType match {
      case _: ArrayType | _: MapType | _: StructType => true
      case _ => false
    }
  }
}

(注:原答案的这一部分在抓取时排版严重损坏,以上代码是按残留片段重排的重建版本;flattenSchema 中列名加反引号的具体细节请以原始答案为准。)