分解深度嵌套的JSON,返回Spark Scala中的重复项
我有一个实用程序,它可以很好地解析简单的 JSON;但当 JSON 中存在多个 array[struct] 字段时,代码中的 crossJoin 会产生笛卡尔积式的重复行。我也尝试过用 distinct() 或 dropDuplicates() 去掉这些重复项,但返回的 DataFrame 为空。该实用程序的代码如下:
/**
 * Recursively flattens a DataFrame whose schema contains nested structs
 * and/or arrays.
 *
 * Leaf (simple) columns are selected directly. Every complex column
 * (an exploded array, as produced by flattenSchema) is cross-joined back
 * onto the accumulated result, and the intermediate frame is flattened
 * again until no nesting remains.
 *
 * NOTE(review): `crossJoin` yields a Cartesian product, so with more than
 * one array column the output contains duplicated row combinations — this
 * is the behavior the surrounding question is about.
 *
 * @param df input DataFrame, possibly nested
 * @return a DataFrame with no struct/array/map columns at the top level
 */
def flattenDataFrame(df: DataFrame): DataFrame = {
  if (!isNested(df)) {
    df
  } else {
    // Split the flattened schema into complex (to-be-exploded) and simple columns,
    // preserving schema order on both sides.
    val (complexCols, simpleCols) =
      flattenSchema(df.schema).partition { case (_, isComplex) => isComplex }

    // Start from the projection of simple columns, then fold each complex
    // column in: cross-join its exploded projection and re-flatten the
    // intermediate frame before handling the next one.
    val base = df.select(simpleCols.map { case (c, _) => c }: _*)
    complexCols.foldLeft(base) { case (acc, (c, _)) =>
      flattenDataFrame(acc.crossJoin(df.select(c)))
    }
  }
}
/**
 * Recursively walks a schema and produces one Column per field, paired with
 * a flag telling whether the column is "complex" (needs further flattening).
 *
 * - Array fields yield `explode_outer(column)` aliased with dots replaced by
 *   underscores, flagged `true`.
 * - Struct fields are recursed into, carrying the dotted path as the prefix.
 * - All other fields yield a plain column aliased with underscores and an
 *   "encoding" -> "ZSTD" metadata entry, flagged `false`.
 *
 * Fix: the original trailing `.filter(field => field != None)` was removed —
 * a `(Column, Boolean)` tuple can never equal `None` (unrelated types), so
 * the filter was dead code that always kept every element.
 *
 * @param schema the schema (or nested struct) to flatten
 * @param prefix dotted path of the enclosing struct, or null at the top level
 * @return (column, isComplex) pairs in schema order
 */
private def flattenSchema(schema: StructType, prefix: String = null): Array[(Column, Boolean)] = {
  schema.fields.flatMap { field =>
    val columnName = if (prefix == null) field.name else s"$prefix.${field.name}"
    field.dataType match {
      case _: ArrayType =>
        // explode_outer keeps rows whose array is null/empty.
        Array((explode_outer(col(columnName)).as(columnName.replace(".", "_")), true))
      case structType: StructType =>
        // Descend into the struct; the dotted path becomes the new prefix.
        flattenSchema(structType, columnName)
      case _ =>
        val columnNameWithUnderscores = columnName.replace(".", "_")
        // NOTE(review): metadata marks the column for ZSTD encoding downstream —
        // the consumer of this metadata is not visible here; confirm it is used.
        val metadata = new MetadataBuilder().putString("encoding", "ZSTD").build()
        Array((col(columnName).as(columnNameWithUnderscores, metadata), false))
    }
  }
}
/**
 * Returns true when any top-level field of the DataFrame's schema is a
 * complex type (array, map, or struct), i.e. the frame still needs flattening.
 */
def isNested(df: DataFrame): Boolean =
  df.schema.fields.exists { field =>
    field.dataType match {
      case _: ArrayType | _: MapType | _: StructType => true
      case _                                         => false
    }
  }
我所面临问题的JSON示例:
[
{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil's Food" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
},
{
"id": "0002",
"type": "donut",
"name": "Raised",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
]
下面是一个不使用交叉连接的解决方案(抱歉,在 Stack Overflow 上排版不太好):

def flattenDataFrame(df: DataFrame): DataFrame = {
  val flattenedDf: DataFrame = df
  if (isNested(df)) {
    val flattenedSchema: Array[(Column, Boolean)] = flattenSchema(flattenedDf.schema)
    var simpleColumns: List[Column] = List.empty[Column]
    var complexColumns: List[Column] = List.empty[Column]
    flattenedSchema.foreach {
      case (col, isComplex) =>
        if (isComplex) {
          complexColumns = complexColumns :+ col
        } else {
          simpleColumns = simpleColumns :+ col
        }
    }
    // 把复杂列本身也保留在工作 DataFrame 中,这样每个数组列都可以
    // 在同一个 DataFrame 上就地 explode(每个 select 只能有一个生成器),
    // 从而避免 crossJoin 造成的笛卡尔积重复。
    val complexUnderlyingCols = complexColumns.map { column =>
      val name = column.expr.asInstanceOf[UnresolvedAttribute].name
      val unquotedColName = s"${name.replaceAll("`", "")}"
      val explodeSelectColName = s"`${name.replaceAll("`", "")}`"
      (unquotedColName, col(name).as(unquotedColName), explode_outer(col(explodeSelectColName)).as(unquotedColName))
    }
    var joinDataFrame = flattenedDf.select(simpleColumns ++ complexUnderlyingCols.map(_._2): _*)
    complexUnderlyingCols.foreach { case (name, tempCol, column) =>
      val nonTransformedColumns = joinDataFrame.schema.fieldNames
        .diff(List(name))
        .map(fieldName => s"`${fieldName.replaceAll("`", "")}`")
        .map(col)
      joinDataFrame = joinDataFrame.select(nonTransformedColumns :+ column: _*)
    }
    flattenDataFrame(joinDataFrame)
  } else {
    flattenedDf
  }
}

private def flattenSchema(schema: StructType, prefix: String = null, level: Int = 0): Array[(Column, Boolean)] = {
  val unquotedPrefix = if (prefix != null) prefix.replaceAll("`", "") else null
  schema.fields.flatMap(field => {
    val fieldName = field.name
    val columnName = if (level == 0) {
      s"`$fieldName`"
    } else {
      val fullName = s"$unquotedPrefix.$fieldName"
      // 只给当前嵌套层级及更深的路径段加反引号
      fullName.split('.').zipWithIndex.map { case (part, index) =>
        if (index >= level) s"`$part`" else part
      }.mkString(".")
    }
    val unquotedColumnName = columnName.replaceAll("`", "")
    field.dataType match {
      case _: ArrayType =>
        // 这里只返回列本身;explode 函数会在展开 DataFrame 时生成
        Array((col(columnName), true))
      case structType: StructType =>
        flattenSchema(structType, columnName, level + 1)
      case _ =>
        val metadata = new MetadataBuilder().putString("encoding", "ZSTD").build()
        Array((col(columnName).as(unquotedColumnName, metadata), false))
    }
  })
}

def isNested(df: DataFrame): Boolean = {
  df.schema.fields.exists { field =>
    field.dataType match {
      case _: ArrayType | _: MapType | _: StructType => true
      case _ => false
    }
  }
}

(注:原答案的这一部分在抓取时排版严重损坏,以上代码是按残留片段重排的重建版本;flattenSchema 中列名加反引号的具体细节请以原始答案为准。)