Apache spark 在spark中,如何从结构中提取数组并使用它创建新字段

Apache spark 在spark中,如何从结构中提取数组并使用它创建新字段,apache-spark,apache-spark-sql,Apache Spark,Apache Spark Sql,我有一个带有模式的结构: root |-- id: long (nullable = true) |-- products: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- id: integer (nullable = true) | | |-- name: string (nullable = true) | | |-- created

我有一个结构,其模式(schema)如下:

root
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- product_color: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: integer (nullable = true)
 |    |    |    |    |-- color: string (nullable = true)
 |    |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |    |-- products_id: long (nullable = true)
 |    |    |-- orders_id: long (nullable = true)
现在,我想用 product_color 创建一个新列,因此在我的 DataFrame 中添加了一个新列,如下所示:

  df.withColumn("product_color", col(currentNode + "." + fieldName))
使用新列模式:

root
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- product_color: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: integer (nullable = true)
 |    |    |    |    |-- color: string (nullable = true)
 |    |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |    |-- products_id: long (nullable = true)
 |    |    |-- orders_id: long (nullable = true)
 |-- product_color: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: integer (nullable = true)
 |    |    |    |-- color: string (nullable = true)
 |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |-- products_id: long (nullable = true)
如果查看新列 product_color 的模式,会发现多了一层数组嵌套(多出了一个 array 类型的 element):

|-- element: array (containsNull = true)
我正在寻求帮助,以了解如何使用产品结构中的确切模式创建新列

预期架构:

root
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- product_color: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: integer (nullable = true)
 |    |    |    |    |-- color: string (nullable = true)
 |    |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |    |-- products_id: long (nullable = true)
 |    |    |-- orders_id: long (nullable = true)
 |-- product_color: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- color: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- products_id: long (nullable = true)
Spark:2.4.5
语言:Scala

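在添加为新列时,使用 explode 展开(分解)数组,即可获得所需的模式。
注意:explode 会为 products 数组中的每个元素各生成一行,因此行数可能发生变化;如果需要保持行数不变,可以改用 flatten(Spark 2.4 及以上版本提供)。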

示例:

//sample df schema

df.printSchema
//root
// |-- id: long (nullable = true)
// |-- products: array (nullable = true)
// |    |-- element: struct (containsNull = true)
// |    |    |-- id: long (nullable = true)
// |    |    |-- order_id: long (nullable = true)
// |    |    |-- product_color: array (nullable = true)
// |    |    |    |-- element: struct (containsNull = true)
// |    |    |    |    |-- color: string (nullable = true)
// |    |    |    |    |-- id: long (nullable = true)
// |    |    |    |    |-- products_id: long (nullable = true)

df.withColumn("product_color",explode(col("products.product_color"))).printSchema
//root
// |-- id: long (nullable = true)
// |-- products: array (nullable = true)
// |    |-- element: struct (containsNull = true)
// |    |    |-- id: long (nullable = true)
// |    |    |-- order_id: long (nullable = true)
// |    |    |-- product_color: array (nullable = true)
// |    |    |    |-- element: struct (containsNull = true)
// |    |    |    |    |-- color: string (nullable = true)
// |    |    |    |    |-- id: long (nullable = true)
// |    |    |    |    |-- products_id: long (nullable = true)
// |-- product_color: array (nullable = true)
// |    |-- element: struct (containsNull = true)
// |    |    |-- color: string (nullable = true)
// |    |    |-- id: long (nullable = true)
// |    |    |-- products_id: long (nullable = true)

您可以添加预期的输出模式吗?
已添加预期的模式,@Shu