Convert a string-type column to struct and unpack the column using PySpark

Input DF:

+------+-----------------------------------------------------+
|rowNum|infoCol                                              |
+------+-----------------------------------------------------+
|100   |[('john', 'customer'), ('abc, mno, xyz', 'purchase')]|
|200   |[('doe', 'customer')]                                |
+------+-----------------------------------------------------+

Expected output DF:

+------+--------+-----------------+
|rowNum|customer|         purchase|
+------+--------+-----------------+
|   100|['john']|['abc, mno, xyz']|
|   200| ['doe']|             null|
+------+--------+-----------------+

I tried using the split function, but it does not do what I need:

inputdf = spark.createDataFrame(
    [
        ("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase')]"),
        ("200", "[('doe', 'customer')]"),
    ],
    ['rowNum', 'infoCol'] 
)

from pyspark.sql.functions import col, regexp_replace, split
outputdf = inputdf.withColumn("newcol", split(col("infoCol"), ","))
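
One way to get an actual struct column out of the string (as the title asks) is to parse each value with a Python UDF; this is only a minimal sketch, assuming every value is a well-formed list of Python tuples, and parse_pairs, value and kind are illustrative names:

import ast

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType, StructField, StructType

# Desired shape: an array of (value, kind) structs parsed out of the string.
pair_schema = ArrayType(StructType([
    StructField("value", StringType()),
    StructField("kind", StringType()),
]))

@F.udf(returnType=pair_schema)
def parse_pairs(s):
    # ast.literal_eval safely evaluates text like "[('john', 'customer'), ...]"
    return [(v, k) for v, k in ast.literal_eval(s)] if s else None

structdf = inputdf.withColumn("info", parse_pairs("infoCol"))
structdf.printSchema()

From there, explode plus groupBy/pivot (as in the answers below) gives the customer and purchase columns.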

Below is my attempt with Spark built-in functions.

The idea is to first create two columns, one holding the keys (customer, purchase) and the other holding the corresponding values; to get these columns I used split and then explode. Once we have the customer and purchase keys, groupBy + pivot pivots the data, and finally the columns are split to get arrays.

Example:

inputdf = spark.createDataFrame(
    [
        ("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase')]"),
        ("200", "[('doe', 'customer')]"),
    ],
    ['rowNum', 'infoCol'] 
)

from pyspark.sql.functions import *

inputdf.withColumn("newcol", split(col("infoCol"), "\),")).\
selectExpr("explode(newcol)","rowNum").\
withColumn("newCol1",split(regexp_replace(col("col"),"[\[|\]|\(|\)]",""),"',")).\
withColumn("new1",regexp_replace(trim(element_at(col("newCol1"),1)),"[']","")).\
withColumn("new2",regexp_replace(trim(element_at(col("newCol1"),2)),"[']","")).\
groupby("rowNum").\
pivot("new2").\
agg(first(col("new1"))).\
withColumn("customer",split(col("customer"),",")).\
withColumn("purchase",split(col("purchase"),",")).\
show()

#+------+--------+-----------------+
#|rowNum|customer|         purchase|
#+------+--------+-----------------+
#|   200|   [doe]|             null|
#|   100|  [john]|[abc,  mno,  xyz]|
#+------+--------+-----------------+

Update:

inputdf = spark.createDataFrame(
    [
        ("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase')]"),
        ("200", "[('doe', 'customer')]"),
    ],
    ['rowNum', 'infoCol'] 
)

from pyspark.sql.functions import *

inputdf.withColumn("newcol", split(col("infoCol"), "\),")).\
selectExpr("explode(newcol)","rowNum").\
withColumn("newCol1",split(regexp_replace(col("col"),"[\[|\]|\(|\)]",""),"',")).\
withColumn("new1",regexp_replace(trim(element_at(col("newCol1"),1)),"[']","")).\
withColumn("new2",regexp_replace(trim(element_at(col("newCol1"),2)),"[']","")).\
groupby("rowNum").\
pivot("new2").\
agg(first(col("new1"))).\
withColumn("customer",col("customer")).\
withColumn("purchase",col("purchase")).\
show()

#+------+--------+-------------+
#|rowNum|customer|     purchase|
#+------+--------+-------------+
#|   200|     doe|         null|
#|   100|    john|abc, mno, xyz|
#+------+--------+-------------+
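
If single-element arrays such as ['abc, mno, xyz'] are still wanted on top of this string output, one option is to wrap the pivoted columns with array() instead of re-splitting them on commas. A small sketch, assuming the pivoted result above is kept in a variable (pivoted is an illustrative name):

from pyspark.sql.functions import array, col, when

# Wrap each non-null string into a one-element array and keep null otherwise,
# so "abc, mno, xyz" stays intact instead of being split on its commas.
pivoted \
    .withColumn("customer", when(col("customer").isNotNull(), array("customer"))) \
    .withColumn("purchase", when(col("purchase").isNotNull(), array("purchase"))) \
    .show(truncate=False)

That keeps 'abc, mno, xyz' as a single array element, which is what the expected output in the question shows.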

Update 2:

inputdf = spark.createDataFrame(
    [
        ("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase'), ('abc123', 'purchase')]"),
        ("200", "[('doe', 'customer')]"),
    ],
    ['rowNum', 'infoCol'] 
)

from pyspark.sql.functions import *


inputdf.withColumn("newcol", split(col("infoCol"), "\),")).\
selectExpr("explode(newcol)","rowNum").\
withColumn("newCol1",expr("""transform(split(regexp_replace(col,"[\[|\]|\(|\)]",""),"',"),x -> regexp_replace(trim(x),"[']",""))""")).\
withColumn("new1",regexp_replace(element_at(col("newCol1"),-1),"[\]]","")).\
withColumn("new2",array_except(col("newCol1"),array(lit('purchase'),lit('customer'),lit('purchase]'),lit('customer]')))).\
withColumn("new2",expr("""transform(new2,x -> concat("'",regexp_replace(x,"[\\\\[]",""),"'"))""")).\
drop(*['col','newCol1']).\
groupby("new1","rowNum").agg(flatten(collect_list(col("new2"))).alias("new2")).\
groupby("rowNum").pivot("new1").agg(first(col("new2"))).\
show(10,False)

#+------+--------+---------------------------+
#|rowNum|customer|purchase                   |
#+------+--------+---------------------------+
#|200   |['doe'] |null                       |
#|100   |['john']|['abc, mno, xyz', 'abc123']|
#+------+--------+---------------------------+
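
For comparison, a sketch that avoids most of the regex handling by rewriting the text into JSON and parsing it with from_json (Spark 2.4+). This assumes the values never contain quotes, parentheses or brackets of their own, and missing keys come back as empty arrays rather than null:

from pyspark.sql import functions as F

# Turn the Python-literal text into JSON: ( ) -> [ ] and ' -> ", then parse.
pairs = (
    inputdf
    .withColumn("json", F.regexp_replace("infoCol", r"\(", "["))
    .withColumn("json", F.regexp_replace("json", r"\)", "]"))
    .withColumn("json", F.regexp_replace("json", "'", '"'))
    .withColumn("pair", F.explode(F.from_json("json", "array<array<string>>")))
    .select("rowNum",
            F.col("pair")[1].alias("key"),     # 'customer' / 'purchase'
            F.col("pair")[0].alias("value"))   # the value itself
)

pairs.groupBy("rowNum") \
    .pivot("key", ["customer", "purchase"]) \
    .agg(F.collect_list("value")) \
    .show(truncate=False)

The pivot values are listed explicitly here; dropping the list makes Spark compute them with an extra job.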

Here is my attempt. It works not only for customer and purchase but for any number of columns, as long as the column name comes last in each tuple.

import pyspark.sql.functions as f

df = inputdf \
  .withColumn('infoCol', f.regexp_replace('infoCol', '[\[\]]', '')) \
  .withColumn('infoCol', f.regexp_replace('infoCol', '(\),)', ') ,')) \
  .withColumn('infoCol', f.explode(f.split('infoCol', ' , '))) \
  .withColumn('infoCol', f.regexp_replace('infoCol', '[\(\)]', '')) \
  .withColumn('infoCol', f.regexp_replace('infoCol', '(\',)', '\' ,')) \
  .withColumn('cols', f.split('infoCol', ' , ')[1]) \
  .withColumn('cols', f.regexp_replace('cols', '\'', '')) \
  .withColumn('infoCol', f.split('infoCol', ' , ')[0]) \
  .withColumn('infoCol', f.concat(f.lit('['), f.col('infoCol'), f.lit(']')))

values = df.select('cols').distinct().rdd.map(lambda x: x.cols).collect()

df.groupBy('rowNum') \
  .pivot('cols', values) \
  .agg(f.first('infoCol')) \
  .show(10, False)

+------+--------+-----------------+
|rowNum|customer|purchase         |
+------+--------+-----------------+
|200   |['doe'] |null             |
|100   |['john']|['abc, mno, xyz']|
+------+--------+-----------------+

Thanks @484! The only problem is that 'abc, mno, xyz' gets split into 3 items. Is there anything we can do to handle this?

@thecoder, do you need them as a single item, i.e. a string type?

@thecoder, check the update section of the edited answer.

I want the final output columns to be customer and purchase. So for example [('john', 'customer'), ('abc, mno, xyz', 'purchase'), ('abc123', 'purchase')], the purchase column should contain ['abc, mno, xyz', 'abc123'] - that is why arrays would be nice.

@coder, check the update2 section of the edited answer.

Thanks @lamanus! The output is not as expected, because 'abc, mno, xyz' is split into 3 elements instead of staying a single value. Also, changing ("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase')]") -> ("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase'), ('abccc', 'purchase')]") seems to break things as well.

More straightforward, but too specific.