Python: Rename nested struct columns in a Spark DataFrame to all lowercase using PySpark

A similar solution already exists in Scala, but I need one in PySpark. I'm new to Python, so I need your help here as well.

Below is the link to the Scala solution, for a better understanding of the requirement.

I am trying to change the names of DataFrame columns in Python. I can easily rename the direct (top-level) fields, but I'm struggling with the nested struct columns.

Below is my DataFrame schema:

 |-- VkjLmnVop: string (nullable = true)
 |-- KaTasLop: string (nullable = true)
 |-- AbcDef: struct (nullable = true)
 |    |-- UvwXyz: struct (nullable = true)
 |    |    |-- MnoPqrstUv: string (nullable = true)
 |    |    |-- ManDevyIxyz: string (nullable = true)
But I need a schema like the one below:

 |-- vkjlmnvop: string (nullable = true)
 |-- kataslop: string (nullable = true)
 |-- abcdef: struct (nullable = true)
 |    |-- uvwxyz: struct (nullable = true)
 |    |    |-- mnopqrstuv: string (nullable = true)
 |    |    |-- mandevyixyz: string (nullable = true)

How do I change the struct column names dynamically?
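
For context, renaming just the top-level columns is straightforward (a minimal illustration, assuming df holds the DataFrame above); the hard part is that this does not touch the fields nested inside the structs:

for c in df.columns:
    # withColumnRenamed only affects top-level column names
    df = df.withColumnRenamed(c, c.lower())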

I guess this is what you want. Hope it helps.


import pyspark.sql.functions as spf


def get_column_wise_schema(df_string_schema, df_columns):
    # Returns a dictionary mapping each column name to its type as a string, parsed out of
    # the DataFrame's string form, e.g. "DataFrame[a: string, b: struct<c:string>]".
    column_schema_dict = {}
    i = 0
    while i < len(df_columns):
        current_col = df_columns[i]
        next_col = df_columns[i + 1] if i < len(df_columns) - 1 else None
        current_col_split_key = '[' + current_col + ': ' if i == 0 else ' ' + current_col + ': '
        next_col_split_key = ']' if i == len(df_columns) - 1 else ', ' + next_col + ': '
        column_schema_dict[current_col] = df_string_schema.split(current_col_split_key)[1].\
            split(next_col_split_key)[0]
        i += 1
    return column_schema_dict


def convert_colnames_to_lower(spark_df):
    columns = spark_df.columns
    column_wise_schema_dict = get_column_wise_schema(spark_df.__str__(), columns)
    col_exprs = []
    for column_name in columns:
        # Casting to the lowercased type string renames the nested struct fields;
        # the alias lowercases the top-level column name itself.
        column_schema_lowercase = column_wise_schema_dict[column_name].lower()
        col_exprs.append(spf.col(column_name).cast(column_schema_lowercase).
                         alias(column_name.lower()))
    return spark_df.select(*col_exprs)

# spark (SparkSession) and sc (SparkContext) are assumed to come from an
# active PySpark session, e.g. the pyspark shell.
ds = {'AbcDef': {'UvwXyz': {'VkjLmnVop': 'abcd'}}, 'HijKS': 'fgds'}
df = spark.read.json(sc.parallelize([ds]))
df.printSchema()
"""
root
 |-- AbcDef: struct (nullable = true)
 |    |-- UvwXyz: struct (nullable = true)
 |    |    |-- VkjLmnVop: string (nullable = true)
 |-- HijKS: string (nullable = true)
 """
converted_df = convert_colnames_to_lower(df)
converted_df.printSchema()
"""
root
 |-- abcdef: struct (nullable = true)
 |    |-- uvwxyz: struct (nullable = true)
 |    |    |-- vkjlmnvop: string (nullable = true)
 |-- hijks: string (nullable = true)
 """

I also found a different solution with similar logic, in fewer lines.

import pyspark.sql.functions as spf
ds = {'AbcDef': {'UvwXyz': {'VkjLmnVop': 'abcd'}}, 'HijKS': 'fgds'}
df = spark.read.json(sc.parallelize([ds]))
df.printSchema()
"""
root
 |-- AbcDef: struct (nullable = true)
 |    |-- UvwXyz: struct (nullable = true)
 |    |    |-- VkjLmnVop: string (nullable = true)
 |-- HijKS: string (nullable = true)
"""
# First lowercase the top-level column names.
for i in df.columns:
    df = df.withColumnRenamed(i, i.lower())

# df.__str__() looks like "DataFrame[abcdef: struct<...>, hijks: string]";
# strip the wrapper and split it into "name: type" pairs.
schemaDef = [y.replace("]", "") for y in [x.replace("DataFrame[", "") for x in df.__str__().split(", ")]]

# Re-cast each column to its lowercased type string, which renames the nested struct fields.
for j in schemaDef:
    columnName = j.split(": ")[0]
    dataType = j.split(": ")[1]
    df = df.withColumn(columnName, spf.col(columnName).cast(dataType.lower()))

df.printSchema()

"""
root
 |-- abcdef: struct (nullable = true)
 |    |-- uvwxyz: struct (nullable = true)
 |    |    |-- vkjlmnvop: string (nullable = true)
 |-- hijks: string (nullable = true)
"""

Maybe this can help?
Thanks bro, much appreciated. Manish's code below is what actually worked for me, but you get the green tick since the idea came from your code. Thanks again!
Great! Thanks @Manish for the solution.