Apache Spark / PySpark: read nested columns from a CSV file and assign a schema to the DataFrame

I am trying to read a CSV file that contains a nested column.

For example:

name,age,addresses_values
person_1,30,["France","street name",75000]
When reading it, I try to assign a schema like this:

from pyspark.sql.types import StructType, StructField, StringType, LongType

csv_schema = StructType([
    StructField('name', StringType(), True),
    StructField('age', LongType(), True),
    StructField('addresses_values', StructType([
        StructField('country', StringType(), True),
        StructField('street', StringType(), True),
        StructField('ZipCode', StringType(), True),
    ]), True),
])

path = "file:///path_to_my_file"

dataset_df = spark.read.csv(path=path, header=True, schema=csv_schema)
This raises the following exception:

pyspark.sql.utils.AnalysisException: CSV data source does not support struct<country:string,street:string,ZipCode:string> data type.
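
The CSV data source can only materialize flat, atomic column types, so the StructType field in the read schema is rejected. As a bridge to the answer below, here is a minimal, untested sketch (reusing the column names from the question): declare addresses_values as a plain StringType so the read succeeds, and handle the embedded commas with the quote="[" trick explained in the answer, before parsing the string in a second step.

from pyspark.sql.types import StructType, StructField, StringType, LongType

# Sketch: read the nested column as a raw string first; quote="[" keeps the
# bracketed, comma-containing value in one column so it can be parsed afterwards.
flat_schema = StructType([
    StructField('name', StringType(), True),
    StructField('age', LongType(), True),
    StructField('addresses_values', StringType(), True),
])

dataset_df = spark.read.csv(path=path, header=True, schema=flat_schema, quote="[")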


Here is a hacky way of parsing it. Inspired by and


It's a bit tricky, but here's one way to parse those CSV values. You need to specify quote="[" so that the addresses_values column, which contains commas, is read as a single field. Then use from_json to parse it into an array of strings, and finally build the desired struct from the array elements:

from pyspark.sql import functions as F

df = spark.read.csv(input_path, header=True, quote="[")

df1 = df.withColumn(
    "addresses_values",
    F.from_json(
        F.concat(F.lit("["), F.col("addresses_values")),
        "array<string>"
    )
).withColumn(
    "addresses_values",
    F.struct(
        F.col("addresses_values")[0].alias("country"),
        F.col("addresses_values")[1].alias("street"),
        F.col("addresses_values")[2].alias("ZipCode"),
    )
)

df1.show(truncate=False)

#+--------+---+----------------------------+
#|name    |age|addresses_values            |
#+--------+---+----------------------------+
#|person_1|30 |[France, street name, 75000]|
#+--------+---+----------------------------+

df1.printSchema()

#root
# |-- name: string (nullable = true)
# |-- age: string (nullable = true)
# |-- addresses_values: struct (nullable = false)
# |    |-- country: string (nullable = true)
# |    |-- street: string (nullable = true)
# |    |-- ZipCode: string (nullable = true)
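
As an optional follow-up (not part of the answer above): since everything comes back as strings, you can cast age to the numeric type from the question's schema and query the struct fields directly.

# Optional: cast age back to long to match the question's LongType
df2 = df1.withColumn("age", F.col("age").cast("long"))

# Struct fields can then be selected with dot notation
df2.select("name", "age", "addresses_values.country", "addresses_values.ZipCode").show()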