
Duplicating rows in a PySpark dataframe


I have the following dataframe:

id, test, date
1, A, 01/20/2020
1, B, 01/25/2020
2, A, 02/20/2020
2, B, 02/25/2020
2, C, 02/25/2020
Since the number of distinct tests is 3 (A, B, C), I would like to insert a row for id 1 with test C and date "NA".

The resulting dataframe should be:

id, test, date
1, A, 01/20/2020
1, B, 01/25/2020
1, C, NA
2, A, 02/20/2020
2, B, 02/25/2020
2, C, 02/25/2020
Sample data

data = [
    ('1', 'A', '01/20/2020'),
    ('1', 'B', '01/25/2020'),
    ('2', 'A', '02/20/2020'),
    ('2', 'B', '02/25/2020'),
    ('2', 'C', '02/25/2020'),
]
df = spark.createDataFrame(data, ['id', 'test', 'date'])
First generate a cross join of the distinct ids and tests to build a skeleton of every id × test combination

# Solution
uniq_ids = df.select('id').distinct().coalesce(1)
uniq_tests = df.select('test').distinct().coalesce(1)
skeleton = (
    uniq_ids.
        crossJoin(
            uniq_tests
        )
)
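Displaying the skeleton gives every id × test combination (a minimal call, using the skeleton variable built above):

skeleton.show()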
+---+----+
| id|test|
+---+----+
|  1|   B|
|  2|   B|
|  1|   C|
|  2|   C|
|  1|   A|
|  2|   A|
+---+----+
Then left join the original dataframe onto it

(
    skeleton.
        join(
            df,
            ['id', 'test'],
            'left'
        ).
        orderBy('id', 'test', 'date').
        show(truncate=False)
)
+---+----+----------+
|id |test|date      |
+---+----+----------+
|1  |A   |01/20/2020|
|1  |B   |01/25/2020|
|1  |C   |null      |
|2  |A   |02/20/2020|
|2  |B   |02/25/2020|
|2  |C   |02/25/2020|
+---+----+----------+
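The question asks for the literal string "NA" rather than a null date. If that matters, a minimal sketch (assuming date stays a string column) fills the nulls left by the join:

(
    skeleton.
        join(
            df,
            ['id', 'test'],
            'left'
        ).
        fillna('NA', subset=['date']).
        orderBy('id', 'test', 'date').
        show(truncate=False)
)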
Suggestions for real data

  • If you have many unique ids and many unique tests, you may need to change the number of partitions in coalesce(N) (see the sketch below).
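A sketch of the same skeleton with a tunable partition count; the value of N below is purely a placeholder and should be chosen based on your data volume and cluster:

# N is a hypothetical partition count - tune it for your data and cluster
N = 200
uniq_ids = df.select('id').distinct().coalesce(N)
uniq_tests = df.select('test').distinct().coalesce(N)
skeleton = uniq_ids.crossJoin(uniq_tests)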
For Spark 2.4+, a highly scalable way (without any join) is to use groupBy with collect_list plus array functions / higher-order functions to identify the missing entries, add them, and then explode. It will work regardless of which of A, B, or C is missing.

# sample dataframe
# df.show()
#+---+----+----------+
#| id|test|      date|
#+---+----+----------+
#|  1|   A|01/20/2020|
#|  2|   A|02/20/2020|
#|  2|   B|02/25/2020|
#|  2|   C|02/25/2020|
#+---+----+----------+


from pyspark.sql import functions as F

(
    df.groupBy("id")
      # collect each id's tests and dates into parallel arrays
      .agg(F.collect_list("test").alias("x"), F.collect_list("date").alias("col2"))
      # pair each test with its date, and find which of A, B, C are missing for each id
      .withColumn("zip", F.arrays_zip(F.col("x"), F.col("col2")))
      .withColumn("except", F.array_except(F.array(*(F.lit(x) for x in ['A', 'B', 'C'])), "x"))
      .drop("x", "col2")
      # give each missing test the date 'NA', merge with the existing pairs, and explode back to rows
      .withColumn("except", F.expr("transform(except, x -> struct(x, 'NA'))"))
      .withColumn("zipped", F.explode(F.array_union("zip", "except")))
      .select("id", F.col("zipped.x").alias("test"), F.col("zipped.col2").alias("date"))
      .show(truncate=False)
)

#+---+----+----------+
#|id |test|date      |
#+---+----+----------+
#|1  |A   |01/20/2020|
#|1  |B   |01/25/2020|
#|1  |C   |NA        |
#|2  |A   |02/20/2020|
#|2  |B   |02/25/2020|
#|2  |C   |02/25/2020|
#+---+----+----------+
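The list ['A', 'B', 'C'] is hard-coded above. As a sketch (not part of the original answer), the full set of tests can also be derived from the data itself; collecting the distinct values to the driver assumes the number of distinct tests is small:

# Hypothetical generalization: build the array of all test values dynamically
all_tests = [row['test'] for row in df.select('test').distinct().collect()]
tests_array = F.array(*(F.lit(t) for t in all_tests))   # drop-in replacement for the hard-coded array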
Physical plan for the non-join solution (using array functions + explode):

== Physical Plan ==
*(2) Project [id#1206L, zipped#1400.x AS test#1405, zipped#1400.col2 AS date#1406]
+- *(2) Generate explode(array_union(zip#1380, except#1394)), [id#1206L], false, [zipped#1400]
   +- *(2) Project [id#1206L, arrays_zip(x#1374, col2#1376) AS zip#1380, transform(array_except([A,B,C], x#1374), lambdafunction(named_struct(x, lambda x#1395, col2, NA), lambda x#1395, false)) AS except#1394]
      +- ObjectHashAggregate(keys=[id#1206L], functions=[collect_list(test#1207, 0, 0), collect_list(date#1208, 0, 0)])
         +- Exchange hashpartitioning(id#1206L, 200), [id=#1192]
            +- *(1) Project [id#1206L, date#1208, test#1207]
               +- *(1) Scan ExistingRDD[id#1206L,test#1207,date#1208]

As we can see, the join solution (shown by @cPak) produces a lot of data shuffling and uses a Cartesian product, which is very inefficient for big data; on any cluster the data movement would be very high and very heavy. The array function/explode solution moves far less data and runs much faster.
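For reference, the physical plan of either approach can be printed directly with explain(); result below is a placeholder for whichever DataFrame you built above (before calling show()):

# result is a placeholder for the DataFrame produced by either solution
result.explain()        # print the physical plan
result.explain(True)    # also print the parsed, analyzed and optimized logical plans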
