
PySpark aggregation


I am trying to aggregate a PySpark DataFrame. A sample looks like this:

+---+-------------------+
| id|             struct|
+---+-------------------+
|id1|  [foo, true, true]|
|id1| [foo, true, false]|
|id1|[foo, false, false]|
|id1|  [bar, true, true]|
|id1| [bar, true, false]|
|id1|[bar, false, false]|
|id2|  [foo, true, true]|
|id2|[foo, false, false]|
|id2|  [bar, true, true]|
|id2|[bar, false, false]|
+---+-------------------+
The id column has at most 1500 unique IDs, and struct.name takes 5 unique values.

Here is my code, which computes what I want:

from pyspark.sql.types import *
from shared.spark import start_spark
import pyspark.sql.functions as F

spark = start_spark('App')

schema = StructType([StructField('id', StringType()),
                     StructField('struct', StructType(
                         [StructField('name', StringType()),
                          StructField('param1', BooleanType()),
                          StructField('param2', BooleanType()),
                          ]
                     ))])
data = [['id1', ['foo', True, True]],
        ['id1', ['foo', True, False]],
        ['id1', ['foo', False, False]],
        ['id1', ['bar', True, True]],
        ['id1', ['bar', True, False]],
        ['id1', ['bar', False, False]],
        ['id2', ['foo', True, True]],
        ['id2', ['foo', False, False]],
        ['id2', ['bar', True, True]],
        ['id2', ['bar', False, False]]
        ]
df = spark.createDataFrame(data, schema)

# one conditional count per (name, param1/param2) combination
df.groupby('id') \
    .agg(F.count(F.when((df['struct.name'] == 'foo') &
                        (df['struct.param1']) &
                        (df['struct.param2']), 1)).alias('foo_cond1'),
         F.count(F.when((df['struct.name'] == 'foo') &
                        (df['struct.param1']) &
                        (df['struct.param2'] == False), 1)).alias('foo_cond2'),
         F.count(F.when((df['struct.name'] == 'foo') &
                        (df['struct.param1'] == False) &
                        (df['struct.param2'] == False), 1)).alias('foo_cond3'),
         F.count(F.when((df['struct.name'] == 'bar') &
                        (df['struct.param1']) &
                        (df['struct.param2']), 1)).alias('bar_cond1'),
         F.count(F.when((df['struct.name'] == 'bar') &
                        (df['struct.param1']) &
                        (df['struct.param2'] == False), 1)).alias('bar_cond2'),
         F.count(F.when((df['struct.name'] == 'bar') &
                        (df['struct.param1'] == False) &
                        (df['struct.param2'] == False), 1)).alias('bar_cond3'),
         ) \
    .withColumn('foo', F.struct(F.col('foo_cond1').alias('cond1'),
                                F.col('foo_cond2').alias('cond2'),
                                F.col('foo_cond3').alias('cond3'))) \
    .withColumn('bar', F.struct(F.col('bar_cond1').alias('cond1'),
                                F.col('bar_cond2').alias('cond2'),
                                F.col('bar_cond3').alias('cond3'))) \
    .select('id', 'foo', 'bar') \
    .show()
The result looks like this:

+---+---------+---------+
| id|      foo|      bar|
+---+---------+---------+
|id1|[1, 1, 1]|[1, 1, 1]|
|id2|[1, 0, 1]|[1, 0, 1]|
+---+---------+---------+

Is there a better way to do this kind of aggregation, one that achieves better performance with less code? Maybe using a UDAF? I appreciate every comment. Thanks.

I was able to do this with PandasUDFType, but the runtime seems to increase by more than 30%. However, I only tested it with the sample data mentioned above.

from pyspark.sql.types import *
from shared.spark import start_spark
import pyspark.sql.functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = start_spark('App')

schema = StructType([StructField('id', StringType()),
                     StructField('struct', StructType(
                         [StructField('name', StringType()),
                          StructField('param1', BooleanType()),
                          StructField('param2', BooleanType()),
                          ]
                     ))])
schema_udf = StructType(
    [StructField('id', StringType()),
     StructField('foo1', DoubleType()),
     StructField('foo2', DoubleType()),
     StructField('foo3', DoubleType()),
     StructField('bar1', DoubleType()),
     StructField('bar2', DoubleType()),
     StructField('bar3', DoubleType()),
     ])
data = [['id1', ['foo', True, True]],
        ['id1', ['foo', True, False]],
        ['id1', ['foo', False, False]],
        ['id1', ['bar', True, True]],
        ['id1', ['bar', True, False]],
        ['id1', ['bar', False, False]],
        ['id2', ['foo', True, True]],
        ['id2', ['foo', False, False]],
        ['id2', ['bar', True, True]],
        ['id2', ['bar', False, False]]
        ]
df = spark.createDataFrame(data, schema)

@pandas_udf(schema_udf, PandasUDFType.GROUPED_MAP)
def myGroupby(df_group):
    def countComb(row):
        # classify one row's (param1, param2) combination
        def countCombinations(param1, param2):
            cond1, cond2, cond3 = 0, 0, 0
            if param1:
                if param2:
                    cond1 += 1
                else:
                    cond2 += 1
            else:
                cond3 += 1
            return cond1, cond2, cond3

        if row['name'] == 'foo':
            row['foo1'], row['foo2'], row['foo3'] = countCombinations(row.param1, row.param2)
        if row['name'] == 'bar':
            row['bar1'], row['bar2'], row['bar3'] = countCombinations(row.param1, row.param2)
        return row

    df_result = df_group.apply(countComb, axis=1)
    return df_result[['id', 'foo1', 'foo2', 'foo3', 'bar1', 'bar2', 'bar3']].groupby('id').sum().reset_index()

df \
    .select('id', 'struct.name', 'struct.param1', 'struct.param2') \
    .groupby('id') \
    .apply(myGroupby) \
    .withColumn('foo', F.struct(F.col('foo1').alias('cond1'),
                                F.col('foo2').alias('cond2'),
                                F.col('foo3').alias('cond3'))) \
    .withColumn('bar', F.struct(F.col('bar1').alias('cond1'),
                                F.col('bar2').alias('cond2'),
                                F.col('bar3').alias('cond3'))) \
    .select('id', 'foo', 'bar') \
    .show()
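
On Spark 3.x the same grouped-map idea can be written with applyInPandas, which replaces the now-deprecated PandasUDFType.GROUPED_MAP, and the counting can be vectorised instead of applied row by row. A minimal sketch, assuming the df and schema_udf defined above (count_combinations is a helper name chosen here, not from the original code):

import pandas as pd

def count_combinations(pdf: pd.DataFrame) -> pd.DataFrame:
    # one output row per id: six counts, one per (name, param1/param2)
    # combination, matching the column names in schema_udf
    out = {'id': [pdf['id'].iloc[0]]}
    for name in ('foo', 'bar'):
        grp = pdf[pdf['name'] == name]
        out[name + '1'] = [float((grp.param1 & grp.param2).sum())]
        out[name + '2'] = [float((grp.param1 & ~grp.param2).sum())]
        out[name + '3'] = [float((~grp.param1 & ~grp.param2).sum())]
    return pd.DataFrame(out)

df.select('id', 'struct.name', 'struct.param1', 'struct.param2') \
    .groupby('id') \
    .applyInPandas(count_combinations, schema=schema_udf) \
    .show()

Vectorising the counts removes the per-row apply, although the Arrow serialisation cost per group remains, so a plain DataFrame aggregation is still likely to be faster.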
Another option is to extract the struct fields into flat columns, count each combination per (id, name) with window functions, and then pivot on name:

from pyspark.sql.types import *
from pyspark.sql.window import Window
import pyspark.sql.functions as F
# assumes an active SparkSession is available as `spark`
schema = StructType([StructField('id', StringType()),
                     StructField('struct', StructType(
                         [StructField('name', StringType()),
                          StructField('param1', BooleanType()),
                          StructField('param2', BooleanType()),
                          ]
                     ))])
data = [['id1', ['foo', True, True]],
        ['id1', ['foo', True, False]],
        ['id1', ['foo', False, False]],
        ['id1', ['bar', True, True]],
        ['id1', ['bar', True, False]],
        ['id1', ['bar', False, False]],
        ['id2', ['foo', True, True]],
        ['id2', ['foo', False, False]],
        ['id2', ['bar', True, True]],
        ['id2', ['bar', False, False]]
        ]
df = spark.createDataFrame(data, schema)
df = df.withColumn('name', F.col('struct').getField('name'))
df = df.withColumn('param1', F.col('struct').getField('param1'))
df = df.withColumn('param2', F.col('struct').getField('param2'))
w = Window.partitionBy(['id', 'name'])
df = df.withColumn('c1', F.count(F.when((df['param1']==True)&(df['param2']==True), 1)).over(w))
df = df.withColumn('c2', F.count(F.when((df['param1']==True)&(df['param2']==False), 1)).over(w))
df = df.withColumn('c3', F.count(F.when((df['param1']==False)&(df['param2']==False), 1)).over(w))
df = df.withColumn('result', F.array(['c1', 'c2', 'c3']))
df.show()

+---+-------------------+----+------+------+---+---+---+---------+
| id|             struct|name|param1|param2| c1| c2| c3|   result|
+---+-------------------+----+------+------+---+---+---+---------+
|id2|  [bar, true, true]| bar|  true|  true|  1|  0|  1|[1, 0, 1]|
|id2|[bar, false, false]| bar| false| false|  1|  0|  1|[1, 0, 1]|
|id1|  [foo, true, true]| foo|  true|  true|  1|  1|  1|[1, 1, 1]|
|id1|[foo, false, false]| foo| false| false|  1|  1|  1|[1, 1, 1]|
|id1| [foo, true, false]| foo|  true| false|  1|  1|  1|[1, 1, 1]|
|id1|  [bar, true, true]| bar|  true|  true|  1|  1|  1|[1, 1, 1]|
|id1| [bar, true, false]| bar|  true| false|  1|  1|  1|[1, 1, 1]|
|id1|[bar, false, false]| bar| false| false|  1|  1|  1|[1, 1, 1]|
|id2|[foo, false, false]| foo| false| false|  1|  0|  1|[1, 0, 1]|
|id2|  [foo, true, true]| foo|  true|  true|  1|  0|  1|[1, 0, 1]|
+---+-------------------+----+------+------+---+---+---+---------+

df = df.groupby('id').pivot('name').agg(F.first('result'))
df.show()

+---+---------+---------+
| id|      bar|      foo|
+---+---------+---------+
|id1|[1, 1, 1]|[1, 1, 1]|
|id2|[1, 0, 1]|[1, 0, 1]|
+---+---------+---------+
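
For comparison, the window step can be dropped entirely: count the three combinations once per (id, name) group, then pivot on name. A minimal sketch, assuming the df defined above; listing the pivot values explicitly is optional, but with only 5 known names it saves Spark a pass to discover them:

import pyspark.sql.functions as F

# one row per (id, name) with the three combination counts
counts = df.groupby('id', F.col('struct.name').alias('name')) \
    .agg(F.count(F.when(F.col('struct.param1') & F.col('struct.param2'), 1)).alias('cond1'),
         F.count(F.when(F.col('struct.param1') & ~F.col('struct.param2'), 1)).alias('cond2'),
         F.count(F.when(~F.col('struct.param1') & ~F.col('struct.param2'), 1)).alias('cond3')) \
    .withColumn('result', F.struct('cond1', 'cond2', 'cond3'))

# pivot the name values into struct-valued columns, one row per id
result = counts.groupby('id') \
    .pivot('name', ['foo', 'bar']) \
    .agg(F.first('result'))
result.show()

New name values then only add pivot columns instead of more hand-written conditions, and everything runs as native Spark SQL expressions with no Python UDF involved.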