
PySpark aggregation


I am trying to aggregate a PySpark DataFrame. A sample looks like this:

+---+-------------------+
| id|             struct|
+---+-------------------+
|id1|  [foo, true, true]|
|id1| [foo, true, false]|
|id1|[foo, false, false]|
|id1|  [bar, true, true]|
|id1| [bar, true, false]|
|id1|[bar, false, false]|
|id2|  [foo, true, true]|
|id2|[foo, false, false]|
|id2|  [bar, true, true]|
|id2|[bar, false, false]|
+---+-------------------+
The id column has at most 1500 unique IDs, and struct.name takes 5 unique values.

Here is my code, which computes what I want:

from pyspark.sql.types import *
from shared.spark import start_spark
import pyspark.sql.functions as F

spark = start_spark('App')

schema = StructType([StructField('id', StringType()),
                     StructField('struct', StructType(
                         [StructField('name', StringType()),
                          StructField('param1', BooleanType()),
                          StructField('param2', BooleanType()),
                          ]
                     ))])
data = [['id1', ['foo', True, True]],
        ['id1', ['foo', True, False]],
        ['id1', ['foo', False, False]],
        ['id1', ['bar', True, True]],
        ['id1', ['bar', True, False]],
        ['id1', ['bar', False, False]],
        ['id2', ['foo', True, True]],
        ['id2', ['foo', False, False]],
        ['id2', ['bar', True, True]],
        ['id2', ['bar', False, False]]
        ]
df = spark.createDataFrame(data, schema)

# one conditional count per (name, param1/param2) combination
df.groupby('id') \
    .agg(F.count(F.when((df['struct.name'] == 'foo') &
                        (df['struct.param1']) &
                        (df['struct.param2']), 1)).alias('foo_cond1'),
         F.count(F.when((df['struct.name'] == 'foo') &
                        (df['struct.param1']) &
                        (df['struct.param2'] == False), 1)).alias('foo_cond2'),
         F.count(F.when((df['struct.name'] == 'foo') &
                        (df['struct.param1'] == False) &
                        (df['struct.param2'] == False), 1)).alias('foo_cond3'),
         F.count(F.when((df['struct.name'] == 'bar') &
                        (df['struct.param1']) &
                        (df['struct.param2']), 1)).alias('bar_cond1'),
         F.count(F.when((df['struct.name'] == 'bar') &
                        (df['struct.param1']) &
                        (df['struct.param2'] == False), 1)).alias('bar_cond2'),
         F.count(F.when((df['struct.name'] == 'bar') &
                        (df['struct.param1'] == False) &
                        (df['struct.param2'] == False), 1)).alias('bar_cond3'),
         ) \
    .withColumn('foo', F.struct(F.col('foo_cond1').alias('cond1'),
                                F.col('foo_cond2').alias('cond2'),
                                F.col('foo_cond3').alias('cond3'))) \
    .withColumn('bar', F.struct(F.col('bar_cond1').alias('cond1'),
                                F.col('bar_cond2').alias('cond2'),
                                F.col('bar_cond3').alias('cond3'))) \
    .select('id', 'foo', 'bar') \
    .show()
The result looks like this:

+---+---------+---------+
| id|      foo|      bar|
+---+---------+---------+
|id1|[1, 1, 1]|[1, 1, 1]|
|id2|[1, 0, 1]|[1, 0, 1]|
+---+---------+---------+

Is there a better way to do this kind of aggregation, one that achieves better performance with less code? Maybe using a UDAF? I appreciate every comment. Thanks.

I was able to do this with PandasUDFType, but the runtime seems to increase by more than 30%. However, I only tested it with the sample data mentioned above.

from pyspark.sql.types import *
from shared.spark import start_spark
import pyspark.sql.functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = start_spark('App')

schema = StructType([StructField('id', StringType()),
                     StructField('struct', StructType(
                         [StructField('name', StringType()),
                          StructField('param1', BooleanType()),
                          StructField('param2', BooleanType()),
                          ]
                     ))])
schema_udf = StructType(
    [StructField('id', StringType()),
     StructField('foo1', DoubleType()),
     StructField('foo2', DoubleType()),
     StructField('foo3', DoubleType()),
     StructField('bar1', DoubleType()),
     StructField('bar2', DoubleType()),
     StructField('bar3', DoubleType()),
     ])
data = [['id1', ['foo', True, True]],
        ['id1', ['foo', True, False]],
        ['id1', ['foo', False, False]],
        ['id1', ['bar', True, True]],
        ['id1', ['bar', True, False]],
        ['id1', ['bar', False, False]],
        ['id2', ['foo', True, True]],
        ['id2', ['foo', False, False]],
        ['id2', ['bar', True, True]],
        ['id2', ['bar', False, False]]
        ]
df = spark.createDataFrame(data, schema)

@pandas_udf(schema_udf, PandasUDFType.GROUPED_MAP)
def myGroupby(df_group):
    def countComb(row):
        # classify one row's (param1, param2) combination
        def countCombinations(param1, param2):
            cond1, cond2, cond3 = 0, 0, 0
            if param1:
                if param2:
                    cond1 += 1
                else:
                    cond2 += 1
            else:
                cond3 += 1
            return cond1, cond2, cond3

        if row['name'] == 'foo':
            row['foo1'], row['foo2'], row['foo3'] = countCombinations(row.param1, row.param2)
        if row['name'] == 'bar':
            row['bar1'], row['bar2'], row['bar3'] = countCombinations(row.param1, row.param2)
        return row

    df_result = df_group.apply(countComb, axis=1)
    return df_result[['id', 'foo1', 'foo2', 'foo3', 'bar1', 'bar2', 'bar3']].groupby('id').sum().reset_index()

df \
    .select('id', 'struct.name', 'struct.param1', 'struct.param2') \
    .groupby('id') \
    .apply(myGroupby) \
    .withColumn('foo', F.struct(F.col('foo1').alias('cond1'),
                                F.col('foo2').alias('cond2'),
                                F.col('foo3').alias('cond3'))) \
    .withColumn('bar', F.struct(F.col('bar1').alias('cond1'),
                                F.col('bar2').alias('cond2'),
                                F.col('bar3').alias('cond3'))) \
    .select('id', 'foo', 'bar') \
    .show()
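
On Spark 3.x the same grouped-map idea can be written with applyInPandas, which replaces the now-deprecated PandasUDFType.GROUPED_MAP, and the counting can be vectorised instead of applied row by row. A minimal sketch, assuming the df and schema_udf defined above (count_combinations is a helper name chosen here, not from the original code):

import pandas as pd

def count_combinations(pdf: pd.DataFrame) -> pd.DataFrame:
    # one output row per id: six counts, one per (name, param1/param2)
    # combination, matching the column names in schema_udf
    out = {'id': [pdf['id'].iloc[0]]}
    for name in ('foo', 'bar'):
        grp = pdf[pdf['name'] == name]
        out[name + '1'] = [float((grp.param1 & grp.param2).sum())]
        out[name + '2'] = [float((grp.param1 & ~grp.param2).sum())]
        out[name + '3'] = [float((~grp.param1 & ~grp.param2).sum())]
    return pd.DataFrame(out)

df.select('id', 'struct.name', 'struct.param1', 'struct.param2') \
    .groupby('id') \
    .applyInPandas(count_combinations, schema=schema_udf) \
    .show()

Vectorising the counts removes the per-row apply, although the Arrow serialisation cost per group remains, so a plain DataFrame aggregation is still likely to be faster.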
Another option is to extract the struct fields into flat columns, count each combination per (id, name) with window functions, and then pivot on name:

from pyspark.sql.types import *
from pyspark.sql.window import Window
import pyspark.sql.functions as F
# assumes an active SparkSession is available as `spark`
schema = StructType([StructField('id', StringType()),
                     StructField('struct', StructType(
                         [StructField('name', StringType()),
                          StructField('param1', BooleanType()),
                          StructField('param2', BooleanType()),
                          ]
                     ))])
data = [['id1', ['foo', True, True]],
        ['id1', ['foo', True, False]],
        ['id1', ['foo', False, False]],
        ['id1', ['bar', True, True]],
        ['id1', ['bar', True, False]],
        ['id1', ['bar', False, False]],
        ['id2', ['foo', True, True]],
        ['id2', ['foo', False, False]],
        ['id2', ['bar', True, True]],
        ['id2', ['bar', False, False]]
        ]
df = spark.createDataFrame(data, schema)
df = df.withColumn('name', F.col('struct').getField('name'))
df = df.withColumn('param1', F.col('struct').getField('param1'))
df = df.withColumn('param2', F.col('struct').getField('param2'))
w = Window.partitionBy(['id', 'name'])
df = df.withColumn('c1', F.count(F.when((df['param1']==True)&(df['param2']==True), 1)).over(w))
df = df.withColumn('c2', F.count(F.when((df['param1']==True)&(df['param2']==False), 1)).over(w))
df = df.withColumn('c3', F.count(F.when((df['param1']==False)&(df['param2']==False), 1)).over(w))
df = df.withColumn('result', F.array(['c1', 'c2', 'c3']))
df.show()

+---+-------------------+----+------+------+---+---+---+---------+
| id|             struct|name|param1|param2| c1| c2| c3|   result|
+---+-------------------+----+------+------+---+---+---+---------+
|id2|  [bar, true, true]| bar|  true|  true|  1|  0|  1|[1, 0, 1]|
|id2|[bar, false, false]| bar| false| false|  1|  0|  1|[1, 0, 1]|
|id1|  [foo, true, true]| foo|  true|  true|  1|  1|  1|[1, 1, 1]|
|id1|[foo, false, false]| foo| false| false|  1|  1|  1|[1, 1, 1]|
|id1| [foo, true, false]| foo|  true| false|  1|  1|  1|[1, 1, 1]|
|id1|  [bar, true, true]| bar|  true|  true|  1|  1|  1|[1, 1, 1]|
|id1| [bar, true, false]| bar|  true| false|  1|  1|  1|[1, 1, 1]|
|id1|[bar, false, false]| bar| false| false|  1|  1|  1|[1, 1, 1]|
|id2|[foo, false, false]| foo| false| false|  1|  0|  1|[1, 0, 1]|
|id2|  [foo, true, true]| foo|  true|  true|  1|  0|  1|[1, 0, 1]|
+---+-------------------+----+------+------+---+---+---+---------+

df = df.groupby('id').pivot('name').agg(F.first('result'))
df.show()

+---+---------+---------+
| id|      bar|      foo|
+---+---------+---------+
|id1|[1, 1, 1]|[1, 1, 1]|
|id2|[1, 0, 1]|[1, 0, 1]|
+---+---------+---------+
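
For comparison, the window step can be dropped entirely: count the three combinations once per (id, name) group, then pivot on name. A minimal sketch, assuming the df defined above; listing the pivot values explicitly is optional, but with only 5 known names it saves Spark a pass to discover them:

import pyspark.sql.functions as F

# one row per (id, name) with the three combination counts
counts = df.groupby('id', F.col('struct.name').alias('name')) \
    .agg(F.count(F.when(F.col('struct.param1') & F.col('struct.param2'), 1)).alias('cond1'),
         F.count(F.when(F.col('struct.param1') & ~F.col('struct.param2'), 1)).alias('cond2'),
         F.count(F.when(~F.col('struct.param1') & ~F.col('struct.param2'), 1)).alias('cond3')) \
    .withColumn('result', F.struct('cond1', 'cond2', 'cond3'))

# pivot the name values into struct-valued columns, one row per id
result = counts.groupby('id') \
    .pivot('name', ['foo', 'bar']) \
    .agg(F.first('result'))
result.show()

New name values then only add pivot columns instead of more hand-written conditions, and everything runs as native Spark SQL expressions with no Python UDF involved.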