
PySpark数据帧操作效率,pyspark,spark-dataframe,rdd,Pyspark,Spark Dataframe,Rdd,假设我有以下数据框: +----------+-----+----+-------+ |display_id|ad_id|prob|clicked| +----------+-----+----+-------+ | 123| 989| 0.9| 0| | 123| 990| 0.8| 1| | 123| 999| 0.7| 0| | 234| 789| 0.9| 0| | 234| 7


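+----------+-----+----+-------+
|display_id|ad_id|prob|clicked|
+----------+-----+----+-------+
|       123|  989| 0.9|      0|
|       123|  990| 0.8|      1|
|       123|  999| 0.7|      0|
|       234|  789| 0.9|      0|
|       234|  777| 0.7|      0|
|       234|  769| 0.6|      1|
|       234|  798| 0.5|      0|
+----------+-----+----+-------+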



#scala:  val dfad = sc.parallelize(Seq((123,989,0.9,0),(123,990,0.8,1),(123,999,0.7,0),(234,789,0.9,0),(234,777,0.7,0),(234,769,0.6,1),(234,798,0.5,0))).toDF("display_id","ad_id","prob","clicked")
#^^^that's^^^ the only difference (besides putting val in front of variables) between this python response and a Scala one

# Build the sample DataFrame: one row per (display_id, ad_id) pair together
# with its prob value and clicked flag.
dfad = sc.parallelize((
    (123, 989, 0.9, 0),
    (123, 990, 0.8, 1),
    (123, 999, 0.7, 0),
    (234, 789, 0.9, 0),
    (234, 777, 0.7, 0),
    (234, 769, 0.6, 1),
    (234, 798, 0.5, 0),
)).toDF(["display_id", "ad_id", "prob", "clicked"])

# The SQL queries that follow read from a table called `df_ad`; a Python
# variable name is not visible to Spark SQL, so register the DataFrame under
# that table name. (On Spark 2.0+ the non-deprecated equivalent is
# dfad.createOrReplaceTempView("df_ad").)
dfad.registerTempTable("df_ad")

# Collect, per display_id, the list of its ad_ids; the inner query sorts the
# rows by display_id and descending prob before they are grouped.
# NOTE(review): Spark documents collect_list as non-deterministic after a
# shuffle, so the list order may not always follow prob -- confirm before
# relying on it.
ranked_ads_sql = "SELECT display_id,collect_list(ad_id) ad_id_sorted FROM (SELECT * FROM df_ad SORT BY display_id,prob DESC) x GROUP BY display_id"
df1 = sqlContext.sql(ranked_ads_sql)
|display_id|        ad_id_sorted|
|       234|[789, 777, 769, 798]|
|       123|     [989, 990, 999]|

# Per display_id, keep the largest ad_id among the rows with clicked=1 and
# expose it as the column ad_id_set.
clicked_ads_sql = "SELECT display_id, max(ad_id) as ad_id_set from df_ad where clicked=1 group by display_id"
df2 = sqlContext.sql(clicked_ads_sql)
|display_id|ad_id_set|
|       234|      769|
|       123|      990|

# Combine the per-display list (df1) with the per-display clicked ad (df2)
# by joining on their shared display_id column.
final_df = df1.join(df2, on="display_id")
|display_id|        ad_id_sorted|ad_id_set|
|       234|[789, 777, 769, 798]|      769|
|       123|     [989, 990, 999]|      990|


#scala:  val dfad = sc.parallelize(Seq((123,989,0.9,0),(123,990,0.8,1),(123,999,0.7,0),(234,789,0.9,0),(234,777,0.7,0),(234,769,0.6,1),(234,798,0.5,0))).toDF("display_id","ad_id","prob","clicked")
#^^^that's^^^ the only difference (besides putting val in front of variables) between this python response and a Scala one

# Build the sample DataFrame: one row per (display_id, ad_id) pair together
# with its prob value and clicked flag.
dfad = sc.parallelize((
    (123, 989, 0.9, 0),
    (123, 990, 0.8, 1),
    (123, 999, 0.7, 0),
    (234, 789, 0.9, 0),
    (234, 777, 0.7, 0),
    (234, 769, 0.6, 1),
    (234, 798, 0.5, 0),
)).toDF(["display_id", "ad_id", "prob", "clicked"])

# The SQL queries that follow read from a table called `df_ad`; a Python
# variable name is not visible to Spark SQL, so register the DataFrame under
# that table name. (On Spark 2.0+ the non-deprecated equivalent is
# dfad.createOrReplaceTempView("df_ad").)
dfad.registerTempTable("df_ad")

# Collect, per display_id, the list of its ad_ids; the inner query sorts the
# rows by display_id and descending prob before they are grouped.
# NOTE(review): Spark documents collect_list as non-deterministic after a
# shuffle, so the list order may not always follow prob -- confirm before
# relying on it.
ranked_ads_sql = "SELECT display_id,collect_list(ad_id) ad_id_sorted FROM (SELECT * FROM df_ad SORT BY display_id,prob DESC) x GROUP BY display_id"
df1 = sqlContext.sql(ranked_ads_sql)
|display_id|        ad_id_sorted|
|       234|[789, 777, 769, 798]|
|       123|     [989, 990, 999]|

# Per display_id, keep the largest ad_id among the rows with clicked=1 and
# expose it as the column ad_id_set.
clicked_ads_sql = "SELECT display_id, max(ad_id) as ad_id_set from df_ad where clicked=1 group by display_id"
df2 = sqlContext.sql(clicked_ads_sql)
|display_id|ad_id_set|
|       234|      769|
|       123|      990|

# Combine the per-display list (df1) with the per-display clicked ad (df2)
# by joining on their shared display_id column.
final_df = df1.join(df2, on="display_id")
|display_id|        ad_id_sorted|ad_id_set|
|       234|[789, 777, 769, 798]|      769|
|       123|     [989, 990, 999]|      990|