PySpark dataframe: get the last datetime of each day
I have a dataframe containing the update statistics of all users, and I want to use PySpark to create a new dataframe with the same information, but keeping only each day's last updated statistics. Input example:
date                          id  stat1   stat2  stat3
2020-01-13T22:22:10.000+0000  1   173736  3043   2996
2020-01-13T22:43:19.000+0000  1   173775  3042   2996
2020-01-14T22:43:19.000+0000  1   173775  3042   2996
2020-01-15T22:43:19.000+0000  1   173775  3042   2996
2020-01-13T22:22:10.000+0000  2   257624  1500   53
2020-01-13T22:43:19.000+0000  2   257625  1500   65
Expected output:
date                          id  stat1   stat2  stat3
2020-01-13T22:43:19.000+0000  1   173775  3042   2996
2020-01-14T22:43:19.000+0000  1   173775  3042   2996
2020-01-15T22:43:19.000+0000  1   173775  3042   2996
2020-01-13T22:43:19.000+0000  2   257625  1500   65
I would suggest using window functions. First convert the date column with to_timestamp and derive a day column with to_date (used to find the max time within each day), then use max over row_number(), or last over an unbounded window. Then filter to get the desired rows. Try both methods and see which one works best for your case.
Dataframe:
df.show()
#+----------------------------+---+------+-----+-----+
#|date |id |stat1 |stat2|stat3|
#+----------------------------+---+------+-----+-----+
#|2020-01-13T22:22:10.000+0000|1 |173736|3043 |2996 |
#|2020-01-13T22:43:19.000+0000|1 |173775|3042 |2996 |
#|2020-01-14T22:43:19.000+0000|1 |173775|3042 |2996 |
#|2020-01-15T22:43:19.000+0000|1 |173775|3042 |2996 |
#|2020-01-13T22:22:10.000+0000|2 |257624|1500 |53 |
#|2020-01-13T22:43:19.000+0000|2 |257625|1500 |65 |
#+----------------------------+---+------+-----+-----+
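If you want to reproduce this locally, here is a minimal sketch that builds the sample dataframe; the column names and types are read off the df.show() output above, and the date column starts out as a plain string:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Sample data as shown above; "date" is a string until to_timestamp is applied.
df = spark.createDataFrame(
    [
        ("2020-01-13T22:22:10.000+0000", 1, 173736, 3043, 2996),
        ("2020-01-13T22:43:19.000+0000", 1, 173775, 3042, 2996),
        ("2020-01-14T22:43:19.000+0000", 1, 173775, 3042, 2996),
        ("2020-01-15T22:43:19.000+0000", 1, 173775, 3042, 2996),
        ("2020-01-13T22:22:10.000+0000", 2, 257624, 1500, 53),
        ("2020-01-13T22:43:19.000+0000", 2, 257625, 1500, 65),
    ],
    ["date", "id", "stat1", "stat2", "stat3"],
)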
Using max:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Number each (id, day) partition in timestamp order, then keep the
# row whose number equals the partition maximum, i.e. the last update.
w = Window.partitionBy("id", "date1").orderBy("date")
w2 = Window.partitionBy("id", "date1")

df.withColumn("date", F.to_timestamp("date", "yyyy-MM-dd'T'HH:mm:ss"))\
  .withColumn("date1", F.to_date("date"))\
  .withColumn("rownum", F.row_number().over(w))\
  .withColumn("max", F.max("rownum").over(w2))\
  .filter('rownum = max').drop("date1", "rownum", "max")\
  .orderBy("id", "date").show(truncate=False)
#+-------------------+---+------+-----+-----+
#|date |id |stat1 |stat2|stat3|
#+-------------------+---+------+-----+-----+
#|2020-01-13 22:43:19|1 |173775|3042 |2996 |
#|2020-01-14 22:43:19|1 |173775|3042 |2996 |
#|2020-01-15 22:43:19|1 |173775|3042 |2996 |
#|2020-01-13 22:43:19|2 |257625|1500 |65 |
#+-------------------+---+------+-----+-----+
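As a side note (not part of the original answer), the same result can be had with a single window by ordering each (id, day) partition by the timestamp descending and keeping only the first row, which avoids the second window and the extra max column:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Single-window variant: row 1 of a descending sort is the day's latest update.
w_desc = Window.partitionBy("id", "date1").orderBy(F.col("date").desc())

df.withColumn("date", F.to_timestamp("date", "yyyy-MM-dd'T'HH:mm:ss"))\
  .withColumn("date1", F.to_date("date"))\
  .withColumn("rownum", F.row_number().over(w_desc))\
  .filter("rownum = 1").drop("date1", "rownum")\
  .orderBy("id", "date").show(truncate=False)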
Using the last function over an unbounded window:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Over an unbounded frame, last("date") gives every row the latest
# timestamp of its (id, day) partition; keep the rows that match it.
w = Window.partitionBy("id", "date1").orderBy("date")\
          .rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)

df.withColumn("date", F.to_timestamp("date", "yyyy-MM-dd'T'HH:mm:ss"))\
  .withColumn("date1", F.to_date("date"))\
  .withColumn("last_date", F.last("date").over(w))\
  .filter('last_date = date').drop("date1", "last_date")\
  .orderBy("id", "date").show(truncate=False)
#+-------------------+---+------+-----+-----+
#|date |id |stat1 |stat2|stat3|
#+-------------------+---+------+-----+-----+
#|2020-01-13 22:43:19|1 |173775|3042 |2996 |
#|2020-01-14 22:43:19|1 |173775|3042 |2996 |
#|2020-01-15 22:43:19|1 |173775|3042 |2996 |
#|2020-01-13 22:43:19|2 |257625|1500 |65 |
#+-------------------+---+------+-----+-----+
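One caveat worth adding: Spark 3.x uses a stricter datetime parser than Spark 2.x, so to_timestamp with a pattern that does not consume the fractional seconds and zone offset of a value like 2020-01-13T22:22:10.000+0000 may fail or return null, depending on spark.sql.legacy.timeParserPolicy. A sketch of two possible workarounds (verify the pattern letters against your Spark version):

from pyspark.sql import functions as F

# Option 1: spell out the full pattern, including milliseconds and offset.
df = df.withColumn("date", F.to_timestamp("date", "yyyy-MM-dd'T'HH:mm:ss.SSSZ"))

# Option 2: fall back to the Spark 2.x parsing behaviour session-wide.
# spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")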