
Python: How can I do session filtering in PySpark?

Tags: python, apache-spark, pyspark, apache-spark-sql

I am working with time-series data in Spark and want to reduce the number of timesteps by applying a session filter that groups timesteps lying close together and keeps only the last timestep of each group. I also need to make sure that no timestep is delayed by more than a given maximum.

How can I do this elegantly in PySpark?

For example:

Maximum delay of 59 minutes.

For the input timesteps: 12:00, 12:01, 12:03, 13:00, 13:15, 13:45, 13:58, 14:15, 14:30, 14:45, 15:00, 15:20, 15:30; 19:00

I would like to get: 12:03, 13:58, 15:00, 15:30, 19:00
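For reference, the example above can be set up as a small test DataFrame roughly like this (the single partition key "id" and the column name "timestamp" are assumptions used only for illustration):

# A minimal sketch of the example data, assuming one partition key "id"
# and a timestamp column named "timestamp".
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

times = ["12:00", "12:01", "12:03", "13:00", "13:15", "13:45", "13:58",
         "14:15", "14:30", "14:45", "15:00", "15:20", "15:30", "19:00"]
df = (
    spark.createDataFrame([("a", f"2022-01-01 {t}:00") for t in times], ["id", "timestamp"])
    .withColumn("timestamp", F.col("timestamp").cast("timestamp"))
)
# With a maximum delay of 59 minutes, the desired result keeps only
# 12:03, 13:58, 15:00, 15:30 and 19:00 for this key.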

So far, I have only found a (slow and verbose) iterative solution:

"""
all_canges : pyspark.sql.DataFrame
        Input dataframe containing only partition columns and timestamp column
maxSessionDuration : int
        Maximum duration of a session in seconds
key : list
        List of partition keys
order_column : string
        Name of the timestamp column
max_iterations: int
        Maximum number of iterations to resolve a series of changes longer than the session duration. 
"""       

time_window = Window.partitionBy(key).orderBy("timestamp_seconds")

# Column names

timestep_seconds_col = "timestamp_seconds"
largest_preceding_col = "largest_preceding"
session_timestamp_col = "session_timestamp"
preserve_timestamp_col = "preserve_timestamp"

# Timestamp in seconds
all_changes = all_changes.withColumn(timestep_seconds_col, F.col(order_column).cast('timestamp').cast('long'))
# Split the changes into session timesteps and non-session timesteps; on the first iteration, all timestamps are non-session timesteps
session_timesteps = all_changes.filter(F.lit(False))
non_session_timesteps = all_changes.filter(F.lit(True))

# Logic for keeping records in case of a long series of changes that spans more than one session duration.
cond_preceding = (F.col(largest_preceding_col).isNull() |
                  (F.col(largest_preceding_col) < F.col(timestep_seconds_col)))

# Initialize
iterations = 0
converged = False

while iterations < max_iterations and not converged:
    iterations += 1

    new_timesteps = (
        non_session_timesteps
        # Step 1:
        # Max timestamp (value of order column) within max session duration
        .withColumn(session_timestamp_col,
                    F.max(F.col(timestep_seconds_col)).over(time_window.rangeBetween(0, maxSessionDuration)))
        # Step 2:
        # Only keep the session_timestamp if the current timestamp is greater than the previous row's session_timestamp.
        .withColumn(session_timestamp_col,
                    F.when(F.col(timestep_seconds_col) > F.lag(session_timestamp_col, 1, 0).over(time_window),
                           F.col(session_timestamp_col))
                    .otherwise(F.lit(None))
                    )
        # Step 3:
        # Account for a long series of changes that can span more than one session duration.
        .withColumn(largest_preceding_col,
                    F.last(F.col(session_timestamp_col),
                           ignorenulls=True)
                    .over(time_window.rowsBetween(Window.unboundedPreceding, Window.currentRow)))
        .withColumn(preserve_timestamp_col, cond_preceding)
    )
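    # At this point every row in new_timesteps carries:
    #  - session_timestamp: non-null only for rows whose timestamp lies beyond the
    #    previous row's reachable window; its value is the last timestep reachable
    #    within maxSessionDuration from that row (the timestep that will be kept).
    #  - preserve_timestamp: True for rows not yet covered by any kept session
    #    timestamp; these are carried over into the next iteration.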
    # Session timestamps
    new_session_timestamps = (
        new_timesteps
        .filter(F.col(session_timestamp_col).isNotNull())
        .withColumn(timestep_seconds_col, F.col(session_timestamp_col))
        .drop(session_timestamp_col, largest_preceding_col, preserve_timestamp_col)
    )
    session_timesteps = session_timesteps.unionByName(new_session_timestamps)

    # Non-session timesteps to be treated in next iteration
    non_session_timesteps = (
        new_timesteps
        .filter(F.col(preserve_timestamp_col))
        .drop(session_timestamp_col, largest_preceding_col, preserve_timestamp_col)
    )
    # Convergence means that no timesteps had to be carried over to the next iteration because of long consecutive series of changes
    converged = non_session_timesteps.count() == 0

# Use the remaining session_timestamps
all_changes_reduced = (
    session_timesteps
    .unionByName(non_session_timesteps)  # in case max iterations is reached
    .select(*key, F.col(timestep_seconds_col).cast(T.TimestampType()).alias(order_column))
)
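
For context, the snippet above expects the free variables all_changes, maxSessionDuration, key, order_column and max_iterations to be defined before it runs. A minimal sketch of plausible values matching the example (the column names "id" and "timestamp" are assumptions, not part of the original post):

# Hypothetical parameter values for the snippet above; "id" and "timestamp"
# are assumed column names used only for illustration.
key = ["id"]                   # partition key columns
order_column = "timestamp"     # name of the timestamp column in all_changes
maxSessionDuration = 59 * 60   # 59-minute maximum delay, in seconds
max_iterations = 10            # safety limit for the while loop
all_changes = df               # e.g. the example DataFrame built earlier

# After the snippet runs, all_changes_reduced holds one row per session:
# the last timestep of each group, per partition key.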