Pandas 如何在dask环境中使用pd.groupby.transform改进函数

Pandas 如何在dask环境中使用pd.groupby.transform改进函数,pandas,dask,Pandas,Dask,我们需要根据时间序列创建分组。我们正在使用 dask,但对于此功能,我们需要回到 pandas,因为它尚未在 dask 中实现。虽然该函数可以正常工作,但是否还有提升性能的方法?(我们在本地客户端上运行代码,有时也在 YARN 客户端上运行。)以下是我们的函数以及一个最小、完整且可验证的示例: import pandas as pd import numpy as np import random import dask import dask.dataframe as dd from datetime import timedelta

我们需要根据时间序列创建分组。我们正在使用 dask,但对于此功能,我们需要回到 pandas,因为它尚未在 dask 中实现。

import pandas as pd
import numpy as np
import random
import dask
import dask.dataframe as dd
from datetime import timedelta

def create_groups_from_time_sequence(df, col_id: str=None, col_time: np.datetime64=None, time_threshold: str='120s',
                                     min_weight: int=2) -> pd.DataFrame:
    """
    Create groups of rows per id from gaps in a time sequence.

    Rows are sorted by ``col_id`` and ``col_time``. Within each id, a new
    group (``EdgeID``) starts whenever the gap to the previous row is larger
    than ``time_threshold``. Only groups with strictly more than
    ``min_weight`` rows are kept.

    :param df: dataframe pandas or dask (a dask frame is computed to pandas first)
    :param col_id: name of the column containing the id
    :param col_time: name of the column containing the datetime of the query;
        non-datetime values are converted with ``pd.to_datetime``
    :param time_threshold: maximum gap between consecutive queries of one
        group; anything accepted by ``pd.Timedelta`` (e.g. ``'120s'``)
    :param min_weight: groups must contain more than this many rows to be kept
    :return: pandas dataframe with the added columns ``EdgeID`` and
        ``cluster_weight``; for dask input, a dask dataframe indexed by
        ``EdgeID`` with the original number of partitions
    """
    partitions = None
    # A pandas frame is never a dask frame, so the dask check is only
    # evaluated for non-pandas input.
    if not isinstance(df, pd.DataFrame) and isinstance(df, dd.DataFrame):
        partitions = df.npartitions
        df = df.compute()

    # Convert only when the column is NOT already datetime, and do it on a
    # copy so the caller's frame is left untouched.
    if not pd.api.types.is_datetime64_any_dtype(df[col_time]):
        df = df.copy()
        df[col_time] = pd.to_datetime(df[col_time])

    # Parse the threshold once instead of relying on implicit string coercion.
    threshold = pd.Timedelta(time_threshold)

    # sort_values returns a new frame, so the column assignments below do not
    # leak into the input.
    df = df.sort_values([col_id, col_time])

    # Built-in groupby diff/cumsum avoid calling a Python lambda per group.
    gap_exceeded = df.groupby(col_id)[col_time].diff() > threshold
    df['EdgeID'] = gap_exceeded.astype(int).groupby(df[col_id]).cumsum()
    df['cluster_weight'] = df.groupby([col_id, 'EdgeID'])['EdgeID'].transform('count')

    df = df[df['cluster_weight'] > min_weight].reset_index(drop=True)

    if partitions:
        df = dd.from_pandas(df, npartitions=partitions)
        df = df.set_index('EdgeID')

    return df
 # Example input: the sample dataset from dask.datasets.timeseries(),
 # reduced to the 'id' and 'name' columns.
 df_raw = dask.datasets.timeseries()
 df = df_raw[['id', 'name']]
 # Copy the index into a regular 'timegroup' column.
 df = df.assign(timegroup=df.index)
 # Shift each 'timegroup' value by a random whole number of seconds in [0, 60].
 df.timegroup = df.timegroup.apply(lambda s: s + timedelta(seconds=random.randint(0,60)) ) 

| timestamp           | id   | name   | timegroup           |  
| 2000-01-01 00:00:00 | 968  | Alice  | 2000-01-01 00:00:46 |  
| 2000-01-01 00:00:01 | 1030 | Xavier | 2000-01-01 00:00:22 |  
| 2000-01-01 00:00:02 | 991  | George | 2000-01-01 00:00:59 |  
| 2000-01-01 00:00:03 | 975  | Zelda  | 2000-01-01 00:00:26 |  
| 2000-01-01 00:00:04 | 1028 | Zelda  | 2000-01-01 00:00:18 |  

 # Group the rows of each 'id' by gaps in 'timegroup' (120 s threshold, min_weight=2).
 dfg = create_groups_from_time_sequence(df, col_id='id', col_time='timegroup', time_threshold='120s',min_weight=2)

| EdgeID    | id    | name      | timegroup             | cluster_weight    |  
|--------   |------ |---------  |---------------------  |----------------   |  
| 0         | 960   | Norbert   | 2000-01-01 00:01:10   | 3                 |  
| 0         | 969   | Sarah     | 2000-01-01 00:03:32   | 7                 |  
| 0         | 1013  | Michael   | 2000-01-01 00:02:58   | 8                 |  
| 0         | 963   | Ray       | 2000-01-01 00:05:58   | 5                 |  
| 0         | 996   | Ray       | 2000-01-01 00:03:41   | 6                 |  

import pandas as pd
import numpy as np
import random
import dask
import dask.dataframe as dd
from datetime import timedelta

def create_groups_from_time_sequence(df, col_id: str=None, col_time: np.datetime64=None, time_threshold: str='120s',
                                     min_weight: int=2) -> pd.DataFrame:
    """
    Create groups of rows per id from gaps in a time sequence.

    Rows are sorted by ``col_id`` and ``col_time``. Within each id, a new
    group (``EdgeID``) starts whenever the gap to the previous row is larger
    than ``time_threshold``. Only groups with strictly more than
    ``min_weight`` rows are kept.

    :param df: dataframe pandas or dask (a dask frame is computed to pandas first)
    :param col_id: name of the column containing the id
    :param col_time: name of the column containing the datetime of the query;
        non-datetime values are converted with ``pd.to_datetime``
    :param time_threshold: maximum gap between consecutive queries of one
        group; anything accepted by ``pd.Timedelta`` (e.g. ``'120s'``)
    :param min_weight: groups must contain more than this many rows to be kept
    :return: pandas dataframe with the added columns ``EdgeID`` and
        ``cluster_weight``; for dask input, a dask dataframe indexed by
        ``EdgeID`` with the original number of partitions
    """
    partitions = None
    # A pandas frame is never a dask frame, so the dask check is only
    # evaluated for non-pandas input.
    if not isinstance(df, pd.DataFrame) and isinstance(df, dd.DataFrame):
        partitions = df.npartitions
        df = df.compute()

    # Convert only when the column is NOT already datetime, and do it on a
    # copy so the caller's frame is left untouched.
    if not pd.api.types.is_datetime64_any_dtype(df[col_time]):
        df = df.copy()
        df[col_time] = pd.to_datetime(df[col_time])

    # Parse the threshold once instead of relying on implicit string coercion.
    threshold = pd.Timedelta(time_threshold)

    # sort_values returns a new frame, so the column assignments below do not
    # leak into the input.
    df = df.sort_values([col_id, col_time])

    # Built-in groupby diff/cumsum avoid calling a Python lambda per group.
    gap_exceeded = df.groupby(col_id)[col_time].diff() > threshold
    df['EdgeID'] = gap_exceeded.astype(int).groupby(df[col_id]).cumsum()
    df['cluster_weight'] = df.groupby([col_id, 'EdgeID'])['EdgeID'].transform('count')

    df = df[df['cluster_weight'] > min_weight].reset_index(drop=True)

    if partitions:
        df = dd.from_pandas(df, npartitions=partitions)
        df = df.set_index('EdgeID')

    return df
 # Example input: the sample dataset from dask.datasets.timeseries(),
 # reduced to the 'id' and 'name' columns.
 df_raw = dask.datasets.timeseries()
 df = df_raw[['id', 'name']]
 # Copy the index into a regular 'timegroup' column.
 df = df.assign(timegroup=df.index)
 # Shift each 'timegroup' value by a random whole number of seconds in [0, 60].
 df.timegroup = df.timegroup.apply(lambda s: s + timedelta(seconds=random.randint(0,60)) ) 

| timestamp           | id   | name   | timegroup           |  
| 2000-01-01 00:00:00 | 968  | Alice  | 2000-01-01 00:00:46 |  
| 2000-01-01 00:00:01 | 1030 | Xavier | 2000-01-01 00:00:22 |  
| 2000-01-01 00:00:02 | 991  | George | 2000-01-01 00:00:59 |  
| 2000-01-01 00:00:03 | 975  | Zelda  | 2000-01-01 00:00:26 |  
| 2000-01-01 00:00:04 | 1028 | Zelda  | 2000-01-01 00:00:18 |  

 # Group the rows of each 'id' by gaps in 'timegroup' (120 s threshold, min_weight=2).
 dfg = create_groups_from_time_sequence(df, col_id='id', col_time='timegroup', time_threshold='120s',min_weight=2)

| EdgeID    | id    | name      | timegroup             | cluster_weight    |  
|--------   |------ |---------  |---------------------  |----------------   |  
| 0         | 960   | Norbert   | 2000-01-01 00:01:10   | 3                 |  
| 0         | 969   | Sarah     | 2000-01-01 00:03:32   | 7                 |  
| 0         | 1013  | Michael   | 2000-01-01 00:02:58   | 8                 |  
| 0         | 963   | Ray       | 2000-01-01 00:05:58   | 5                 |  
| 0         | 996   | Ray       | 2000-01-01 00:03:41   | 6                 |  