Pandas 如何在dask环境中使用pd.groupby.transform改进函数

Pandas：如何在 dask 环境中改进使用 pd.groupby.transform 的函数（标签：pandas、dask）。我们需要根据时间顺序创建组。我们正在使用 dask，但对于此功能需要回到 pandas，因为它尚未在 dask 中实现。虽然该功能可以工作，但是否仍有改进性能的方法？（我们有时在本地客户端、有时在 YARN 客户端上运行代码）以下是我们的功能和一个最小、完整、可验证的示例。

我们需要根据时间顺序创建组。 我们正在使用
dask
,但对于此功能,我们需要回到pandas,因为它尚未在
dask
中实现。虽然该功能可以工作,但是否仍有改进性能的方法?(我们在
本地客户端
上运行代码,有时在
YARN 客户端
上运行代码)
以下是我们的功能和一个最小、完整和可验证的示例:

import pandas as pd
import numpy as np
import random
import dask
import dask.dataframe as dd
from datetime import timedelta

def create_groups_from_time_sequence(df, col_id: str=None, col_time: np.datetime64=None, time_threshold: str='120s',
                                     min_weight: int=2) -> pd.DataFrame:
    """
    Function creates group of units for  relationships
    :param df: dataframe pandas or dask
    :param col_id: column containing the index
    :param col_time: column containing datetime of query
    :param time_threshold: maximum threshold between queries to create
    :param min_weight: The threshold to filter the minimum relationship between 2 ids
    :return: pandas dataframe
    """
    partitions = None
    if isinstance(df, dd.DataFrame):
        partitions = df.npartitions
        df = df.compute()

    if np.issubdtype(df[col_time].dtype, np.datetime64):
        df[col_time] = pd.to_datetime(df[col_time])

    df = df.sort_values([col_id, col_time])
    df['cluster_bool'] = df.groupby(col_id)[col_time].transform(lambda x: x.diff() > time_threshold)
    df['EdgeID'] = df.groupby(col_id)['cluster_bool'].transform(lambda x: x.astype(int).cumsum())
    df['cluster_weight'] = df.groupby([col_id, 'EdgeID'])['EdgeID'].transform('count')
    mask_weight = df['cluster_weight'] > min_weight
    df = df[mask_weight]
    df = df.drop(['cluster_bool'], axis=1).reset_index(drop=True)

    if partitions:
        df = dd.from_pandas(df, npartitions=partitions)
        df = df.set_index('EdgeID')

    return df
 # Build a synthetic dask timeseries dataset for the demo (one row per second).
 df_raw = dask.datasets.timeseries()
 df = df_raw[['id', 'name']]
 # Copy the datetime index into a regular column so it can be used as a value.
 df = df.assign(timegroup=df.index)
 # Jitter each timestamp by a random 0-60 s offset so the time groups are non-trivial.
 df.timegroup = df.timegroup.apply(lambda s: s + timedelta(seconds=random.randint(0,60)) ) 
 df.head()


| timestamp           | id   | name   | timegroup           |  
| 2000-01-01 00:00:00 | 968  | Alice  | 2000-01-01 00:00:46 |  
| 2000-01-01 00:00:01 | 1030 | Xavier | 2000-01-01 00:00:22 |  
| 2000-01-01 00:00:02 | 991  | George | 2000-01-01 00:00:59 |  
| 2000-01-01 00:00:03 | 975  | Zelda  | 2000-01-01 00:00:26 |  
| 2000-01-01 00:00:04 | 1028 | Zelda  | 2000-01-01 00:00:18 |  


 # Run the grouping on the dask frame; keeps clusters with more than 2 members.
 dfg = create_groups_from_time_sequence(df, col_id='id', col_time='timegroup', time_threshold='120s',min_weight=2)
 dfg.head() 

| EdgeID    | id    | name      | timegroup             | cluster_weight    |  
|--------   |------ |---------  |---------------------  |----------------   |  
| 0         | 960   | Norbert   | 2000-01-01 00:01:10   | 3                 |  
| 0         | 969   | Sarah     | 2000-01-01 00:03:32   | 7                 |  
| 0         | 1013  | Michael   | 2000-01-01 00:02:58   | 8                 |  
| 0         | 963   | Ray       | 2000-01-01 00:05:58   | 5                 |  
| 0         | 996   | Ray       | 2000-01-01 00:03:41   | 6                 |  
将上述函数用于
dask
数据集示例:

import pandas as pd
import numpy as np
import random
import dask
import dask.dataframe as dd
from datetime import timedelta

def create_groups_from_time_sequence(df, col_id: str=None, col_time: np.datetime64=None, time_threshold: str='120s',
                                     min_weight: int=2) -> pd.DataFrame:
    """
    Function creates group of units for  relationships
    :param df: dataframe pandas or dask
    :param col_id: column containing the index
    :param col_time: column containing datetime of query
    :param time_threshold: maximum threshold between queries to create
    :param min_weight: The threshold to filter the minimum relationship between 2 ids
    :return: pandas dataframe
    """
    partitions = None
    if isinstance(df, dd.DataFrame):
        partitions = df.npartitions
        df = df.compute()

    if np.issubdtype(df[col_time].dtype, np.datetime64):
        df[col_time] = pd.to_datetime(df[col_time])

    df = df.sort_values([col_id, col_time])
    df['cluster_bool'] = df.groupby(col_id)[col_time].transform(lambda x: x.diff() > time_threshold)
    df['EdgeID'] = df.groupby(col_id)['cluster_bool'].transform(lambda x: x.astype(int).cumsum())
    df['cluster_weight'] = df.groupby([col_id, 'EdgeID'])['EdgeID'].transform('count')
    mask_weight = df['cluster_weight'] > min_weight
    df = df[mask_weight]
    df = df.drop(['cluster_bool'], axis=1).reset_index(drop=True)

    if partitions:
        df = dd.from_pandas(df, npartitions=partitions)
        df = df.set_index('EdgeID')

    return df
 # Build a synthetic dask timeseries dataset for the demo (one row per second).
 df_raw = dask.datasets.timeseries()
 df = df_raw[['id', 'name']]
 # Copy the datetime index into a regular column so it can be used as a value.
 df = df.assign(timegroup=df.index)
 # Jitter each timestamp by a random 0-60 s offset so the time groups are non-trivial.
 df.timegroup = df.timegroup.apply(lambda s: s + timedelta(seconds=random.randint(0,60)) ) 
 df.head()


| timestamp           | id   | name   | timegroup           |  
| 2000-01-01 00:00:00 | 968  | Alice  | 2000-01-01 00:00:46 |  
| 2000-01-01 00:00:01 | 1030 | Xavier | 2000-01-01 00:00:22 |  
| 2000-01-01 00:00:02 | 991  | George | 2000-01-01 00:00:59 |  
| 2000-01-01 00:00:03 | 975  | Zelda  | 2000-01-01 00:00:26 |  
| 2000-01-01 00:00:04 | 1028 | Zelda  | 2000-01-01 00:00:18 |  


 # Run the grouping on the dask frame; keeps clusters with more than 2 members.
 dfg = create_groups_from_time_sequence(df, col_id='id', col_time='timegroup', time_threshold='120s',min_weight=2)
 dfg.head() 

| EdgeID    | id    | name      | timegroup             | cluster_weight    |  
|--------   |------ |---------  |---------------------  |----------------   |  
| 0         | 960   | Norbert   | 2000-01-01 00:01:10   | 3                 |  
| 0         | 969   | Sarah     | 2000-01-01 00:03:32   | 7                 |  
| 0         | 1013  | Michael   | 2000-01-01 00:02:58   | 8                 |  
| 0         | 963   | Ray       | 2000-01-01 00:05:58   | 5                 |  
| 0         | 996   | Ray       | 2000-01-01 00:03:41   | 6                 |  
以上即为该函数在 dask 数据集上的示例输出。