如何优化python dataframe上当前使用列和行迭代完成的多个操作？_Python_Pandas_Scipy_Vectorization_Pandas Groupby

如何优化python dataframe上当前使用列和行迭代完成的多个操作？

python pandas

如何优化python dataframe上当前使用列和行迭代完成的多个操作？,python,pandas,scipy,vectorization,pandas-groupby,Python,Pandas,Scipy,Vectorization,Pandas Groupby,我目前有一个处理行和列多次迭代的过程。我相信应该有一种更有效的方法来计算输出，在数据帧上使用某种向量化函数，并结合GROUPBY；然而，我不知道怎么做这段代码在示例数据集上运行得非常快，但我在记录的竖梃上运行了多次。这是一个示例代码： import pandas as pd import numpy as np from scipy import stats data_metric1 = list(np.random.rand(100)) + list(np.random.rand(100

我目前有一个处理行和列多次迭代的过程。我相信应该有一种更有效的方法来计算输出，在数据帧上使用某种向量化函数，并结合GROUPBY；然而，我不知道怎么做

这段代码在示例数据集上运行得非常快，但我需要在数百万条记录上多次运行它。

这是一个示例代码：

import pandas as pd
import numpy as np
from scipy import stats

# Synthetic sample data for the A/B comparison below.
# Each metric list holds 200 values: 100 uniform draws from [0, 1) followed by
# 100 uniform draws shifted by either -0.1 or 0 (np.random.randint(-1, 1, ...)
# only yields -1 or 0 because its upper bound is exclusive).
data_metric1 = list(np.random.rand(100)) + list(np.random.rand(100) + np.random.randint(-1, 1, 100) * 0.1)
data_metric2 = list(np.random.rand(100)) + list(np.random.rand(100) + np.random.randint(-1, 1, 100) * 0.1)

# 100 random arm labels and 100 random dimension values.
test_label = list(np.random.choice(['test', 'control'], 100))
dimensions = list(np.random.choice(['a', 'b'], 100))

# NOTE(review): zip() stops at the shortest input (100 labels), so only the
# first 100 values of each 200-value metric list reach the frame; the shifted
# second half is dropped. Confirm whether that is intended.
df = pd.DataFrame(list(zip(test_label, dimensions, data_metric1, data_metric2)), columns=['tc', 'dim', 'metric1', 'metric2'])

# Column headers of the result table, and the module-level list that
# run_test_detail() appends one result row to per call.
cols = ['dim', 'col', 'control_count', 'test_count', 'mean_control', 'mean_test', 'lift', 'p_value_utest']
out_list = []

def run_test_detail(df, dim, col):
    """Compare one metric between the control and test arms and record it.

    Rows of ``df`` are split on the ``tc`` column into ``'control'`` and
    ``'test'`` samples of ``col``. One result row is appended to the
    module-level ``out_list``:

        [dim, col, control_count, test_count,
         mean_control, mean_test, lift, p_value_utest]

    Args:
        df: Frame containing a ``tc`` column and the metric column ``col``.
        dim: Label stored in the result row to identify the slice of data
            (the callers pass ``'all'`` or a value of the ``dim`` column).
        col: Name of the metric column to compare.

    Returns:
        None. The result row is appended to ``out_list``.
    """
    # Build each arm with a single .loc lookup instead of chained indexing
    # (df[mask][col]), which materialises an intermediate frame first.
    arm = df['tc']
    control_data = df.loc[arm == 'control', col]
    test_data = df.loc[arm == 'test', col]

    control_count = control_data.size
    test_count = test_data.size
    mean_control = control_data.mean()
    mean_test = test_data.mean()

    # Relative difference of the test mean versus the control mean, in percent.
    # A missing arm has a NaN mean, so the lift is NaN as well.
    lift = (mean_test - mean_control) / mean_control * 100

    # The U test is undefined when either sample is empty, and SciPy's
    # behaviour for that case varies between versions (ValueError vs. NaN with
    # a warning). Report NaN explicitly so one slice that lacks an arm cannot
    # abort the whole run.
    if control_count and test_count:
        p_value_utest = stats.mannwhitneyu(control_data, test_data)[1]
    else:
        p_value_utest = float('nan')

    r = [dim, col, control_count, test_count, mean_control, mean_test, lift, p_value_utest]
    out_list.append(r)


def run_test(df):
    """Run ``run_test_detail`` for every metric, overall and per ``dim`` value.

    Every column of ``df`` other than ``tc`` and ``dim`` is treated as a
    metric. Each metric is first evaluated on the whole frame under the label
    ``'all'``, then on each subset of rows sharing one value of ``dim``, in
    the order those values first appear.

    Args:
        df: Frame with ``tc`` and ``dim`` columns plus one or more metrics.
    """
    dim_values = df['dim'].unique()
    metric_names = [name for name in df.columns if name not in ('tc', 'dim')]

    def _slices():
        # Whole frame first, then one filtered frame per dimension value.
        # Each subset is built only when the loop below reaches it.
        yield 'all', df
        for value in dim_values:
            yield value, df[df['dim'] == value]

    for label, frame in _slices():
        for metric in metric_names:
            run_test_detail(frame, label, metric)

# Run every comparison; run_test_detail() appends each result row to the
# module-level ``out_list``.
run_test(df)
# One row per (slice, metric) pair, labelled with the headers in ``cols``.
out_df = pd.DataFrame(out_list, columns=cols)
# NOTE(review): ``tabulate`` is not imported in the visible snippet; this line
# needs ``from tabulate import tabulate`` (third-party package) to run.
print(tabulate(out_df, headers='keys', showindex=False))
dim    col        control_count    test_count    mean_control    mean_test       lift    p_value_utest
-----  -------  ---------------  ------------  --------------  -----------  ---------  ---------------
all    metric1               56            44        0.520419     0.499656   -3.98968        0.285719
all    metric2               56            44        0.460029     0.540992   17.5995         0.0798384
b      metric1               28            22        0.53479      0.546289    2.15024        0.488306
b      metric2               28            22        0.496291     0.596768   20.2456         0.114689
a      metric1               28            22        0.506048     0.453023  -10.4783         0.22006
a      metric2               28            22        0.423766     0.485215   14.5006         0.157083

我已经使用多进程（multiprocessing）重构了您的解决方案。如果您有很多度量和维度，那么它应该可以很好地扩展。`run_test_detail` 可以通过一些缓存进一步优化，但我不知道您的数据有多大，因此可能会出现问题。此外，如果需要一些标准的统计程序，我会检查（原文此处的链接已丢失）

请让我知道，如果它提高了你的执行时间

import pandas as pd
import numpy as np
from scipy import stats
import multiprocessing as mp


# Synthetic sample data for the A/B comparison below.
# Each metric list holds 200 values: 100 uniform draws from [0, 1) followed by
# 100 uniform draws shifted by either -0.1 or 0 (np.random.randint(-1, 1, ...)
# only yields -1 or 0 because its upper bound is exclusive).
data_metric1 = list(np.random.rand(100)) + list(np.random.rand(100) + np.random.randint(-1, 1, 100) * 0.1)
data_metric2 = list(np.random.rand(100)) + list(np.random.rand(100) + np.random.randint(-1, 1, 100) * 0.1)

# 100 random arm labels and 100 random dimension values.
test_label = list(np.random.choice(['test', 'control'], 100))
dimensions = list(np.random.choice(['a', 'b'], 100))

# NOTE(review): zip() stops at the shortest input (100 labels), so only the
# first 100 values of each 200-value metric list reach the frame; the shifted
# second half is dropped. Confirm whether that is intended.
df = pd.DataFrame(list(zip(test_label, dimensions, data_metric1, data_metric2)), columns=['tc', 'dim', 'metric1', 'metric2'])

##############

def run_test_detail(df, dim, col):
    """Compare one metric between the control and test arms of ``df``.

    Rows of ``df`` are split on the ``tc`` column into ``'control'`` and
    ``'test'`` samples of ``col``.

    Args:
        df: Frame containing a ``tc`` column and the metric column ``col``.
        dim: Label stored in the result row to identify the slice of data
            (the callers pass ``'all'`` or a value of the ``dim`` column).
        col: Name of the metric column to compare.

    Returns:
        A list ``[dim, col, control_count, test_count, mean_control,
        mean_test, lift, p_value_utest]``. ``lift`` is the percentage
        difference of the test mean relative to the control mean.
        ``p_value_utest`` is the Mann-Whitney U p-value, or NaN when either
        arm has no rows.
    """
    # Build each arm with a single .loc lookup instead of chained indexing
    # (df[mask][col]), which materialises an intermediate frame first.
    arm = df['tc']
    control_data = df.loc[arm == 'control', col]
    test_data = df.loc[arm == 'test', col]

    control_count = control_data.size
    test_count = test_data.size
    mean_control = control_data.mean()
    mean_test = test_data.mean()

    # A missing arm has a NaN mean, so the lift is NaN as well.
    lift = (mean_test - mean_control) / mean_control * 100

    # The U test is undefined when either sample is empty, and SciPy's
    # behaviour for that case varies between versions (ValueError vs. NaN with
    # a warning). Report NaN explicitly so one slice that lacks an arm cannot
    # fail the whole batch of pool tasks.
    if control_count and test_count:
        p_value_utest = stats.mannwhitneyu(control_data, test_data)[1]
    else:
        p_value_utest = float('nan')

    return [dim, col, control_count, test_count, mean_control, mean_test, lift, p_value_utest]


##############

# Metric columns are every column except the arm label and the dimension.
# Both lookups are loop-invariant, so compute them once up front.
metric_cols = df.columns.difference(['tc', 'dim'])
dim_values = df['dim'].unique()

# NOTE(review): on platforms whose default start method is "spawn", worker
# processes re-import this module, so the pool creation below should live
# under ``if __name__ == '__main__':`` when this runs as a real script.
#
# The ``with`` block shuts the worker processes down once both batches have
# returned; the original left the pool open for the life of the interpreter.
with mp.Pool(processes=mp.cpu_count()) as pool:
    # Whole-frame comparison: one task per metric.
    result1 = pool.starmap(run_test_detail, [(df, 'all', col)
                           for col in metric_cols])

    # Per-dimension comparison: one task per (metric, dim value) pair.
    # Each task receives its own filtered copy of the frame.
    result2 = pool.starmap(run_test_detail, [(df[df['dim'] == dim], dim, col)
                           for col in metric_cols
                           for dim in dim_values])

##############

# starmap returns results in task order: whole-frame rows first, then the
# per-dimension rows.
cols = ['dim', 'col', 'control_count', 'test_count', 'mean_control', 'mean_test', 'lift', 'p_value_utest']
out_df = pd.DataFrame(data=result1+result2, columns=cols)

为什么您需要在列上迭代？您不能直接执行下面的操作吗：

df[cols]