Python dask：提高将多个文件加载到单个数据帧的速度_Python_Pandas_Performance_Dask

Python dask：提高将多个文件加载到单个数据帧的速度

python pandas performance dask

Python dask：提高将多个文件加载到单个数据帧的速度,python,pandas,performance,dask,Python,Pandas,Performance,Dask,我正在相当定期地将数千个大小合理（约100万行）的数据帧合并在一起。虽然我可以让pandas使用read_csv，但由于开销极大，这是一个糟糕的解决方案。我需要一个更快的解决方案，dask显然在其read_csv/read_table函数中加入了这种多csv功能。然而，我没有注意到这些解决方案在速度上有多大的提高。有没有办法提高以下类型流程的速度？ import io import re import numpy as np import dask.bag as dbag impor

我正在相当定期地将数千个大小合理（约100万行）的数据帧合并在一起。虽然我可以让pandas使用

read_csv

，但由于开销极大，这是一个糟糕的解决方案

我需要一个更快的解决方案，dask显然在其

read_csv/read_table

函数中加入了这种多csv功能

然而，我没有注意到这些解决方案在速度上有多大的提高

有没有办法提高以下类型流程的速度

import io
import re
import numpy as np
import dask.bag as dbag
import dask.dataframe as ddf

def filter_data(fp, ix_col = 'index_here', val_col = 'some_value'):
    """Load one tab-separated file with dask as a single-column frame.

    The result keeps only ``val_col``, is indexed by ``ix_col``, and has its
    one column renamed to the ``<id>`` part of the ``file_<id>.txt`` name.

    Args:
        fp: Path of the file to read; its name must look like
            ``file_<id>.txt``.
        ix_col: Name of the column used as the index.
        val_col: Name of the column holding the values to keep.

    Returns:
        A dask dataframe with index ``ix_col`` and one column named ``<id>``.

    Raises:
        ValueError: If no file id can be extracted from ``fp``.
    """
    # Raw string: "\." in a normal string literal is an invalid escape.
    file_id_pattern = r"file_(.+)\.txt"
    # Try the path as given first (the original behaviour), then only its
    # last component so that paths with a directory prefix also work.
    found = (re.match(file_id_pattern, fp)
             or re.match(file_id_pattern, re.split(r"[\\/]", fp)[-1]))
    if found is None:
        raise ValueError(
            f"cannot extract a file id from {fp!r}; "
            "expected a name like 'file_<id>.txt'"
        )

    dask_frame = ddf.read_table(fp)
    # filter to only one column and index (like a series)
    series = dask_frame[[ix_col, val_col]].set_index(ix_col)

    # Rename it to be the filename / file_id
    series.columns = [found[1]]
    return series

def get_dataframe(file_paths):
    """Combine many files into one pandas dataframe, one column per file.

    Each path is passed through ``filter_data`` via a dask bag, the resulting
    frames are concatenated column-wise, and rows in which every value is
    missing are dropped.

    Args:
        file_paths: Sequence of file paths accepted by ``filter_data``.

    Returns:
        The computed result of the concatenation after ``dropna(how="all")``.
    """
    # Distribute the paths through a bag and evaluate filter_data on each one;
    # computing the bag yields the list of per-file frames.
    per_file_frames = dbag.from_sequence(file_paths).map(filter_data).compute()

    # Place the per-file frames side by side.
    combined = ddf.concat(per_file_frames, axis = 1)

    # Drop rows that are empty in every file, then materialise the result.
    without_empty_rows = combined.dropna(how = "all")
    return without_empty_rows.compute()



# Generate a few small tab-separated sample files to load
paths = ['file_120202021.txt', 'file_123.txt', 'file_12330.txt']
for fp in paths:
    with open(fp, 'w') as f:
        # Header row naming the three columns
        f.write('index_here\tsome_value\tother_cols\n')
        for row in range(1000):
            # rand(1, 2) holds exactly one row with the two random values
            val, other_col = np.random.rand(1, 2)[0]
            f.write('\t'.join([str(row), str(val), str(other_col)]) + '\n')

# Build the combined dataframe with dask
get_dataframe(paths)

编辑：我这里有一个小脚本，显示了 dask 的问题：在我的机器上，dask 版本耗时 1.87 秒，而 pandas 版本只需 0.29 秒。

显然，我这样做是错误的，因为dask是专门为数据帧上更快速的计算而设计的

import io
import re
import numpy as np
import pandas as pd
import dask.bag as dbag
import dask.dataframe as ddf
import time

def get_dask_dataframe(file_paths,  ix_col = 'index_here', val_col = 'some_value'):
    """Read all files with one dask ``read_table`` call and pivot to wide form.

    Args:
        file_paths: Paths of the tab-separated files to read.
        ix_col: Name of the column that becomes the index of the result.
        val_col: Name of the column holding the values to keep.

    Returns:
        A pandas dataframe indexed by ``ix_col`` with one column per distinct
        value of dask's ``path`` column, with all-missing rows dropped.
    """
    # read and filter to data of interest; include_path_column adds a 'path'
    # column so each row can be attributed to its source file
    dask_frames = ddf.read_table(file_paths, include_path_column = True)[[ix_col, val_col, 'path']]

    # Make pandas dataframe
    df = dask_frames.compute()

    # Pivot since read_table puts path in one column
    # NOTE(review): pivot_table aggregates duplicate (index, path) pairs
    # (pandas' default aggfunc); confirm that index values are unique per file.
    df = df.pivot_table(values = val_col, index = ix_col, columns = 'path')
    return df.dropna(how = "all")

def get_pandas_dataframe(file_paths, ix_col = 'index_here', val_col = 'some_value'):
    """Read each file with pandas and join them into one wide dataframe.

    Every file contributes one column, named after the ``<id>`` part of its
    ``file_<id>.txt`` name and indexed by ``ix_col``.

    Args:
        file_paths: Iterable of paths to tab-separated files.
        ix_col: Name of the column used as the index.
        val_col: Name of the column holding the values to keep.

    Returns:
        A pandas dataframe with one column per file and all-missing rows
        dropped; an empty dataframe when ``file_paths`` is empty.

    Raises:
        ValueError: If no file id can be extracted from one of the paths.
    """
    # Raw string: "\." in a normal string literal is an invalid escape.
    # Compiled once because it is reused for every file.
    file_id_pattern = re.compile(r"file_(.+)\.txt")

    frames = []
    for f in file_paths:
        # Try the path as given first (the original behaviour), then only its
        # last component so that paths with a directory prefix also work.
        found = (file_id_pattern.match(f)
                 or file_id_pattern.match(re.split(r"[\\/]", f)[-1]))
        if found is None:
            raise ValueError(
                f"cannot extract a file id from {f!r}; "
                "expected a name like 'file_<id>.txt'"
            )

        # Parse only the two needed columns instead of reading every column
        # and discarding the rest afterwards.
        series = pd.read_csv(f, sep = '\t', usecols = [ix_col, val_col],
                             index_col = ix_col)
        # Rename it to be the filename / file_id
        series.columns = [found[1]]
        frames.append(series)

    # pd.concat raises on an empty list, so handle "no files" explicitly.
    if not frames:
        return pd.DataFrame()

    # concatenate them together
    df = pd.concat(frames, axis = 1)
    return df.dropna(how = "all")


# Just write a whole bunch of random files
paths = ['file_'+str(i)+'.txt' for i in range(0, 100)]
for fp in paths:
    with open(fp, 'w') as f:
        f.write('index_here\tsome_value\tother_cols\n')
        for row in range(0,1000):
            # rand(1, 2) holds a single row, so this loop body runs once
            for val, other_col in np.random.rand(1, 2):
                f.write(str(row)+'\t'+str(val)+'\t'+str(other_col)+'\n')

# perf_counter is the clock intended for measuring short intervals;
# time.time() follows the wall clock, which can be adjusted mid-run.
t0 = time.perf_counter()
# Make a dataframe with dask
df1 = get_dask_dataframe(paths)
t1 = time.perf_counter()
print(t1-t0)

t0 = time.perf_counter()
# Make a dataframe with pandas
df2 = get_pandas_dataframe(paths)
t1 = time.perf_counter()
print(t1-t0)