Python dask: improving the speed of loading multiple files into a single dataframe

I'm merging thousands of reasonably sized (~1 million row) dataframes together on a fairly regular basis. While I can get pandas to do this with read_csv, it is a terrible solution because of the enormous overhead.

I need a faster solution, and dask apparently has this multi-csv capability built into its read_csv/read_table functions.
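
For reference, the multi-file form I mean is just handing read_table a glob pattern (or a list of paths) and computing once at the end (a minimal sketch, using the same column names as the scripts below):

import dask.dataframe as ddf

# one lazy frame over every matching file; nothing is read yet
dask_frame = ddf.read_table('file_*.txt')

# the actual reads only happen at compute()
df = dask_frame[['index_here', 'some_value']].compute()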

However, I have not noticed much of a speed improvement from those.

Is there any way to improve the speed of the following kind of process?

import io
import re
import numpy as np
import dask.bag as dbag
import dask.dataframe as ddf

def filter_data(fp, ix_col = 'index_here', val_col = 'some_value'):
    dask_frame = ddf.read_table(fp)
    # filter to only one column and index (like a series)
    series = dask_frame[[ix_col, val_col]].set_index(ix_col)

    # Rename it to be the filename / file_id
    file_id = re.match(r"file_(.+)\.txt", fp)[1]
    series.columns = [file_id]
    return series

def get_dataframe(file_paths):
    # Make a collection
    dasks_bag = dbag.from_sequence(file_paths)

    # Open the files as dask frame and filter each to series-like frames
    filtered_dfs = dasks_bag.map(filter_data)

    # Compute pandas dataframe on each within the list
    filtered_dfs = filtered_dfs.compute()

    # concatenate them together
    df = ddf.concat(filtered_dfs, axis = 1)

    # Compute on concatenated again, so it becomes pandas dataframe
    return df.dropna(how = "all").compute()



# Just write some random files here
paths = ['file_120202021.txt', 'file_123.txt', 'file_12330.txt']
for fp in paths:
    with open(fp, 'w') as f:
        f.write('index_here\tsome_value\tother_cols\n')
        for row in range(0,1000):
            for val, other_col in np.random.rand(1, 2):
                f.write(str(row)+'\t'+str(val)+'\t'+str(other_col)+'\n')

# Make a dataframe with dask
get_dataframe(paths)
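
For reference, a quick sanity check of the result on the three sample files above (a minimal sketch; the random values vary, but the shape and column names should not):

df = get_dataframe(paths)
# one row per index value, one column per file id parsed from the filename
print(df.shape)          # expected: (1000, 3)
print(list(df.columns))  # expected: ['120202021', '123', '12330']
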
Edit: I have a small script here that shows dask falling short: on my machine the dask version takes 1.87 seconds, while the pandas version takes 0.29 seconds.

Clearly I am doing something wrong here, since dask was built specifically for faster computation on dataframes.

import io
import re
import numpy as np
import pandas as pd
import dask.bag as dbag
import dask.dataframe as ddf
import time

def get_dask_dataframe(file_paths,  ix_col = 'index_here', val_col = 'some_value'):

    # read and filter to data of interest
    dask_frames = ddf.read_table(file_paths, include_path_column = True)[[ix_col, val_col, 'path']]

    # Make pandas dataframe
    df = dask_frames.compute()

    # Pivot since read_table puts path in one column
    df = df.pivot_table(values = val_col, index = ix_col, columns = 'path')
    return df.dropna(how = "all")

def get_pandas_dataframe(file_paths, ix_col = 'index_here', val_col = 'some_value'):
    # Make a collection
    l = []
    for f in file_paths:
        series = pd.read_csv(f, sep = '\t')[[ix_col, val_col]].set_index(ix_col)
        # Rename it to be the filename / file_id
        file_id = re.match(r"file_(.+)\.txt", f)[1]
        series.columns = [file_id]
        l += [series]

    # concatenate them together
    df = pd.concat(l, axis = 1)
    return df.dropna(how = "all")


# Just write a whole bunch of random files
paths = ['file_'+str(i)+'.txt' for i in range(0, 100)]
for fp in paths:
    with open(fp, 'w') as f:
        f.write('index_here\tsome_value\tother_cols\n')
        for row in range(0,1000):
            for val, other_col in np.random.rand(1, 2):
                f.write(str(row)+'\t'+str(val)+'\t'+str(other_col)+'\n')

t0 = time.time()
# Make a dataframe with dask
df1 = get_dask_dataframe(paths)
t1 = time.time()
print(t1-t0)

t0 = time.time()
# Make a dataframe with pandas
df2 = get_pandas_dataframe(paths)
t1 = time.time()
print(t1-t0)
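
For completeness, another variant of the same idea is to wrap the per-file pandas read in dask.delayed and concatenate the delayed pieces with a single delayed pd.concat (a minimal sketch only; load_one and get_delayed_dataframe are illustrative names and are not part of the timings above):

import re
import dask
import pandas as pd

@dask.delayed
def load_one(fp, ix_col = 'index_here', val_col = 'some_value'):
    # plain pandas read of a single file, renamed to its file id
    series = pd.read_csv(fp, sep = '\t')[[ix_col, val_col]].set_index(ix_col)
    series.columns = [re.match(r"file_(.+)\.txt", fp)[1]]
    return series

def get_delayed_dataframe(file_paths):
    # build one delayed task per file, then a single delayed concat over all of them
    parts = [load_one(fp) for fp in file_paths]
    df = dask.delayed(pd.concat)(parts, axis = 1).compute()
    return df.dropna(how = "all")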