Python dask:提高将多个文件加载到单个数据帧的速度

Python dask:提高将多个文件加载到单个数据帧的速度,python,pandas,performance,dask,Python,Pandas,Performance,Dask,我需要相当频繁地将数千个大小适中(约100万行)的数据帧合并在一起。 虽然可以用pandas的read_csv逐个读取,但由于开销极大,这是一个糟糕的解决方案。 我需要一个更快的方案,而dask的read_csv/read_table函数显然内置了这种一次读取多个csv的功能。 然而,我没有发现这些方案在速度上有明显的提升。 有没有办法提高以下这类流程的速度? import io import re import numpy as np import dask.bag as dbag impor

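我需要相当频繁地将数千个大小适中(约100万行)的数据帧合并在一起。 虽然可以用pandas的 `read_csv` 逐个读取,但由于开销极大,这是一个糟糕的解决方案。

我需要一个更快的方案,而dask的 `read_csv` / `read_table` 函数显然内置了这种一次读取多个csv的功能。 然而,我没有发现这些方案在速度上有明显的提升。

有没有办法提高以下这类流程的速度?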



import io
import re
import numpy as np
import dask.bag as dbag
import dask.dataframe as ddf

def filter_data(fp, ix_col = 'index_here', val_col = 'some_value'):
    """Load one tab-separated file as a lazy, single-column dask frame.

    The frame is indexed by ``ix_col`` and its only column holds ``val_col``,
    renamed to the id embedded in the file name (``file_<id>.txt``).

    Args:
        fp: Path of the file to read, as a string.
        ix_col: Name of the column that becomes the index.
        val_col: Name of the column whose values are kept.

    Returns:
        A dask DataFrame with one column named after the file id.

    Raises:
        ValueError: If no file id can be derived from ``fp``.
    """
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    pattern = r"file_(.+)\.txt"

    # Validate the name before touching the file so a bad path fails fast.
    # Try the path as given first (the historical behaviour), then fall back
    # to the bare file name so paths with a directory prefix also work.
    match = re.match(pattern, fp) or re.match(pattern, re.split(r"[\\/]", fp)[-1])
    if match is None:
        raise ValueError(
            f"cannot derive a file id from {fp!r}; expected 'file_<id>.txt'"
        )

    dask_frame = ddf.read_table(fp)
    # Keep only the index and value columns (series-like frame).
    series = dask_frame[[ix_col, val_col]].set_index(ix_col)

    # Name the single remaining column after the file id.
    series.columns = [match[1]]
    return series

def get_dataframe(file_paths):
    """Combine many per-file columns into one pandas DataFrame.

    Each path is turned into a lazy single-column dask frame by
    ``filter_data``; the frames are concatenated side by side and the result
    is computed once.

    Args:
        file_paths: Iterable of paths accepted by ``filter_data``.

    Returns:
        A pandas DataFrame with one column per file; rows in which every
        column is missing are dropped.
    """
    # NOTE(review): the right-hand side of this assignment was missing in the
    # original (a syntax error). Calling filter_data per path is the
    # reconstruction implied by the surrounding comments - confirm intent.
    # The frames are already lazy, so they are built directly rather than
    # through a dask bag with an intermediate compute.
    filtered_dfs = [filter_data(fp) for fp in file_paths]

    # Concatenate column-wise; still lazy at this point.
    df = ddf.concat(filtered_dfs, axis = 1)

    # Single compute at the end turns the result into a pandas DataFrame.
    return df.dropna(how = "all").compute()

# Just write some random files here
paths = ['file_120202021.txt', 'file_123.txt', 'file_12330.txt']
for fp in paths:
    with open(fp, 'w') as f:
        # Header row: the loaders select columns by these names.
        f.write('index_here\tsome_value\tother_col\n')
        for row in range(0, 1000):
            # NOTE(review): the loop body was missing in the original (a
            # syntax error); one tab-separated row per index is the
            # reconstruction - confirm against the intended file layout.
            val, other_col = np.random.rand(2)
            f.write(f'{row}\t{val}\t{other_col}\n')

# Make a dataframe with dask
df = get_dataframe(paths)
编辑: 我这里有一个小脚本,显示了dask的不足: 在我的机器上,dask所需的时间为1.87秒,而pandas所需的时间为0.29秒。


import io
import re
import numpy as np
import pandas as pd
import dask.bag as dbag
import dask.dataframe as ddf
import time

def get_dask_dataframe(file_paths,  ix_col = 'index_here', val_col = 'some_value'):
    """Read many tab-separated files with dask and pivot to one column per file.

    Args:
        file_paths: Paths handed to ``dask.dataframe.read_table`` in one call.
        ix_col: Name of the column that becomes the index of the result.
        val_col: Name of the column whose values fill the result.

    Returns:
        A pandas DataFrame indexed by ``ix_col`` with one column per value of
        the ``path`` column; rows in which every column is missing are
        dropped.
    """
    # The previous, unused dask bag built from file_paths was removed: it did
    # no work and would have exhausted a one-shot iterable before this read.
    # read and filter to data of interest
    dask_frames = ddf.read_table(file_paths, include_path_column = True)[[ix_col, val_col, 'path']]

    # Make pandas dataframe
    df = dask_frames.compute()

    # Pivot since read_table puts path in one column.
    # NOTE(review): pivot_table aggregates repeated (index, path) pairs with
    # its default aggregation, unlike the plain concat in the pandas variant.
    df = df.pivot_table(values = val_col, index = ix_col, columns = 'path')
    return df.dropna(how = "all")

def get_pandas_dataframe(file_paths, ix_col = 'index_here', val_col = 'some_value'):
    """Read many tab-separated files with pandas into one column per file.

    Each file contributes one column, named after the id embedded in its
    file name (``file_<id>.txt``) and indexed by ``ix_col``.

    Args:
        file_paths: Iterable of string paths to tab-separated files.
        ix_col: Name of the column that becomes the index.
        val_col: Name of the column whose values are kept.

    Returns:
        A pandas DataFrame with one column per file; rows in which every
        column is missing are dropped. An empty DataFrame when
        ``file_paths`` is empty.

    Raises:
        ValueError: If no file id can be derived from one of the paths.
    """
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    # Compiled once because it is reused for every path.
    pattern = re.compile(r"file_(.+)\.txt")

    columns = []
    for path in file_paths:
        # Try the path as given first (the historical behaviour), then fall
        # back to the bare file name so directory-prefixed paths also work.
        match = pattern.match(path) or pattern.match(re.split(r"[\\/]", path)[-1])
        if match is None:
            raise ValueError(
                f"cannot derive a file id from {path!r}; expected 'file_<id>.txt'"
            )

        column = pd.read_csv(path, sep = '\t')[[ix_col, val_col]].set_index(ix_col)
        # Name the single remaining column after the file id.
        column.columns = [match[1]]
        columns.append(column)

    # pd.concat raises on an empty list; no input simply means no data.
    if not columns:
        return pd.DataFrame()

    # concatenate them together
    df = pd.concat(columns, axis = 1)
    return df.dropna(how = "all")

# Just write a whole bunch of random files
paths = ['file_'+str(i)+'.txt' for i in range(0, 100)]
for fp in paths:
    with open(fp, 'w') as f:
        # Header row: the loaders select columns by these names.
        f.write('index_here\tsome_value\tother_col\n')
        for row in range(0, 1000):
            # NOTE(review): the loop body was missing in the original (a
            # syntax error); one tab-separated row per index is the
            # reconstruction - confirm against the intended file layout.
            val, other_col = np.random.rand(2)
            f.write(f'{row}\t{val}\t{other_col}\n')

t0 = time.time()
# Make a dataframe with dask
df1 = get_dask_dataframe(paths)
t1 = time.time()
# Report the timing before t0/t1 are reused for the next measurement.
print('dask took', t1 - t0, 'seconds')

t0 = time.time()
# Make a dataframe with pandas
df2 = get_pandas_dataframe(paths)
t1 = time.time()
print('pandas took', t1 - t0, 'seconds')