Python dask:提高将多个文件加载到单个数据帧的速度
标签:python, pandas, performance, dask
我需要相当频繁地将数千个大小适中(每个约 100 万行)的数据帧合并在一起。虽然可以让 pandas 用 `read_csv` 逐个读取,但开销极大,这是一个糟糕的解决方案。
我需要一个更快的方案;dask 的 `read_csv` / `read_table` 函数显然已经支持一次读取多个 CSV 文件。
然而,我没有发现这些方案在速度上有明显的提升。
有没有办法提高以下这类流程的速度?
import io
import re
import numpy as np
import dask.bag as dbag
import dask.dataframe as ddf
def filter_data(fp, ix_col='index_here', val_col='some_value'):
    """Load one tab-separated file as a single-column dask frame.

    The file is read with ``ddf.read_table``, reduced to ``val_col`` indexed
    by ``ix_col``, and the remaining column is renamed to the id captured
    from the file name.

    Args:
        fp: Path of the form ``file_<id>.txt``. The pattern is matched from
            the start of the string.
        ix_col: Name of the column that becomes the index.
        val_col: Name of the value column to keep.

    Returns:
        A dask DataFrame indexed by ``ix_col`` whose only column is named
        after the file id.

    Raises:
        ValueError: If ``fp`` does not match ``file_<id>.txt``.
    """
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    # Validate the name before doing any file work so a bad path fails with
    # a clear message instead of "'NoneType' object is not subscriptable".
    match = re.match(r"file_(.+)\.txt", fp)
    if match is None:
        raise ValueError(
            f"cannot extract a file id from {fp!r}; expected 'file_<id>.txt'"
        )
    file_id = match[1]

    dask_frame = ddf.read_table(fp)
    # Keep only the index and value columns (a series-like frame).
    series = dask_frame[[ix_col, val_col]].set_index(ix_col)
    # Name the value column after the file it came from.
    series.columns = [file_id]
    return series
def get_dataframe(file_paths):
    """Combine the value column of several files into one computed frame.

    Each path is turned into a single-column dask frame by ``filter_data``;
    the frames are concatenated column-wise, rows that are empty in every
    column are dropped, and the result is computed once at the end.

    Args:
        file_paths: Iterable of paths accepted by ``filter_data``.

    Returns:
        The result of ``compute()`` on the concatenated dask frame, with one
        column per input file.
    """
    # The dask bag was only used to call filter_data on each path, and its
    # compute() returned dask frames (not pandas ones). Calling filter_data
    # directly yields the same list without the extra scheduler pass.
    filtered_dfs = [filter_data(fp) for fp in file_paths]
    # Line the per-file columns up side by side on their shared index.
    df = ddf.concat(filtered_dfs, axis=1)
    # Single compute at the end turns the dask frame into the final result.
    return df.dropna(how="all").compute()
# Generate a few small tab-separated sample files to read back.
paths = ['file_120202021.txt', 'file_123.txt', 'file_12330.txt']
for fp in paths:
    with open(fp, 'w') as f:
        # Header row naming the three columns.
        f.write('index_here\tsome_value\tother_cols\n')
        for row in range(1000):
            # One draw of two random floats per data row.
            val, other_col = np.random.rand(1, 2)[0]
            f.write('\t'.join((str(row), str(val), str(other_col))) + '\n')

# Build the combined dataframe with dask.
get_dataframe(paths)
编辑:
下面这个小脚本展示了 dask 的表现不如预期:
在我的机器上,dask 版本耗时 1.87 秒,
而 pandas 版本只需 0.29 秒。
显然是我的用法有问题,因为 dask 正是为了加快数据帧上的计算而设计的。
import io
import re
import numpy as np
import pandas as pd
import dask.bag as dbag
import dask.dataframe as ddf
import time
def get_dask_dataframe(file_paths, ix_col='index_here', val_col='some_value'):
    """Read many tab-separated files with dask and pivot them into one frame.

    All files are read in a single ``ddf.read_table`` call with the source
    path recorded in a ``'path'`` column. The result is computed to pandas
    and pivoted so that each path becomes its own column.

    Args:
        file_paths: Paths passed to ``ddf.read_table``.
        ix_col: Name of the column used as the index of the result.
        val_col: Name of the column whose values fill the result.

    Returns:
        A pandas DataFrame indexed by ``ix_col`` with one column per path,
        with rows that are empty in every column removed.
    """
    # The original also built a dask bag from file_paths that was never
    # used; it has been dropped.
    # Read everything at once and keep only the columns of interest.
    dask_frames = ddf.read_table(
        file_paths, include_path_column=True
    )[[ix_col, val_col, 'path']]
    # Materialize as a pandas dataframe.
    df = dask_frames.compute()
    # read_table stacks all files and stores the path in one column, so
    # pivot to get one column per file.
    df = df.pivot_table(values=val_col, index=ix_col, columns='path')
    return df.dropna(how="all")
def get_pandas_dataframe(file_paths, ix_col='index_here', val_col='some_value'):
    """Read many tab-separated files with pandas into one wide frame.

    Each file contributes one column, named after the id captured from its
    file name, indexed by ``ix_col``. The columns are concatenated side by
    side and rows that are empty in every column are dropped.

    Args:
        file_paths: Iterable of paths of the form ``file_<id>.txt``. The
            pattern is matched from the start of each string.
        ix_col: Name of the column that becomes the index.
        val_col: Name of the value column to keep.

    Returns:
        A pandas DataFrame indexed by ``ix_col`` with one column per file.

    Raises:
        ValueError: If a path does not match ``file_<id>.txt``. pandas also
            raises ``ValueError`` when ``file_paths`` is empty or a requested
            column is missing from a file.
    """
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    # Compiled once because it is reused for every path.
    id_pattern = re.compile(r"file_(.+)\.txt")

    frames = []
    for f in file_paths:
        match = id_pattern.match(f)
        if match is None:
            raise ValueError(
                f"cannot extract a file id from {f!r}; expected 'file_<id>.txt'"
            )
        # usecols skips parsing the columns that would be discarded anyway.
        frame = pd.read_csv(
            f, sep='\t', usecols=[ix_col, val_col]
        ).set_index(ix_col)
        # Name the value column after the file it came from.
        frame.columns = [match[1]]
        frames.append(frame)

    # Line the per-file columns up side by side on their shared index.
    df = pd.concat(frames, axis=1)
    return df.dropna(how="all")
# Write 100 small tab-separated sample files to benchmark against.
paths = ['file_'+str(i)+'.txt' for i in range(0, 100)]
for fp in paths:
    with open(fp, 'w') as f:
        f.write('index_here\tsome_value\tother_cols\n')
        for row in range(0, 1000):
            for val, other_col in np.random.rand(1, 2):
                f.write(str(row)+'\t'+str(val)+'\t'+str(other_col)+'\n')

# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() follows the wall clock and can jump if the system time changes.
t0 = time.perf_counter()
# Make a dataframe with dask
df1 = get_dask_dataframe(paths)
t1 = time.perf_counter()
print(t1-t0)

t0 = time.perf_counter()
# Make a dataframe with pandas
df2 = get_pandas_dataframe(paths)
t1 = time.perf_counter()
print(t1-t0)