dask `apply_along_axis` 的性能
我试图使用 dask 在一个大型高分辨率海洋模型数据集上计算随时间的线性趋势。我遵循了这个示例(),发现 dask `apply_along_axis` 的性能,dask,python-xarray,Dask,Python Xarray,我试图使用 dask 在一个大型高分辨率海洋模型数据集上计算随时间的线性趋势。我遵循了这个示例(),发现 `apply_along_axis` 的语法更简单。我目前正在使用 `dask.array.apply_along_axis` 将一个 numpy 函数包装起来应用到一维数组上,然后将生成的 dask 数组打包成 xarray `DataArray`。使用 `top -u` 表明计算不是并行执行的(CPU 使用率约为 100%)。我是否应该期望 `map_blocks` 具有更好的性能?或者有没有关于如何改进 `apply_along_axis` 性能的建议?任何提示都非常感谢 import numpy as np fro
`apply_along_axis` 的语法更简单。
我目前正在使用 `dask.array.apply_along_axis`
将一个 numpy 函数包装起来应用到一维数组上,然后将生成的 dask 数组打包成 xarray `DataArray`
。使用 `top -u`
表明计算不是并行执行的(CPU 使用率约为 100%)。
我是否应该期望 `map_blocks`
具有更好的性能?或者有没有关于如何改进 `apply_along_axis`
的性能的建议?
任何提示都非常感谢。
import numpy as np
from scipy import optimize
import xarray as xr
import dask.array as dsa
def _lin_trend(y):
    """Fit a first-degree polynomial to ``y`` sampled at 0, 1, 2, ...

    Returns the coefficient array produced by ``np.polyfit`` for degree 1,
    i.e. ``[slope, intercept]``.
    """
    sample_index = np.arange(len(y))
    coefficients = np.polyfit(sample_index, y, 1)
    return coefficients
def linear_trend(da, dim, name='parameter'):
    """Fit a straight line along ``dim`` for every point of ``da``.

    ``_lin_trend`` is applied to each 1-D series along ``dim`` through
    ``dask.array.apply_along_axis``; the fitted dimension is replaced by a
    length-2 dimension called ``name`` labelled ``['slope', 'intercept']``.

    Parameters
    ----------
    da : xarray.DataArray
        Input array containing the dimension ``dim``.
    dim : str
        Name of the dimension to fit along.
    name : str, optional
        Name of the output dimension holding the fit parameters.

    Returns
    -------
    xarray.DataArray
        Array with ``dim`` replaced by ``name``.
    """
    axis_num = da.get_axis_num(dim)

    dims = list(da.dims)
    dims[axis_num] = name

    # Drop every coordinate that varies along ``dim``: its length matches the
    # original axis, not the new length-2 parameter axis, so carrying it over
    # would conflict with the output shape.
    coords = {key: coord for key, coord in da.coords.items()
              if dim not in coord.dims}
    coords[name] = ['slope', 'intercept']

    # Fit along the axis that actually corresponds to ``dim``.  The previous
    # hard-coded axis 0 was only correct when ``dim`` was the leading
    # dimension.  No defensive copy is needed: ``da`` is never mutated.
    dsk_trend = dsa.apply_along_axis(_lin_trend, axis_num, da.data)

    return xr.DataArray(dsk_trend, dims=dims, coords=coords)
我一直在使用 xarray 的 `apply_ufunc`
(需要 xarray v0.10 或更高版本)做类似的事情。这可能比在 dask 中使用 `apply_along_axis`
函数更容易管理。
import xarray as xr
import numpy as np
from scipy import stats
def _calc_slope(x, y):
    """Return only the slope of the linear regression of ``y`` on ``x``."""
    regression = stats.linregress(x, y)
    # The first element of the regression result is the slope.
    return regression[0]
def linear_trend(obj):
    """Compute the linear-regression slope along ``'time'`` for ``obj``.

    The ``'time'`` coordinate is converted to floats and used as the
    regression x-values; ``_calc_slope`` is then applied to every 1-D time
    series through ``xr.apply_ufunc``.

    Parameters
    ----------
    obj : xarray.Dataset or xarray.DataArray
        Object with a ``'time'`` dimension/coordinate.

    Returns
    -------
    Same container type as ``obj`` with the ``'time'`` dimension reduced
    away, holding the slope for each remaining point.
    """
    # ``np.float`` was only an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, so use ``float`` directly.
    time_nums = xr.DataArray(
        obj['time'].values.astype(float),
        dims='time',
        coords={'time': obj['time']},
        name='time_nums',
    )
    trend = xr.apply_ufunc(
        _calc_slope, time_nums, obj,
        vectorize=True,
        input_core_dims=[['time'], ['time']],
        output_core_dims=[[]],
        output_dtypes=[float],
        dask='parallelized',
    )
    return trend
至于您的问题——为什么性能不如预期——这可能有很多原因。dask 数组是如何分块的?您正在使用哪个 dask 调度器(scheduler)?在我更好地了解您的配置之后,我将更新我答案的第二部分。我认为性能最终会受到我正在使用的文件系统的限制。不过,为了回答您的问题,我的数据集具有以下形状:
<xarray.Dataset>
Dimensions: (st_edges_ocean: 51, st_ocean: 50, time: 101, xt_ocean: 3600, yt_ocean: 2700)
Coordinates:
* xt_ocean (xt_ocean) float64 -279.9 -279.8 -279.7 -279.6 -279.5 ...
* yt_ocean (yt_ocean) float64 -81.11 -81.07 -81.02 -80.98 -80.94 ...
* st_ocean (st_ocean) float64 5.034 15.1 25.22 35.36 45.58 55.85 ...
* st_edges_ocean (st_edges_ocean) float64 0.0 10.07 20.16 30.29 40.47 ...
* time (time) float64 3.634e+04 3.671e+04 3.707e+04 3.744e+04 ...
现在,我定义了一组私有函数(分别使用 `linregress` 和 `polyfit` 来计算时间序列的斜率),以及分别基于 `dask.array.apply_along_axis` 和 `xarray.apply_ufunc` 的不同实现
def _calc_slope_poly(y):
    """Return the slope of a degree-1 ``np.polyfit`` of ``y`` against 0..len(y)-1.

    Used as the per-series kernel by the ``*_poly`` trend functions.
    """
    positions = np.arange(len(y))
    fit = np.polyfit(positions, y, 1)
    # Coefficients come highest degree first, so index 0 is the slope.
    return fit[0]
def _calc_slope(y):
    """Return the slope of ``stats.linregress`` of ``y`` against 0..len(y)-1."""
    positions = np.arange(len(y))
    regression = stats.linregress(positions, y)
    # The first element of the regression result is the slope.
    return regression[0]
def linear_trend_along(da, dim):
    """Compute the linear-regression slope along ``dim`` for every grid point.

    ``_calc_slope`` is applied to each 1-D series along ``dim`` through
    ``dask.array.apply_along_axis``.

    Parameters
    ----------
    da : xarray.DataArray
        Input array containing the dimension ``dim``.
    dim : str
        Name of the dimension to fit along.

    Returns
    -------
    dask array
        Slopes only (no intercept), with the ``dim`` axis removed.  The
        result is the raw dask array; it is not wrapped in a DataArray.
    """
    axis_num = da.get_axis_num(dim)
    # No defensive copy of ``da``: it is only read, never mutated.
    trend = dsa.apply_along_axis(_calc_slope, axis_num, da.data)
    return trend
def linear_trend_ufunc(obj, dim):
    """Compute the linear-regression slope along ``dim`` via ``xr.apply_ufunc``.

    ``_calc_slope`` is vectorized over all non-core dimensions and applied
    to each 1-D series along ``dim``.

    Parameters
    ----------
    obj : xarray.Dataset or xarray.DataArray
        Object containing the dimension ``dim``.
    dim : str
        Name of the dimension to fit along.

    Returns
    -------
    Same container type as ``obj`` with ``dim`` reduced away, holding the
    slope for each remaining point.
    """
    trend = xr.apply_ufunc(
        _calc_slope, obj,
        vectorize=True,
        input_core_dims=[[dim]],
        output_core_dims=[[]],
        # ``np.float`` (an alias of builtin ``float``) was deprecated in
        # NumPy 1.20 and removed in 1.24; ``float`` is the same dtype.
        output_dtypes=[float],
        dask='parallelized',
    )
    return trend
def linear_trend_ufunc_poly(obj, dim):
    """Compute the ``np.polyfit`` slope along ``dim`` via ``xr.apply_ufunc``.

    ``_calc_slope_poly`` is vectorized over all non-core dimensions and
    applied to each 1-D series along ``dim``.

    Parameters
    ----------
    obj : xarray.Dataset or xarray.DataArray
        Object containing the dimension ``dim``.
    dim : str
        Name of the dimension to fit along.

    Returns
    -------
    Same container type as ``obj`` with ``dim`` reduced away, holding the
    slope for each remaining point.
    """
    trend = xr.apply_ufunc(
        _calc_slope_poly, obj,
        vectorize=True,
        input_core_dims=[[dim]],
        output_core_dims=[[]],
        # ``np.float`` (an alias of builtin ``float``) was deprecated in
        # NumPy 1.20 and removed in 1.24; ``float`` is the same dtype.
        output_dtypes=[float],
        dask='parallelized',
    )
    return trend
def linear_trend_along_poly(da, dim):
    """Compute the ``np.polyfit`` slope along ``dim`` for every grid point.

    ``_calc_slope_poly`` is applied to each 1-D series along ``dim``
    through ``dask.array.apply_along_axis``.

    Parameters
    ----------
    da : xarray.DataArray
        Input array containing the dimension ``dim``.
    dim : str
        Name of the dimension to fit along.

    Returns
    -------
    dask array
        Slopes only (no intercept), with the ``dim`` axis removed.  The
        result is the raw dask array; it is not wrapped in a DataArray.
    """
    axis_num = da.get_axis_num(dim)
    # No defensive copy of ``da``: it is only read, never mutated.
    trend = dsa.apply_along_axis(_calc_slope_poly, axis_num, da.data)
    return trend
# Build the four trend estimates along the 't' dimension of ``da`` so the
# apply_ufunc and apply_along_axis variants can be compared, each once with
# the linregress kernel and once with the polyfit kernel.
# NOTE(review): presumably these are lazy dask-backed results that only run
# when ``.compute()`` is called - confirm that ``da`` is chunked.
trend_ufunc = linear_trend_ufunc(da, 't')
trend_ufunc_poly = linear_trend_ufunc_poly(da, 't')
trend_along = linear_trend_along(da, 't')
trend_along_poly = linear_trend_along_poly(da, 't')
计时结果似乎表明,`apply_along_axis`
方法可能会稍微快一些。然而,使用 `polyfit`(多项式拟合)代替 `linregress`(线性回归)似乎有很大的影响。我不知道为什么前者要快得多,但也许你对此感兴趣
%%timeit
print(trend_ufunc[1,1,1].data.compute())
4.89 s ± 180 ms 每次循环(7 次运行的平均值 ± 标准差,每次运行 1 个循环)
2.74 s ± 182 ms 每次循环(7 次运行的平均值 ± 标准差,每次运行 1 个循环)
4.58 s ± 193 ms 每次循环(7 次运行的平均值 ± 标准差,每次运行 1 个循环)
2.64 s ± 65 ms 每次循环(7 次运行的平均值 ± 标准差,每次运行 1 个循环)
您能分享一下您的 dask 分块配置和数据数组的形状吗?
`_calc_slope` 需要两个参数 x 和 y,它们应该是两个不同的数据集,但您似乎只给出了一个?obj 或 obs 的结构是什么?我是新手,希望在我的代码中应用它,但不太了解如何构造它。谢谢。
xarray 会向 `_calc_slope` 传递两个参数:`time_nums` 和 `obj` 的一个切片。
`obj` 本身是一个 xarray Dataset 或 DataArray,它至少包含 time 维度(也可以包含更多维度)。
import xarray as xr
import numpy as np
from scipy import stats
import dask.array as dsa
# Synthetic test case: a 4-D field whose value along 't' follows a known
# straight line (slope 10, intercept 5) plus uniform random noise, so the
# trend functions below have a known answer to recover.
slope = 10
intercept = 5
# Coordinate vectors; 't' is the axis the trend is fitted along.
t = np.arange(250)
x = np.arange(10)
y = np.arange(500)
z = np.arange(200)
# Chunk sizes along 'x' and 'y' only; 'z' and 't' are not listed here.
chunks = {'x':10, 'y':10}
# NOTE(review): each full-size array below has 10*500*200*250 = 2.5e8
# float64 elements (~2 GB); noise, ones, time and data are all held in
# memory at once. Broadcasting ``t`` directly would avoid ``ones``/``time``.
noise = np.random.random([len(x), len(y), len(z), len(t)])
ones = np.ones_like(noise)
# Broadcast ``t`` along the last axis so every grid point sees 0..249.
time = ones*t
data = (time*slope+intercept)+noise
da = xr.DataArray(data, dims=['x', 'y', 'z', 't'],
coords={'x':('x', x),
'y':('y', y),
'z':('z', z),
't':('t', t)})
# Convert to a dask-backed array using the chunk sizes defined above.
da = da.chunk(chunks)
# Bare expression: displays the array repr when run as a notebook cell.
da
def _calc_slope_poly(y):
    """Slope of a first-degree polynomial fit of ``y`` over indices 0..len(y)-1.

    Kernel used by the ``*_poly`` trend functions.
    """
    index = np.arange(len(y))
    coefficients = np.polyfit(index, y, 1)
    slope = coefficients[0]  # highest-degree coefficient comes first
    return slope
def _calc_slope(y):
    """Slope of the linear regression of ``y`` over indices 0..len(y)-1."""
    index = np.arange(len(y))
    result = stats.linregress(index, y)
    slope = result[0]  # first element of the regression result
    return slope
def linear_trend_along(da, dim):
    """Return the per-grid-point linear-regression slope along ``dim``.

    Each 1-D series along ``dim`` is passed to ``_calc_slope`` by
    ``dask.array.apply_along_axis``.

    Parameters
    ----------
    da : xarray.DataArray
        Input array containing the dimension ``dim``.
    dim : str
        Name of the dimension to fit along.

    Returns
    -------
    dask array
        Slopes only (the intercept is not returned), with the ``dim`` axis
        removed.  This is the raw dask array, not an xarray object.
    """
    axis_num = da.get_axis_num(dim)
    # ``da`` is only read, so it is not copied first.
    return dsa.apply_along_axis(_calc_slope, axis_num, da.data)
def linear_trend_ufunc(obj, dim):
    """Return the linear-regression slope along ``dim`` using ``xr.apply_ufunc``.

    ``_calc_slope`` receives each 1-D series along ``dim``; all other
    dimensions are looped over by ``vectorize=True``.

    Parameters
    ----------
    obj : xarray.Dataset or xarray.DataArray
        Object containing the dimension ``dim``.
    dim : str
        Name of the dimension to fit along.

    Returns
    -------
    Same container type as ``obj`` with ``dim`` reduced away, holding the
    slope for each remaining point.
    """
    # The ``np.float`` alias no longer exists (deprecated in NumPy 1.20,
    # removed in 1.24); builtin ``float`` is the identical dtype.
    return xr.apply_ufunc(
        _calc_slope, obj,
        vectorize=True,
        input_core_dims=[[dim]],
        output_core_dims=[[]],
        output_dtypes=[float],
        dask='parallelized',
    )
def linear_trend_ufunc_poly(obj, dim):
    """Return the ``np.polyfit`` slope along ``dim`` using ``xr.apply_ufunc``.

    ``_calc_slope_poly`` receives each 1-D series along ``dim``; all other
    dimensions are looped over by ``vectorize=True``.

    Parameters
    ----------
    obj : xarray.Dataset or xarray.DataArray
        Object containing the dimension ``dim``.
    dim : str
        Name of the dimension to fit along.

    Returns
    -------
    Same container type as ``obj`` with ``dim`` reduced away, holding the
    slope for each remaining point.
    """
    # The ``np.float`` alias no longer exists (deprecated in NumPy 1.20,
    # removed in 1.24); builtin ``float`` is the identical dtype.
    return xr.apply_ufunc(
        _calc_slope_poly, obj,
        vectorize=True,
        input_core_dims=[[dim]],
        output_core_dims=[[]],
        output_dtypes=[float],
        dask='parallelized',
    )
def linear_trend_along_poly(da, dim):
    """Return the per-grid-point ``np.polyfit`` slope along ``dim``.

    Each 1-D series along ``dim`` is passed to ``_calc_slope_poly`` by
    ``dask.array.apply_along_axis``.

    Parameters
    ----------
    da : xarray.DataArray
        Input array containing the dimension ``dim``.
    dim : str
        Name of the dimension to fit along.

    Returns
    -------
    dask array
        Slopes only (the intercept is not returned), with the ``dim`` axis
        removed.  This is the raw dask array, not an xarray object.
    """
    axis_num = da.get_axis_num(dim)
    # ``da`` is only read, so it is not copied first.
    return dsa.apply_along_axis(_calc_slope_poly, axis_num, da.data)
# Build the four trend estimates along 't' for the chunked array ``da``:
# apply_ufunc vs. apply_along_axis, each with the linregress kernel and
# with the polyfit kernel. They are evaluated later via ``.compute()``.
trend_ufunc = linear_trend_ufunc(da, 't')
trend_ufunc_poly = linear_trend_ufunc_poly(da, 't')
trend_along = linear_trend_along(da, 't')
trend_along_poly = linear_trend_along_poly(da, 't')
%%timeit
print(trend_ufunc[1,1,1].data.compute())
%%timeit
trend_ufunc_poly[1,1,1].compute()
%%timeit
trend_along[1,1,1].compute()
%%timeit
trend_along_poly[1,1,1].compute()