从CSV延迟加载dask数据帧(内部延迟)
标签:dask、dask-distributed、dask-delayed
在使用dask.distributed时,我试图在延迟(delayed)函数中从S3上的CSV加载dask数据帧,如下所示:
@delayed
def func1():
    ...
    return df.read_csv(*s3_url*, ...)
read_csv()不需要与分布式客户端交互,所以我认为这是可能的。然后在客户机上计算func1返回的延迟对象
到这一步为止一切看起来都正常,打印结果如下:
Dask DataFrame Structure:
COL1 COL2
npartitions=9
object object
... ...
... ... ...
... ...
... ...
Dask Name: from-delayed, 27 tasks
但是,当我尝试进一步处理该数据帧时,它会失败并报错:Failed to serialize (<dask.bytes.core.OpenFile object at ...>, ..., ..., '\n'). Exception: can't pickle thread.lock objects,例如:
client.compute(frame)
有没有办法让这种做法行得通,还是我忽略了某个更根本的限制?
PS.我得到的错误日志:
.pickle - Failed to serialize (<dask.bytes.core.OpenFile object at ...>, 20971520, 10485760, '\n'). Exception: can't pickle thread.lock objects
ERROR:2017-11-10 15:31:31:root:Exception while executing graph: can't pickle thread.lock objects
Traceback (most recent call last):
...
client.compute(res.data)
File ".../python2.7/site-packages/distributed/client.py", line 2089, in compute
resources=resources)
File ".../python2.7/site-packages/distributed/client.py", line 1906, in _graph_to_futures
'tasks': valmap(dumps_task, dsk3),
File ".../python2.7/site-packages/toolz-0.8.2-py2.7.egg/toolz/dicttoolz.py", line 84, in valmap
rv.update(zip(iterkeys(d), map(func, itervalues(d))))
File ".../python2.7/site-packages/distributed/worker.py", line 731, in dumps_task
'args': pickle.dumps(task[1:])}
File ".../python2.7/site-packages/distributed/protocol/pickle.py", line 51, in dumps
return cloudpickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 829, in dumps
cp.dump(obj)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 233, in dump
return Pickler.dump(self, obj)
File "...python2.7/pickle.py", line 224, in dump
self.save(obj)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 568, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 564, in save_instancemethod
obj=obj)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 709, in save_reduce
save(args)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 554, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 692, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 554, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 306, in save
rv = reduce(self.proto)
TypeError: can't pickle thread.lock objects
.pickle - Failed to serialize (<dask.bytes.core.OpenFile object at ...>, 20971520, 10485760, '\n'). Exception: can't pickle thread.lock objects
ERROR:2017-11-10 15:31:31:root:Exception while executing graph: can't pickle thread.lock objects
Traceback (most recent call last):
...
client.compute(res.data)
File ".../python2.7/site-packages/distributed/client.py", line 2089, in compute
resources=resources)
File ".../python2.7/site-packages/distributed/client.py", line 1906, in _graph_to_futures
'tasks': valmap(dumps_task, dsk3),
File ".../python2.7/site-packages/toolz-0.8.2-py2.7.egg/toolz/dicttoolz.py", line 84, in valmap
rv.update(zip(iterkeys(d), map(func, itervalues(d))))
File ".../python2.7/site-packages/distributed/worker.py", line 731, in dumps_task
'args': pickle.dumps(task[1:])}
File ".../python2.7/site-packages/distributed/protocol/pickle.py", line 51, in dumps
return cloudpickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 829, in dumps
cp.dump(obj)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 233, in dump
return Pickler.dump(self, obj)
File "...python2.7/pickle.py", line 224, in dump
self.save(obj)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 568, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 564, in save_instancemethod
obj=obj)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 709, in save_reduce
save(args)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 554, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 692, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 554, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 306, in save
rv = reduce(self.proto)
TypeError: can't pickle thread.lock objects
作为后续,我尝试先在worker上发布命名数据集(published dataset),再从客户端获取该数据集,以此绕过这个问题,但到目前为止仍然得到相同的错误。问题不一定与在delayed()内部执行有关,因为在本地运行时也会发生:
.pickle - Failed to serialize (<dask.bytes.core.OpenFile object at ...>, 20971520, 10485760, '\n'). Exception: can't pickle thread.lock objects
ERROR:2017-11-10 15:31:31:root:Exception while executing graph: can't pickle thread.lock objects
Traceback (most recent call last):
...
client.compute(res.data)
File ".../python2.7/site-packages/distributed/client.py", line 2089, in compute
resources=resources)
File ".../python2.7/site-packages/distributed/client.py", line 1906, in _graph_to_futures
'tasks': valmap(dumps_task, dsk3),
File ".../python2.7/site-packages/toolz-0.8.2-py2.7.egg/toolz/dicttoolz.py", line 84, in valmap
rv.update(zip(iterkeys(d), map(func, itervalues(d))))
File ".../python2.7/site-packages/distributed/worker.py", line 731, in dumps_task
'args': pickle.dumps(task[1:])}
File ".../python2.7/site-packages/distributed/protocol/pickle.py", line 51, in dumps
return cloudpickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 829, in dumps
cp.dump(obj)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 233, in dump
return Pickler.dump(self, obj)
File "...python2.7/pickle.py", line 224, in dump
self.save(obj)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 568, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 564, in save_instancemethod
obj=obj)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 709, in save_reduce
save(args)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 554, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 692, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 554, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 306, in save
rv = reduce(self.proto)
TypeError: can't pickle thread.lock objects