Pandas Dask集群:AttributeError:'DataFrame' 对象没有属性 '_data'

Pandas Dask集群:AttributeError:'DataFrame' 对象没有属性 '_data',pandas,dockerfile,dask,dask-dataframe,Pandas,Dockerfile,Dask,Dask Dataframe,我在GCP上使用Dask集群。我正在使用以下代码部署它: from dask_cloudprovider.gcp import GCPCluster from dask.distributed import Client enviroment_vars = { 'EXTRA_PIP_PACKAGES': '"gcsfs"' } cluster = GCPCluster( n_workers=32, docker_image='daskdev/das

我在GCP上使用Dask集群。我正在使用以下代码部署它:

from dask_cloudprovider.gcp import GCPCluster
from dask.distributed import Client

# Environment passed to every scheduler/worker container.
# Presumably EXTRA_PIP_PACKAGES makes the daskdev/dask image pip-install the
# listed packages at start-up (gcsfs for gs:// paths) - confirm in the
# image documentation.
enviroment_vars = {
    'EXTRA_PIP_PACKAGES': '"gcsfs"'
}

# NOTE(review): the image tag pins the library versions used on the workers.
# The client environment should run matching dask/distributed/pandas
# versions; a pandas mismatch between client and workers is a plausible
# cause of "'DataFrame' object has no attribute '_data'" - confirm with
# client.get_versions(check=True).
cluster = GCPCluster(
    n_workers=32,
    docker_image='daskdev/dask:2021.2.0',
    env_vars=enviroment_vars,
    network='my-network',
    #filesystem_size=150,
    machine_type='e2-standard-16',
    projectid='my-project-id',
    zone='us-central1-a',
    on_host_maintenance="MIGRATE",
)  # the original snippet never closed this call, which is a SyntaxError

# Connect this session to the scheduler of the cluster created above.
client = Client(cluster)
然后我读取csv文件,代码如下:

import dask.dataframe as dd
import csv

# Explicit dtype for each CSV column, passed to read_csv so that dtypes are
# not inferred.
col_dtypes = dict(
    var1='float64',
    var2='object',
    var3='object',
    var4='float64',
)

# Read every CSV matching the glob from the bucket, then persist the result
# on the cluster.
# blocksize=None presumably yields one partition per file - confirm against
# the dask read_csv documentation.
df = dd.read_csv(
    'gs://my_bucket/files-*.csv',
    blocksize=None,
    dtype=col_dtypes,
).persist()
一切都很好,但当我尝试进行一些查询或计算时,会出现错误。例如,这段代码:

# Count occurrences of each value in column var1 and fetch the result;
# this is the call that raises the AttributeError shown in the traceback.
df.var1.value_counts().compute()
这是输出:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-14-711a7c21ed42> in <module>
----> 1 df.var1.value_counts().compute()

/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(self, **kwargs)
    279         dask.base.compute
    280         """
--> 281         (result,) = compute(self, traverse=False, **kwargs)
    282         return result
    283 

/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
    561         postcomputes.append(x.__dask_postcompute__())
    562 
--> 563     results = schedule(dsk, keys, **kwargs)
    564     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    565 

/opt/conda/lib/python3.8/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
   2653                     should_rejoin = False
   2654             try:
-> 2655                 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
   2656             finally:
   2657                 for f in futures.values():

/opt/conda/lib/python3.8/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
   1962             else:
   1963                 local_worker = None
-> 1964             return self.sync(
   1965                 self._gather,
   1966                 futures,

/opt/conda/lib/python3.8/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    836             return future
    837         else:
--> 838             return sync(
    839                 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
    840             )

/opt/conda/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    338     if error[0]:
    339         typ, exc, tb = error[0]
--> 340         raise exc.with_traceback(tb)
    341     else:
    342         return result[0]

/opt/conda/lib/python3.8/site-packages/distributed/utils.py in f()
    322             if callback_timeout is not None:
    323                 future = asyncio.wait_for(future, callback_timeout)
--> 324             result[0] = yield future
    325         except Exception as exc:
    326             error[0] = sys.exc_info()

/opt/conda/lib/python3.8/site-packages/tornado/gen.py in run(self)
    760 
    761                     try:
--> 762                         value = future.result()
    763                     except Exception:
    764                         exc_info = sys.exc_info()

/opt/conda/lib/python3.8/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
   1827                             exc = CancelledError(key)
   1828                         else:
-> 1829                             raise exception.with_traceback(traceback)
   1830                         raise exc
   1831                     if errors == "skip":

/opt/conda/lib/python3.8/site-packages/dask/optimization.py in __call__()
    961         if not len(args) == len(self.inkeys):
    962             raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 963         return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
    964 
    965     def __reduce__(self):

/opt/conda/lib/python3.8/site-packages/dask/core.py in get()
    149     for key in toposort(dsk):
    150         task = dsk[key]
--> 151         result = _execute_task(task, cache)
    152         cache[key] = result
    153     result = _execute_task(out, cache)

/opt/conda/lib/python3.8/site-packages/dask/core.py in _execute_task()
    119         # temporaries by their reference count and can execute certain
    120         # operations in-place.
--> 121         return func(*(_execute_task(a, cache) for a in args))
    122     elif not ishashable(arg):
    123         return arg

/opt/conda/lib/python3.8/site-packages/dask/utils.py in apply()
     33 def apply(func, args, kwargs=None):
     34     if kwargs:
---> 35         return func(*args, **kwargs)
     36     else:
     37         return func(*args)

/opt/conda/lib/python3.8/site-packages/dask/dataframe/core.py in apply_and_enforce()
   5474             return meta
   5475         if is_dataframe_like(df):
-> 5476             check_matching_columns(meta, df)
   5477             c = meta.columns
   5478         else:

/opt/conda/lib/python3.8/site-packages/dask/dataframe/utils.py in check_matching_columns()
    690 def check_matching_columns(meta, actual):
    691     # Need nan_to_num otherwise nan comparison gives False
--> 692     if not np.array_equal(np.nan_to_num(meta.columns), np.nan_to_num(actual.columns)):
    693         extra = methods.tolist(actual.columns.difference(meta.columns))
    694         missing = methods.tolist(meta.columns.difference(actual.columns))

/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
   5268             or name in self._accessors
   5269         ):
-> 5270             return object.__getattribute__(self, name)
   5271         else:
   5272             if self._info_axis._can_hold_identifiers_and_holds_name(name):

pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__get__()

/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
   5268             or name in self._accessors
   5269         ):
-> 5270             return object.__getattribute__(self, name)
   5271         else:
   5272             if self._info_axis._can_hold_identifiers_and_holds_name(name):

AttributeError: 'DataFrame' object has no attribute '_data'
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-14-711a7c21ed42> in <module>
----> 1 df.var1.value_counts().compute()
/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(self, **kwargs)
    279         dask.base.compute
    280         """
--> 281         (result,) = compute(self, traverse=False, **kwargs)
    282         return result
    283
/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
    561         postcomputes.append(x.__dask_postcompute__())
    562
--> 563     results = schedule(dsk, keys, **kwargs)
    564     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    565
/opt/conda/lib/python3.8/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
   2653                     should_rejoin = False
   2654             try:
-> 2655                 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
   2656             finally:
   2657                 for f in futures.values():
/opt/conda/lib/python3.8/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
   1962             else:
   1963                 local_worker = None
-> 1964             return self.sync(
   1965                 self._gather,
   1966                 futures,
/opt/conda/lib/python3.8/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    836             return future
    837         else:
--> 838             return sync(
    839                 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
    840             )
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    338     if error[0]:
    339         typ, exc, tb = error[0]
--> 340         raise exc.with_traceback(tb)
    341     else:
    342         return result[0]
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in f()
    322             if callback_timeout is not None:
    323                 future = asyncio.wait_for(future, callback_timeout)
--> 324             result[0] = yield future
    325         except Exception as exc:
    326             error[0] = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/tornado/gen.py in run(self)
    760
    761                     try:
--> 762                         value = future.result()
    763                     except Exception:
    764                         exc_info = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
   1827                             exc = CancelledError(key)
   1828                         else:
-> 1829                             raise exception.with_traceback(traceback)
   1830                         raise exc
   1831                     if errors == "skip":
/opt/conda/lib/python3.8/site-packages/dask/optimization.py in __call__()
    961         if not len(args) == len(self.inkeys):
    962             raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 963         return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
    964
    965     def __reduce__(self):
/opt/conda/lib/python3.8/site-packages/dask/core.py in get()
    149     for key in toposort(dsk):
    150         task = dsk[key]
--> 151         result = _execute_task(task, cache)
    152         cache[key] = result
    153     result = _execute_task(out, cache)
/opt/conda/lib/python3.8/site-packages/dask/core.py in _execute_task()
    119         # temporaries by their reference count and can execute certain
    120         # operations in-place.
--> 121         return func(*(_execute_task(a, cache) for a in args))
    122     elif not ishashable(arg):
    123         return arg
/opt/conda/lib/python3.8/site-packages/dask/utils.py in apply()
     33 def apply(func, args, kwargs=None):
     34     if kwargs:
---> 35         return func(*args, **kwargs)
     36     else:
     37         return func(*args)
/opt/conda/lib/python3.8/site-packages/dask/dataframe/core.py in apply_and_enforce()
   5474             return meta
   5475         if is_dataframe_like(df):
-> 5476             check_matching_columns(meta, df)
   5477             c = meta.columns
   5478         else:
/opt/conda/lib/python3.8/site-packages/dask/dataframe/utils.py in check_matching_columns()
    690 def check_matching_columns(meta, actual):
    691     # Need nan_to_num otherwise nan comparison gives False
--> 692     if not np.array_equal(np.nan_to_num(meta.columns), np.nan_to_num(actual.columns)):
    693         extra = methods.tolist(actual.columns.difference(meta.columns))
    694         missing = methods.tolist(meta.columns.difference(actual.columns))
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
   5268             or name in self._accessors
   5269         ):
-> 5270             return object.__getattribute__(self, name)
   5271         else:
   5272             if self._info_axis._can_hold_identifiers_and_holds_name(name):
pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__get__()
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
   5268             or name in self._accessors
   5269         ):
-> 5270             return object.__getattribute__(self, name)
   5271         else: