Pandas Dask集群:AttributeError:'DataFrame' 对象没有属性 '_data'
标签:pandas, dockerfile, dask, dask-dataframe
我在GCP上使用Dask集群。我正在使用以下代码部署它:
from dask_cloudprovider.gcp import GCPCluster
from dask.distributed import Client
# Environment variables handed to the cluster via ``env_vars``.
# NOTE(review): presumably the daskdev/dask image pip-installs
# EXTRA_PIP_PACKAGES at container start-up, and gcsfs is what lets the
# workers read gs:// paths -- confirm against the image documentation.
enviroment_vars = {
    'EXTRA_PIP_PACKAGES': '"gcsfs"'
}

# Launch a 32-worker Dask cluster on GCP from a pinned Dask image.
# NOTE(review): the image tag pins the worker-side library versions. An
# AttributeError such as "'DataFrame' object has no attribute '_data'"
# usually points to a pandas version mismatch between this client
# environment and the workers -- verify with
# ``client.get_versions(check=True)``.
cluster = GCPCluster(
    n_workers=32,
    docker_image='daskdev/dask:2021.2.0',
    env_vars=enviroment_vars,
    network='my-network',
    #filesystem_size=150,
    machine_type='e2-standard-16',
    projectid='my-project-id',
    zone='us-central1-a',
    on_host_maintenance="MIGRATE",
)  # closing parenthesis was missing in the original snippet (SyntaxError)

# Connect this session to the cluster's scheduler.
client = Client(cluster)
然后我读取csv文件,代码如下:
import dask.dataframe as dd
import csv
# Explicit dtype for every CSV column, passed straight to the reader.
col_dtypes = dict(
    var1='float64',
    var2='object',
    var3='object',
    var4='float64',
)

# Read every file matching the wildcard with blocksize=None, then persist
# the resulting collection on the cluster.
df = dd.read_csv(
    'gs://my_bucket/files-*.csv',
    blocksize=None,
    dtype=col_dtypes,
).persist()
一切都很好,但当我尝试进行一些查询或计算时,会出现错误。例如,这段代码:
# Count the occurrences of each distinct value in column var1 and pull the
# result back to the client; this is the call that raises the error below.
df.var1.value_counts().compute()
这是输出:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-14-711a7c21ed42> in <module>
----> 1 df.var1.value_counts().compute()
/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(self, **kwargs)
279 dask.base.compute
280 """
--> 281 (result,) = compute(self, traverse=False, **kwargs)
282 return result
283
/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
561 postcomputes.append(x.__dask_postcompute__())
562
--> 563 results = schedule(dsk, keys, **kwargs)
564 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
565
/opt/conda/lib/python3.8/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2653 should_rejoin = False
2654 try:
-> 2655 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2656 finally:
2657 for f in futures.values():
/opt/conda/lib/python3.8/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1962 else:
1963 local_worker = None
-> 1964 return self.sync(
1965 self._gather,
1966 futures,
/opt/conda/lib/python3.8/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
836 return future
837 else:
--> 838 return sync(
839 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
840 )
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
338 if error[0]:
339 typ, exc, tb = error[0]
--> 340 raise exc.with_traceback(tb)
341 else:
342 return result[0]
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in f()
322 if callback_timeout is not None:
323 future = asyncio.wait_for(future, callback_timeout)
--> 324 result[0] = yield future
325 except Exception as exc:
326 error[0] = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1827 exc = CancelledError(key)
1828 else:
-> 1829 raise exception.with_traceback(traceback)
1830 raise exc
1831 if errors == "skip":
/opt/conda/lib/python3.8/site-packages/dask/optimization.py in __call__()
961 if not len(args) == len(self.inkeys):
962 raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 963 return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
964
965 def __reduce__(self):
/opt/conda/lib/python3.8/site-packages/dask/core.py in get()
149 for key in toposort(dsk):
150 task = dsk[key]
--> 151 result = _execute_task(task, cache)
152 cache[key] = result
153 result = _execute_task(out, cache)
/opt/conda/lib/python3.8/site-packages/dask/core.py in _execute_task()
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/opt/conda/lib/python3.8/site-packages/dask/utils.py in apply()
33 def apply(func, args, kwargs=None):
34 if kwargs:
---> 35 return func(*args, **kwargs)
36 else:
37 return func(*args)
/opt/conda/lib/python3.8/site-packages/dask/dataframe/core.py in apply_and_enforce()
5474 return meta
5475 if is_dataframe_like(df):
-> 5476 check_matching_columns(meta, df)
5477 c = meta.columns
5478 else:
/opt/conda/lib/python3.8/site-packages/dask/dataframe/utils.py in check_matching_columns()
690 def check_matching_columns(meta, actual):
691 # Need nan_to_num otherwise nan comparison gives False
--> 692 if not np.array_equal(np.nan_to_num(meta.columns), np.nan_to_num(actual.columns)):
693 extra = methods.tolist(actual.columns.difference(meta.columns))
694 missing = methods.tolist(meta.columns.difference(actual.columns))
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
5268 or name in self._accessors
5269 ):
-> 5270 return object.__getattribute__(self, name)
5271 else:
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__get__()
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
5268 or name in self._accessors
5269 ):
-> 5270 return object.__getattribute__(self, name)
5271 else:
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
AttributeError: 'DataFrame' object has no attribute '_data'
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-14-711a7c21ed42> in <module>
---->1 df.var1.value_counts().compute()
/计算中的opt/conda/lib/python3.8/site-packages/dask/base.py(self,**kwargs)
279 dask.base.compute
280 """
-->281(结果,)=compute(自我,遍历=False,**kwargs)
282返回结果
283
/compute中的opt/conda/lib/python3.8/site-packages/dask/base.py(*args,**kwargs)
561 postcomputes.append(x.\uuuu dask\u postcompute\uuuu())
562
-->563结果=进度表(dsk、键、**kwargs)
564返回重新打包([f(r,*a)用于r,(f,a)压缩(结果,邮政编码)])
565
/get中的opt/conda/lib/python3.8/site-packages/distributed/client.py(self、dsk、key、worker、allow\u其他worker、resources、sync、asynchronous、direct、retries、priority、fifo\u timeout、actors、**kwargs)
2653应重新加入=错误
2654尝试:
->2655结果=自聚集(打包、异步=异步、直接=直接)
2656最后:
2657对于期货中的f.values():
/gather中的opt/conda/lib/python3.8/site-packages/distributed/client.py(self、futures、errors、direct、asynchronous)
其他:
1963本地工人=无
->1964返回自我同步(
1965年,赛尔夫,
1966年期货,
/opt/conda/lib/python3.8/site-packages/distributed/client.py同步(self、func、异步、回调超时、*args、**kwargs)
836回归未来
837其他:
-->838返回同步(
839 self.loop,func,*args,callback\u timeout=callback\u timeout,**kwargs
840 )
/opt/conda/lib/python3.8/site-packages/distributed/utils.py处于同步状态(循环、函数、回调超时、*args、**kwargs)
338如果错误[0]:
339典型、exc、tb=错误[0]
-->340带回溯的提升exc(tb)
341其他:
342返回结果[0]
/f()中的opt/conda/lib/python3.8/site-packages/distributed/utils.py
322如果回调超时不是无:
323 future=asyncio.wait\u for(future,回调\u超时)
-->324结果[0]=未来收益率
325例外情况除外,作为exc:
326错误[0]=sys.exc_info()
/运行中的opt/conda/lib/python3.8/site-packages/tornado/gen.py(self)
760
761尝试:
-->762 value=future.result()
763例外情况除外:
764 exc_info=sys.exc_info()
/opt/conda/lib/python3.8/site-packages/distributed/client.py in_-gather(self、futures、errors、direct、local_-worker)
1827 exc=取消错误(键)
1828其他:
->1829 raise异常。带_回溯(回溯)
1830年
1831如果错误==“跳过”:
/opt/conda/lib/python3.8/site-packages/dask/optimization.py
961如果不是len(args)=len(self.inkeys):
962 raise VALUELERROR(“应为%d个参数,获得%d个”%(len(self.inkeys),len(args)))
-->963返回core.get(self.dsk、self.outkey、dict(zip(self.inkeys、args)))
964
965定义减少(自):
/get()中的opt/conda/lib/python3.8/site-packages/dask/core.py
149用于输入拓扑排序(dsk):
150任务=dsk[键]
-->151结果=_执行_任务(任务,缓存)
152缓存[键]=结果
153结果=_执行_任务(输出,缓存)
/opt/conda/lib/python3.8/site-packages/dask/core.py in_execute_task()
119#临时机构按其引用计数,并可执行某些
120#运营到位。
-->121 return func(*(在args中为a执行任务(a,缓存))
122 elif不可用(arg):
123返回参数
/apply()中的opt/conda/lib/python3.8/site-packages/dask/utils.py
33 def应用(func、args、kwargs=无):
34如果kwargs:
--->35返回函数(*args,**kwargs)
36.其他:
37返回函数(*args)
/opt/conda/lib/python3.8/site-packages/dask/dataframe/core.py在apply_和_-exforce()中
5474返回元
5475如果是数据帧(df):
->5476检查匹配列(meta、df)
5477 c=meta.columns
5478其他:
/检查匹配列()中的opt/conda/lib/python3.8/site-packages/dask/dataframe/utils.py
690 def检查匹配列(元、实际):
691#需要nan_to_num,否则nan比较结果为False
-->692如果不是np.array_equal(np.nan_to_num(meta.columns),np.nan_to_num(actual.columns)):
693 extra=methods.tolist(实际的.columns.difference(meta.columns))
694 missing=methods.tolist(meta.columns.difference(actual.columns))
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in
5268或名称在self.\u访问器中
5269 ):
->5270返回对象。\uuuu getattribute\uuuuu(self,name)
5271其他:
5272如果自身信息轴可容纳标识符且容纳名称(名称):
pandas/_libs/properties.pyx在pandas._libs.properties.AxisProperty.uuuuu get_uuuuuu()
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in
5268或名称在self.\u访问器中
5269 ):
->5270返回对象。\uuuu getattribute\uuuuu(self,name)
5271其他: