Python Dask数据帧写入csv不工作

Python Dask数据帧写入csv不工作,python,python-3.x,pandas,dask,Python,Python 3.x,Pandas,Dask,我必须比较两个大型CSV并将数据输出到CSV。我用过熊猫,但它显示了记忆警告。现在使用Dask数据帧读取和合并,然后输出到CSV。但它坚持在15%,什么也没发生。这是我的密码 将熊猫作为pd导入 将dask.dataframe作为dd导入 从dask.diagnostics导入进度条 pbar=ProgressBar() pbar.register() 数据类型={'B_Number':'float64','Real_Length':'Int64} df=dd.read_csv(“./docs

我必须比较两个大型CSV并将数据输出到CSV。我用过熊猫,但它显示了记忆警告。现在使用Dask数据帧读取和合并,然后输出到CSV。但它坚持在15%,什么也没发生。这是我的密码

将熊猫作为pd导入
将dask.dataframe作为dd导入
从dask.diagnostics导入进度条
pbar=ProgressBar()
pbar.register()
数据类型={'B_Number':'float64','Real_Length':'Int64}
df=dd.read_csv(“./docs/Turk_CDR.csv”,parse_dates={'datetime':[0,1]},dtype=dtypes)
df1=dd.read\u csv(“./docs/Test.csv”,parse\u dates={'datetime':[0,1]},dtype=dtypes)
将numpy作为np导入
df['B_Number']=df['B_Number'].astype(np.64)
df1['B_编号']=df1['B_编号'].aType(np.64)
new_df=dd.merge(df,df1,how='outer',left_on=['datetime','B_Number'],right_on=['datetime','B_Number'])
新_df.info()
[##########]30%完成| 5分钟44.8秒
---------------------------------------------------------------------------
AttributeError回溯(最近一次呼叫上次)
在里面
---->1个新的文件到csv(“./test.csv”,mode='a',chunksize=100000,compression='gzip'))
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\dataframe\core.py-in-to_csv(self,filename,**kwargs)
1297从.io导入到_csv
1298
->1299返回到_csv(self,filename,**kwargs)
1300
1301 def to_json(self,filename,*args,**kwargs):
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\dataframe\io\csv.py in to_csv(df、文件名、函数名、压缩、计算、调度程序、存储选项、仅头文件第一分区、**kwargs)
759
760如果计算:
-->761延迟(值).compute(调度程序=调度程序)
762返回[f.文件中f的路径]
763其他:
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\base.py in compute(self,**kwargs)
173 dask.base.compute
174         """
-->175(结果,)=compute(自我,遍历=False,**kwargs)
176返回结果
177
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\base.py in compute(*args,**kwargs)
444个键=[x.\u dask\u keys\u()表示集合中的x]
445 postcomputes=[x.\u dask\u postcompute\uuuux()用于集合中的x]
-->446结果=进度表(dsk、键、**kwargs)
447返回重新打包([f(r,*a)用于r,(f,a)压缩(结果,邮政编码)])
448
get中的F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\threaded.py(dsk、结果、缓存、num_worker、池、**kwargs)
80 get\u id=\u线程\u get\u id,
81打包例外=打包例外,
--->82**夸尔格
83     )
84
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\local.py在get\u async中(应用\u async、num\u worker、dsk、结果、缓存、get\u id、在本地重新运行\u异常、打包\u异常、引发\u异常、回调、转储、加载、**kwargs)
489 _执行_任务(任务、数据)#在本地重新执行
490其他:
-->491 raise_异常(exc,tb)
492 res,工人id=负载(res\U信息)
493状态[“缓存”][key]=res
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\compatibility.py in reraise(exc,tb)
128如果exc.uu回溯不是tb:
129带回溯的提升执行(tb)
-->130升exc
131
132进口腌菜作为包装
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\local.py in execute\u task(关键字、任务信息、转储、加载、获取标识、包异常)
231试试:
232任务,数据=负载(任务信息)
-->233结果=_执行_任务(任务,数据)
234 id=get_id()
235结果=转储((结果,id))
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\core.py in\u execute\u task(arg、cache、dsk)
116 elif istask(arg):
117 func,args=arg[0],arg[1:]
-->118 args2=[[为args中的a执行任务(a,缓存)]
119返回函数(*args2)
120 elif不可登录(arg):
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\core.py in(.0)
116 elif istask(arg):
117 func,args=arg[0],arg[1:]
-->118 args2=[[为args中的a执行任务(a,缓存)]
119返回函数(*args2)
120 elif不可登录(arg):
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\core.py in\u execute\u task(arg、cache、dsk)
116 elif istask(arg):
117 func,args=arg[0],arg[1:]
-->118 args2=[[为args中的a执行任务(a,缓存)]
119返回函数(*args2)
120 elif不可登录(arg):
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\core.py in(.0)
116 elif istask(arg):
117 func,args=arg[0],arg[1:]
-->118 args2=[[为args中的a执行任务(a,缓存)]
119返回函数(*args2)
120 elif不可登录(arg):
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\core.py in\u execute\u task(arg、cache、dsk)
116 elif istask(arg):
117 func,args=arg[0],arg[1:]
-->118 args2=[[为args中的a执行任务(a,缓存)]
119返回函数(*args2)
120 elif不可登录(arg):
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\core.py in(.0)
116 elif istask(arg):
117 func,args=arg[0],arg[1:]
-->118 args2=[[为args中的a执行任务(a,缓存)]
119返回函数(*args2)
120 elif不可登录(arg):
F:\python\WPy64-3740\python-3.7.4.amd64\lib\site packages\dask\core.py in\u execute\u task(arg、cache、dsk)
113     """
114如果存在(参数,列表):
-->115 return[_execute_arg中a的任务(a,缓存)]
116 elif istask(arg):
117
<class 'dask.dataframe.core.DataFrame'>
Columns: 4 entries, datetime to Real_Length_y
dtypes: Int64(2), datetime64[ns](1), float64(1)
[############                            ] | 30% Completed |  5min 44.8s



---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-3-fe5dee9d0fbf> in <module>
----> 1 new_df.to_csv("./test.csv",mode='a', chunksize=100000, compression = 'gzip')


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\dataframe\core.py in to_csv(self, filename, **kwargs)
   1297         from .io import to_csv
   1298 
-> 1299         return to_csv(self, filename, **kwargs)
   1300 
   1301     def to_json(self, filename, *args, **kwargs):


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\dataframe\io\csv.py in to_csv(df, filename, name_function, compression, compute, scheduler, storage_options, header_first_partition_only, **kwargs)
    759 
    760     if compute:
--> 761         delayed(values).compute(scheduler=scheduler)
    762         return [f.path for f in files]
    763     else:


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\base.py in compute(self, **kwargs)
    173         dask.base.compute
    174         """
--> 175         (result,) = compute(self, traverse=False, **kwargs)
    176         return result
    177 


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\base.py in compute(*args, **kwargs)
    444     keys = [x.__dask_keys__() for x in collections]
    445     postcomputes = [x.__dask_postcompute__() for x in collections]
--> 446     results = schedule(dsk, keys, **kwargs)
    447     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    448 


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
     80         get_id=_thread_get_id,
     81         pack_exception=pack_exception,
---> 82         **kwargs
     83     )
     84 


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
    489                         _execute_task(task, data)  # Re-execute locally
    490                     else:
--> 491                         raise_exception(exc, tb)
    492                 res, worker_id = loads(res_info)
    493                 state["cache"][key] = res


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\compatibility.py in reraise(exc, tb)
    128         if exc.__traceback__ is not tb:
    129             raise exc.with_traceback(tb)
--> 130         raise exc
    131 
    132     import pickle as cPickle


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
    231     try:
    232         task, data = loads(task_info)
--> 233         result = _execute_task(task, data)
    234         id = get_id()
    235         result = dumps((result, id))


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
    116     elif istask(arg):
    117         func, args = arg[0], arg[1:]
--> 118         args2 = [_execute_task(a, cache) for a in args]
    119         return func(*args2)
    120     elif not ishashable(arg):


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\core.py in <listcomp>(.0)
    116     elif istask(arg):
    117         func, args = arg[0], arg[1:]
--> 118         args2 = [_execute_task(a, cache) for a in args]
    119         return func(*args2)
    120     elif not ishashable(arg):


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
    116     elif istask(arg):
    117         func, args = arg[0], arg[1:]
--> 118         args2 = [_execute_task(a, cache) for a in args]
    119         return func(*args2)
    120     elif not ishashable(arg):


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\core.py in <listcomp>(.0)
    116     elif istask(arg):
    117         func, args = arg[0], arg[1:]
--> 118         args2 = [_execute_task(a, cache) for a in args]
    119         return func(*args2)
    120     elif not ishashable(arg):


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
    116     elif istask(arg):
    117         func, args = arg[0], arg[1:]
--> 118         args2 = [_execute_task(a, cache) for a in args]
    119         return func(*args2)
    120     elif not ishashable(arg):


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\core.py in <listcomp>(.0)
    116     elif istask(arg):
    117         func, args = arg[0], arg[1:]
--> 118         args2 = [_execute_task(a, cache) for a in args]
    119         return func(*args2)
    120     elif not ishashable(arg):


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
    113     """
    114     if isinstance(arg, list):
--> 115         return [_execute_task(a, cache) for a in arg]
    116     elif istask(arg):
    117         func, args = arg[0], arg[1:]


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\core.py in <listcomp>(.0)
    113     """
    114     if isinstance(arg, list):
--> 115         return [_execute_task(a, cache) for a in arg]
    116     elif istask(arg):
    117         func, args = arg[0], arg[1:]


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
    117         func, args = arg[0], arg[1:]
    118         args2 = [_execute_task(a, cache) for a in args]
--> 119         return func(*args2)
    120     elif not ishashable(arg):
    121         return arg


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\dask\dataframe\shuffle.py in shuffle_group_3(df, col, npartitions, p)
    621     g = df.groupby(col)
    622     d = {i: g.get_group(i) for i in g.groups}
--> 623     p.append(d, fsync=True)
    624 
    625 


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\partd\encode.py in append(self, data, **kwargs)
     21 
     22     def append(self, data, **kwargs):
---> 23         data = valmap(self.encode, data)
     24         data = valmap(frame, data)
     25         self.partd.append(data, **kwargs)


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\toolz\dicttoolz.py in valmap(func, d, factory)
     81     """
     82     rv = factory()
---> 83     rv.update(zip(iterkeys(d), map(func, itervalues(d))))
     84     return rv
     85 


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\partd\pandas.py in serialize(df)
    156 
    157     for block in df._data.blocks:
--> 158         h, b = block_to_header_bytes(block)
    159         headers.append(h)
    160         bytes.append(b)


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\partd\pandas.py in block_to_header_bytes(block)
    126 
    127     header = (block.mgr_locs.as_array, values.dtype, values.shape, extension)
--> 128     bytes = pnp.compress(pnp.serialize(values), values.dtype)
    129     return header, bytes
    130 


F:\python\WPy64-3740\python-3.7.4.amd64\lib\site-packages\partd\numpy.py in serialize(x)
     99         return frame(pickle.dumps(l, protocol=pickle.HIGHEST_PROTOCOL))
    100     else:
--> 101         return x.tobytes()
    102 
    103 


AttributeError: 'IntegerArray' object has no attribute 'tobytes'