Python:将 Pandas 的合并(merge)操作迁移到 Dask
我试图在数据帧上执行一个有点复杂的操作,包括对numpy数组进行分组和操作。我可以在Pandas中运行该操作,但是Dask中的等效代码给了我一个错误,我还没有找到解决方法Python 将熊猫合并到Dask,python,pandas,numpy,dask,Python,Pandas,Numpy,Dask,我试图在数据帧上执行一个有点复杂的操作,包括对numpy数组进行分组和操作。我可以在Pandas中运行该操作,但是Dask中的等效代码给了我一个错误,我还没有找到解决方法 def dask_ordering(f): means = f.groupby("type")["vector"].apply(array_mean, meta="object").to_frame(name="mean") means['idx'] = means.index means['i
def dask_ordering(f):
    """Order the rows of a Dask DataFrame by cosine distance to their group mean.

    For every ``type`` group the mean of the ``vector`` column is computed and
    joined back onto the rows. Each row then gets a ``cosine distance``
    between its own vector and the mean of its group, and the rows are sorted
    by that distance inside each ``type`` group.

    Args:
        f: Dask DataFrame with a ``type`` column and a ``vector`` column that
            holds one numpy array per row.

    Returns:
        A Dask DataFrame with the extra columns ``mean``, ``idx`` and
        ``cosine distance``, sorted by ``cosine distance`` within each group.
    """
    means = (
        f.groupby("type")["vector"]
        .apply(array_mean, meta="object")
        .to_frame(name="mean")
    )
    # Merging on the index of `means` fails with "object and int64" merge
    # keys, so the group key is exposed as a string column and used instead.
    means["idx"] = means.index
    means["idx"] = means["idx"].astype(str)
    f = f.merge(means, left_on="type", right_on="idx")
    # The distance is a float, so describe it as such instead of "object".
    f["cosine distance"] = f.apply(
        lambda row: cosine(row["vector"], row["mean"]),
        axis=1,
        meta=("cosine distance", "f8"),
    )
    # Return the sorted frame: previously it was built and then thrown away,
    # so callers received the unsorted rows. The grouped sort keeps the same
    # columns, hence the column -> dtype mapping of `f` is used as meta.
    return f.groupby("type", group_keys=False).apply(
        lambda group: group.sort_values("cosine distance"),
        meta=f.dtypes.to_dict(),
    )
我的数据框中有两种项目:形状和颜色。每个项都与表示为numpy数组的向量相关联
from pandas import DataFrame
from numpy import array
from scipy.spatial.distance import cosine
from numpy import mean as array_mean
# Sample data: two "shape" items and three "color" items, each paired with a
# two-component numpy vector.
p = DataFrame(
    {
        "type": ["shape"] * 2 + ["color"] * 3,
        "vector": [
            array(components)
            for components in (
                [1.0, 1.1],
                [0.8, 0.9],
                [0.6, 0.8],
                [1.1, 1.2],
                [0.7, 0.9],
            )
        ],
    }
)
type vector
0 shape [1.0, 1.1]
1 shape [0.8, 0.9]
2 color [0.6, 0.8]
3 color [1.1, 1.2]
4 color [0.7, 0.9]
我想做以下工作:
def pandas_ordering(f):
    """Order rows by cosine distance to the mean vector of their ``type`` group.

    Args:
        f: pandas DataFrame with a ``type`` column and a ``vector`` column
            that holds one numpy array per row. The input frame is not
            modified.

    Returns:
        A new DataFrame with the added columns ``mean`` (the mean vector of
        the row's group) and ``cosine distance``. Rows are ordered by ``type``
        and, within each type, by ascending ``cosine distance``; the original
        index labels are kept.
    """
    means = f.groupby("type")["vector"].apply(array_mean).to_frame(name="mean")
    f = f.merge(means, left_on="type", right_index=True)
    f["cosine distance"] = f.apply(
        lambda row: cosine(row["vector"], row["mean"]), axis=1
    )
    # A two-key sort yields the same order as sorting inside each group, but
    # does not rely on groupby.apply operating on the grouping column, which
    # newer pandas versions deprecate and then exclude from the result.
    return f.sort_values(["type", "cosine distance"])
pandas_ordering(p)
type vector mean cosine distance
4 color [0.7, 0.9] [0.8000000000000002, 0.9666666666666667] 0.000459
2 color [0.6, 0.8] [0.8000000000000002, 0.9666666666666667] 0.001144
3 color [1.1, 1.2] [0.8000000000000002, 0.9666666666666667] 0.001280
0 shape [1.0, 1.1] [0.9, 1.0] 0.000012
1 shape [0.8, 0.9] [0.9, 1.0] 0.000019
我在 Dask 中重写了该函数。逻辑完全相同,除了增加了几个 `meta` 参数之外,代码也几乎相同。
import dask.dataframe as dd
f = dd.from_pandas(p, npartitions=1)
def dask_ordering(f):
    """Dask port of the pandas ordering routine, as posted in the question.

    This is the reproduction case: per the traceback that accompanies it, the
    merge on the index of ``means`` raises ``ValueError`` ("You are trying to
    merge on object and int64 columns").

    Args:
        f: Dask DataFrame with a ``type`` column and a ``vector`` column.

    Returns:
        The merged frame with a ``cosine distance`` column. The grouped sort
        at the end is built but its result is not used, so the returned frame
        is not sorted.
    """
    group_means = f.groupby("type")["vector"].apply(array_mean, meta="object")
    means = group_means.to_frame().rename(columns={0: "mean"})
    f = f.merge(means, left_on="type", right_index=True)

    def _distance_to_mean(row):
        # Cosine distance between a row's vector and its group's mean vector.
        return cosine(row["vector"], row["mean"])

    f["cosine distance"] = f.apply(_distance_to_mean, axis=1, meta="object")
    # The sorted result is not assigned; `f` itself is returned below.
    f.groupby("type", group_keys=False).apply(
        lambda grp: grp.sort_values("cosine distance"), meta="object"
    )
    return f
然而,Dask版本在尝试将均值帧与原始向量帧合并时会出现错误
dask_ordering(f).compute()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-120-46dd96a5db68> in <module>
----> 1 dask_ordering(f).compute()
<ipython-input-119-49ddda5479b1> in dask_ordering(f)
5 def dask_ordering(f):
6 means = f.groupby("type")["vector"].apply(array_mean, meta="object").to_frame().rename(columns={0:"mean"})
----> 7 f = f.merge(means, left_on="type", right_index=True)
8 f["cosine distance"] = f_1.apply(lambda row:cosine(row["vector"], row["mean"]), axis=1, meta="object")
9 f.groupby("type", group_keys=False).apply(lambda x:x.sort_values("cosine distance"), meta="object")
~/Documents/notebooks/env/lib/python3.6/site-packages/dask/dataframe/core.py in merge(self, right, how, on, left_on, right_on, left_index, right_index, suffixes, indicator, npartitions, shuffle)
3768 npartitions=npartitions,
3769 indicator=indicator,
-> 3770 shuffle=shuffle,
3771 )
3772
~/Documents/notebooks/env/lib/python3.6/site-packages/dask/dataframe/multi.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, suffixes, indicator, npartitions, shuffle, max_branch)
490 right_index=right_index,
491 suffixes=suffixes,
--> 492 indicator=indicator,
493 )
494
~/Documents/notebooks/env/lib/python3.6/site-packages/dask/dataframe/multi.py in single_partition_join(left, right, **kwargs)
321 # new index will not necessarily correspond the current divisions
322
--> 323 meta = left._meta_nonempty.merge(right._meta_nonempty, **kwargs)
324 kwargs["empty_index_dtype"] = meta.index.dtype
325 name = "merge-" + tokenize(left, right, **kwargs)
~/Documents/notebooks/env/lib/python3.6/site-packages/pandas/core/frame.py in merge(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
7347 copy=copy,
7348 indicator=indicator,
-> 7349 validate=validate,
7350 )
7351
~/Documents/notebooks/env/lib/python3.6/site-packages/pandas/core/reshape/merge.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
79 copy=copy,
80 indicator=indicator,
---> 81 validate=validate,
82 )
83 return op.get_result()
~/Documents/notebooks/env/lib/python3.6/site-packages/pandas/core/reshape/merge.py in __init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy, indicator, validate)
628 # validate the merge keys dtypes. We may need to coerce
629 # to avoid incompat dtypes
--> 630 self._maybe_coerce_merge_keys()
631
632 # If argument passed to validate,
~/Documents/notebooks/env/lib/python3.6/site-packages/pandas/core/reshape/merge.py in _maybe_coerce_merge_keys(self)
1136 inferred_right in string_types and inferred_left not in string_types
1137 ):
-> 1138 raise ValueError(msg)
1139
1140 # datetimelikes must match exactly
ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat
Pandas 中的 `means` 与 Dask 中的 `means.compute()` 两者都给出:
mean
type
color [0.8000000000000002, 0.9666666666666667]
shape [0.9, 1.0]
有人知道如何在 Dask 中实现这一点吗?
问题在于,当您创建 `means` 数据帧时,它的索引是 `int` 类型,而不是 `str` 类型(出于某种原因——它可能被自动转换成了分类类型或类似类型;这可能是一个值得在 GitHub 上提出的 bug)。
与此同时,下面的函数是一种变通方法:
def dask_ordering(f):
    """Order the rows of a Dask DataFrame by cosine distance to their group mean.

    For every ``type`` group the mean of the ``vector`` column is computed and
    joined back onto the rows. Each row then gets a ``cosine distance``
    between its own vector and the mean of its group, and the rows are sorted
    by that distance inside each ``type`` group.

    Args:
        f: Dask DataFrame with a ``type`` column and a ``vector`` column that
            holds one numpy array per row.

    Returns:
        A Dask DataFrame with the extra columns ``mean``, ``idx`` and
        ``cosine distance``, sorted by ``cosine distance`` within each group.
    """
    means = (
        f.groupby("type")["vector"]
        .apply(array_mean, meta="object")
        .to_frame(name="mean")
    )
    # Merging on the index of `means` fails with "object and int64" merge
    # keys, so the group key is exposed as a string column and used instead.
    means["idx"] = means.index
    means["idx"] = means["idx"].astype(str)
    f = f.merge(means, left_on="type", right_on="idx")
    # The distance is a float, so describe it as such instead of "object".
    f["cosine distance"] = f.apply(
        lambda row: cosine(row["vector"], row["mean"]),
        axis=1,
        meta=("cosine distance", "f8"),
    )
    # Return the sorted frame: previously it was built and then thrown away,
    # so callers received the unsorted rows. The grouped sort keeps the same
    # columns, hence the column -> dtype mapping of `f` is used as meta.
    return f.groupby("type", group_keys=False).apply(
        lambda group: group.sort_values("cosine distance"),
        meta=f.dtypes.to_dict(),
    )
def dask_ordering(f):
    """Order the rows of a Dask DataFrame by cosine distance to their group mean.

    For every ``type`` group the mean of the ``vector`` column is computed and
    joined back onto the rows. Each row then gets a ``cosine distance``
    between its own vector and the mean of its group, and the rows are sorted
    by that distance inside each ``type`` group.

    Args:
        f: Dask DataFrame with a ``type`` column and a ``vector`` column that
            holds one numpy array per row.

    Returns:
        A Dask DataFrame with the extra columns ``mean``, ``idx`` and
        ``cosine distance``, sorted by ``cosine distance`` within each group.
    """
    means = (
        f.groupby("type")["vector"]
        .apply(array_mean, meta="object")
        .to_frame(name="mean")
    )
    # Merging on the index of `means` fails with "object and int64" merge
    # keys, so the group key is exposed as a string column and used instead.
    means["idx"] = means.index
    means["idx"] = means["idx"].astype(str)
    f = f.merge(means, left_on="type", right_on="idx")
    # The distance is a float, so describe it as such instead of "object".
    f["cosine distance"] = f.apply(
        lambda row: cosine(row["vector"], row["mean"]),
        axis=1,
        meta=("cosine distance", "f8"),
    )
    # Return the sorted frame: previously it was built and then thrown away,
    # so callers received the unsorted rows. The grouped sort keeps the same
    # columns, hence the column -> dtype mapping of `f` is used as meta.
    return f.groupby("type", group_keys=False).apply(
        lambda group: group.sort_values("cosine distance"),
        meta=f.dtypes.to_dict(),
    )