Python 如何在不添加额外索引的情况下使用groupby apply()
标签:python, pandas, apply
我经常想通过组合一个分组数据帧的多个列来创建一个新的数据帧。apply()函数允许我这样做,但它要求我创建一个不需要的索引:
In [359]: df = pandas.DataFrame({'x': 3 * ['a'] + 2 * ['b'], 'y': np.random.normal(size=5), 'z': np.random.normal(size=5)})
In [360]: df
Out[360]:
x y z
0 a 0.201980 -0.470388
1 a 0.190846 -2.089032
2 a -1.131010 0.227859
3 b -0.263865 -1.906575
4 b -1.335956 -0.722087
In [361]: df.groupby('x').apply(lambda x: pandas.DataFrame({'r': (x.y + x.z).sum() / x.z.sum(), 's': (x.y + x.z ** 2).sum() / x.z.sum()}))
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/home/emarkley/work/src/partner_analysis2/main.py in <module>()
----> 1 df.groupby('x').apply(lambda x: pandas.DataFrame({'r': (x.y + x.z).sum() / x.z.sum(), 's': (x.y + x.z ** 2).sum() / x.z.sum()}))
/usr/local/lib/python3.2/site-packages/pandas-0.8.2.dev-py3.2-linux-x86_64.egg/pandas/core/groupby.py in apply(self, func, *args, **kwargs)
267 applied : type depending on grouped object and function
268 """
--> 269 return self._python_apply_general(func, *args, **kwargs)
270
271 def aggregate(self, func, *args, **kwargs):
/usr/local/lib/python3.2/site-packages/pandas-0.8.2.dev-py3.2-linux-x86_64.egg/pandas/core/groupby.py in _python_apply_general(self, func, *args, **kwargs)
417 group_axes = _get_axes(group)
418
--> 419 res = func(group, *args, **kwargs)
420
421 if not _is_indexed_like(res, group_axes):
/home/emarkley/work/src/partner_analysis2/main.py in <lambda>(x)
----> 1 df.groupby('x').apply(lambda x: pandas.DataFrame({'r': (x.y + x.z).sum() / x.z.sum(), 's': (x.y + x.z ** 2).sum() / x.z.sum()}))
/usr/local/lib/python3.2/site-packages/pandas-0.8.2.dev-py3.2-linux-x86_64.egg/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
371 mgr = self._init_mgr(data, index, columns, dtype=dtype, copy=copy)
372 elif isinstance(data, dict):
--> 373 mgr = self._init_dict(data, index, columns, dtype=dtype)
374 elif isinstance(data, ma.MaskedArray):
375 mask = ma.getmaskarray(data)
/usr/local/lib/python3.2/site-packages/pandas-0.8.2.dev-py3.2-linux-x86_64.egg/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype)
454 # figure out the index, if necessary
455 if index is None:
--> 456 index = extract_index(data)
457 else:
458 index = _ensure_index(index)
/usr/local/lib/python3.2/site-packages/pandas-0.8.2.dev-py3.2-linux-x86_64.egg/pandas/core/frame.py in extract_index(data)
4719
4720 if not indexes and not raw_lengths:
-> 4721 raise ValueError('If use all scalar values, must pass index')
4722
4723 if have_series or have_dicts:
ValueError: If use all scalar values, must pass index
In [362]: df.groupby('x').apply(lambda x: pandas.DataFrame({'r': (x.y + x.z).sum() / x.z.sum(), 's': (x.y + x.z ** 2).sum() / x.z.sum()}, index=[0]))
Out[362]:
r s
x
a 0 1.316605 -1.672293
b 0 1.608606 -0.972593
In [359]: df = pandas.DataFrame({'x': 3 * ['a'] + 2 * ['b'], 'y': np.random.normal(size=5), 'z': np.random.normal(size=5)})
In [360]: df
Out[360]:
x y z
0 a 0.201980 -0.470388
1 a 0.190846 -2.089032
2 a -1.131010 0.227859
3 b -0.263865 -1.906575
4 b -1.335956 -0.722087
In [361]: df.groupby('x').apply(lambda x: pandas.DataFrame({'r': (x.y + x.z).sum() / x.z.sum(), 's': (x.y + x.z ** 2).sum() / x.z.sum()}))
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/home/emarkley/work/src/partner_analysis2/main.py in <module>()
----> 1 df.groupby('x').apply(lambda x: pandas.DataFrame({'r': (x.y + x.z).sum() / x.z.sum(), 's': (x.y + x.z ** 2).sum() / x.z.sum()}))
/usr/local/lib/python3.2/site-packages/pandas-0.8.2.dev-py3.2-linux-x86_64.egg/pandas/core/groupby.py in apply(self, func, *args, **kwargs)
267 applied : type depending on grouped object and function
268 """
--> 269 return self._python_apply_general(func, *args, **kwargs)
270
271 def aggregate(self, func, *args, **kwargs):
/usr/local/lib/python3.2/site-packages/pandas-0.8.2.dev-py3.2-linux-x86_64.egg/pandas/core/groupby.py in _python_apply_general(self, func, *args, **kwargs)
417 group_axes = _get_axes(group)
418
--> 419 res = func(group, *args, **kwargs)
420
421 if not _is_indexed_like(res, group_axes):
/home/emarkley/work/src/partner_analysis2/main.py in <lambda>(x)
----> 1 df.groupby('x').apply(lambda x: pandas.DataFrame({'r': (x.y + x.z).sum() / x.z.sum(), 's': (x.y + x.z ** 2).sum() / x.z.sum()}))
/usr/local/lib/python3.2/site-packages/pandas-0.8.2.dev-py3.2-linux-x86_64.egg/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
371 mgr = self._init_mgr(data, index, columns, dtype=dtype, copy=copy)
372 elif isinstance(data, dict):
--> 373 mgr = self._init_dict(data, index, columns, dtype=dtype)
374 elif isinstance(data, ma.MaskedArray):
375 mask = ma.getmaskarray(data)
/usr/local/lib/python3.2/site-packages/pandas-0.8.2.dev-py3.2-linux-x86_64.egg/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype)
454 # figure out the index, if necessary
455 if index is None:
--> 456 index = extract_index(data)
457 else:
458 index = _ensure_index(index)
/usr/local/lib/python3.2/site-packages/pandas-0.8.2.dev-py3.2-linux-x86_64.egg/pandas/core/frame.py in extract_index(data)
4719
4720 if not indexes and not raw_lengths:
-> 4721 raise ValueError('If use all scalar values, must pass index')
4722
4723 if have_series or have_dicts:
ValueError: If use all scalar values, must pass index
In [362]: df.groupby('x').apply(lambda x: pandas.DataFrame({'r': (x.y + x.z).sum() / x.z.sum(), 's': (x.y + x.z ** 2).sum() / x.z.sum()}, index=[0]))
Out[362]:
r s
x
a 0 1.316605 -1.672293
b 0 1.608606 -0.972593
是否有任何方法可以使用apply()或其他函数来获得相同的结果,而不需要额外的零索引?
回答:您正在为每个组生成一个聚合的 r 和 s 值,因此您应该在此处返回一个 Series:
In [26]: df.groupby('x').apply(lambda x:
Series({'r': (x.y + x.z).sum() / x.z.sum(),
's': (x.y + x.z ** 2).sum() / x.z.sum()}))
Out[26]:
r s
x
a -0.338590 -0.916635
b 66.655533 102.566146
谢谢。我之前也不知道应该返回一个 Series,所以尝试返回了一个 DataFrame。这一信息(应该返回 Series)应该写进官方文档中。