Python:如何按列名使用不同的方法对多级索引(MultiIndex)数据框重新采样
标签:python, pandas, multi-index
这是一个带有多级索引(MultiIndex)列的 Pandas v0.14.0 数据框:
> import pandas as pd
> import numpy as np
>
> rng = pd.date_range('1/1/2001', periods=6, freq='H')
> mi = [(dt, i) for dt in rng for i in range(2)]
> f = pd.DataFrame(np.random.randn(len(mi), 2),
> index = pd.MultiIndex.from_tuples(mi, names=['time', 'extra']),
columns =['A', 'B'])
> g = f.unstack('extra')
> g
A B
extra 0 1 0 1
time
2001-01-01 00:00:00 -0.169742 0.390842 -0.017884 1.043376
2001-01-01 01:00:00 -0.184442 -0.102512 -0.013702 0.675290
2001-01-01 02:00:00 0.244708 -0.360740 1.059269 -0.330537
2001-01-01 03:00:00 -2.275161 -1.782581 0.754368 -0.157851
2001-01-01 04:00:00 -0.554282 0.310691 0.917221 -0.114459
2001-01-01 05:00:00 0.599133 0.904824 1.858538 1.319041
我可以对所有列使用同一种方法成功地对 g 重新采样,例如通过 g.resample('6H', how=np.sum)。
如何对每一列使用不同的方法重新采样,例如对"A"列求和、对"B"列求平均值?
我尝试了以下写法,它适用于非多级索引的列,但在这里会抛出错误:
> g.resample('6H', how={'A': np.sum, 'B': np.mean})
KeyError Traceback (most recent call last)
<ipython-input-217-b1a72fd62178> in <module>()
4 g = f.unstack('extra')
5 print(g)
----> 6 g.resample('6H', how={'A': np.sum, 'B': np.mean})
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/generic.py in resample(self, rule, how, axis, fill_method, closed, label, convention, kind, loffset, limit, base)
2834 fill_method=fill_method, convention=convention,
2835 limit=limit, base=base)
-> 2836 return sampler.resample(self).__finalize__(self)
2837
2838 def first(self, offset):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/tseries/resample.py in resample(self, obj)
81
82 if isinstance(ax, DatetimeIndex):
---> 83 rs = self._resample_timestamps()
84 elif isinstance(ax, PeriodIndex):
85 offset = to_offset(self.freq)
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/tseries/resample.py in _resample_timestamps(self)
252 # downsample
253 grouped = obj.groupby(grouper, axis=self.axis)
--> 254 result = grouped.aggregate(self._agg_method)
255 else:
256 # upsampling shortcut
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in aggregate(self, arg, *args, **kwargs)
2402 colg = SeriesGroupBy(obj[col], selection=col,
2403 grouper=self.grouper)
-> 2404 result[col] = colg.aggregate(agg_how)
2405 keys.append(col)
2406
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2078 cyfunc = _intercept_cython(func_or_funcs)
2079 if cyfunc and not args and not kwargs:
-> 2080 return getattr(self, cyfunc)()
2081
2082 if self.grouper.nkeys > 1:
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in mean(self)
668 self._set_selection_from_grouper()
669 f = lambda x: x.mean(axis=self.axis)
--> 670 return self._python_agg_general(f)
671
672 def median(self):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
1012 # iterate through "columns" ex exclusions to populate output dict
1013 output = {}
-> 1014 for name, obj in self._iterate_slices():
1015 try:
1016 result, counts = self.grouper.agg_series(obj, f)
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in _iterate_slices(self)
650
651 def _iterate_slices(self):
--> 652 yield self.name, self._selected_obj
653
654 def transform(self, func, *args, **kwargs):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/lib.so in pandas.lib.cache_readonly.__get__ (pandas/lib.c:37563)()
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in _selected_obj(self)
461 return self.obj
462 else:
--> 463 return self.obj[self._selection]
464
465 def _set_selection_from_grouper(self):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/frame.py in __getitem__(self, key)
1682 return self._getitem_multilevel(key)
1683 else:
-> 1684 return self._getitem_column(key)
1685
1686 def _getitem_column(self, key):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/frame.py in _getitem_column(self, key)
1689 # get column
1690 if self.columns.is_unique:
-> 1691 return self._get_item_cache(key)
1692
1693 # duplicate columns & possible reduce dimensionaility
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1050 res = cache.get(item)
1051 if res is None:
-> 1052 values = self._data.get(item)
1053 res = self._box_item_values(item, values)
1054 cache[item] = res
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/internals.py in get(self, item)
2535
2536 if not isnull(item):
-> 2537 loc = self.items.get_loc(item)
2538 else:
2539 indexer = np.arange(len(self.items))[isnull(self.items)]
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/index.py in get_loc(self, key)
1154 loc : int if unique index, possibly slice or mask if not
1155 """
-> 1156 return self._engine.get_loc(_values_from_object(key))
1157
1158 def get_value(self, series, key):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/index.so in pandas.index.IndexEngine.get_loc (pandas/index.c:3650)()
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/index.so in pandas.index.IndexEngine.get_loc (pandas/index.c:3577)()
KeyError: 'B'
>g.resample('6H',how={'A':np.sum,'B':np.mean})
KeyError Traceback (most recent call last)
<ipython-input-217-b1a72fd62178> in <module>()
4 g = f.unstack('extra')
5 print(g)
----> 6 g.resample('6H', how={'A': np.sum, 'B': np.mean})
/用户/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/generic.py重采样(自我、规则、方式、轴、填充方法、闭合、标签、约定、种类、偏移、限制、基础)
2834填充方法=填充方法,约定=约定,
2835极限=极限,基准=基准)
-> 2836 return sampler.resample(self).__finalize__(self)
2837
2838 def first(自补偿):
/用户/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/tseries/resample.py重采样(self,obj)
81
82如果isinstance(ax,DatetimeIndex):
--->83 rs=自重采样时间戳()
84 elif isinstance(ax,周期索引):
85偏移量=至偏移量(自频率)
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/tseries/resample.py in_resample_timestaps(self)
252#下样本
253 grouped=obj.groupby(grouper,axis=self.axis)
-->254结果=分组聚合(自聚集法)
255其他:
256#上采样快捷方式
/聚合用户/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py(self、arg、*args、**kwargs)
2402 colg = SeriesGroupBy(obj[col], selection=col,
2403 grouper=self.grouper)
-> 2404 result[col] = colg.aggregate(agg_how)
2405键。追加(列)
2406
/聚合用户/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py(self、func_或_funcs、*args、**kwargs)
2078 cyfunc=\u intercept\u cython(func\u或\u funcs)
2079如果cyfunc和not args和not kwargs:
->2080返回getattr(self,cyfunc)()
2081
2082如果self.grouper.nkeys>1:
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py平均值(self)
668 self._set_selection_from_grouper()
669 f=λx:x.平均值(轴=自身轴)
-->670返回自我。通用(f)
671
672 def中位数(自身):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in_python_agg_general(self、func、*args、**kwargs)
1012#在排除项之外的“列”中迭代以填充输出dict
1013输出={}
->1014对于名称,对象在self.\u迭代\u slices():
1015试试:
1016结果,计数=self.grouper.agg_系列(obj,f)
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in_iterate_slice(self)
650
651定义迭代切片(自):
-->652生成self.name,self.\u所选对象
653
654 def转换(self、func、*args、**kwargs):
/pandas.lib.cache\u readonly.中的Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/lib.so.\uuuu get\uuuuu(pandas/lib.c:37563)()
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in_selected_obj(self)
461返回self.obj
462其他:
-->463返回自我对象[自我选择]
464
465定义设置从grouper(自身)选择:
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/frame.py in_u___getitem_u_(self,key)
1682返回自我。\u获取项目\u多级(键)
1683其他:
->1684返回自我。\u获取项目\u列(键)
1685
1686 def _getitem_列(自身,键):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/frame.py在_getitem_列中(self,key)
1689#获取列
1690如果self.columns.u是唯一的:
->1691返回自我。获取项目缓存(密钥)
1692
1693#重复列和可能的降维
/缓存中的Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/generic.py(self,item)
1050 res=cache.get(项)
1051如果res为无:
->1052值=自身数据获取(项目)
1053 res=自身值(项目,值)
1054缓存[项目]=res
/get中的Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/internals.py(self,item)
2535
2536如果不为空(项目):
->2537 loc=自身物品。获取物品位置(物品)
2538其他:
2539 indexer=np.arange(len(self.items))[isnull(self.items)]
/get_loc(self,key)中的Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/index.py
1154 loc:int如果是唯一索引,则可能是切片或掩码(如果不是)
1155 """
->1156返回self.\u引擎。获取\u loc(\u值\u来自\u对象(键))
1157
1158 def get_值(自身、系列、键):
/用户/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/index.so in pandas.index.IndexEngine.get_loc(pandas/index.c:3650)()
/用户/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/index.so in pandas.index.IndexEngine.get_loc(pandas/index.c:3577)()
KeyError: 'B'
如果你从 f(尚未 unstack 的数据框)出发,可以使用带有 pd.TimeGrouper 的 groupby:
In [11]: grp = f.groupby(pd.TimeGrouper('6H', level=0))
In [12]: grp['A'].sum()
Out[12]:
0
2001-01-01 -1.805954
Freq: 6H, Name: A, dtype: float64
In [13]: grp['B'].mean()
Out[13]:
0
2001-01-01 -0.461053
Freq: 6H, Name: B, dtype: float64
In [21]: grp2 = f.groupby([pd.TimeGrouper('6H', level=0),
f.index.get_level_values('extra')])
In [22]: grp2['A'].sum()
Out[22]:
0 extra
2001-01-01 0 2.030321
1 -3.836275
Name: A, dtype: float64
In [23]: grp2['B'].mean()
Out[23]:
0 extra
2001-01-01 0 -0.554839
1 -0.367267
Name: B, dtype: float64
In [31]: f2 = g.stack(level=1) # Note: use stack to get f from g
In [32]: pd.DataFrame({'A': grp['A'].sum(), 'B': grp['B'].mean()})
Out[32]:
A B
0 extra
2001-01-01 0 -2.762064 -0.269427
1 -2.006839 -0.026213
In [33]: _.unstack(level=1)
Out[33]:
A B
extra 0 1 0 1
0
2001-01-01 -2.762064 -2.006839 -0.269427 -0.026213
In [41]: dict(zip(g.columns,
map({'A': 'sum', 'B': 'mean'}.get,
[x[0] for x in g.columns])))
Out[41]: {('A', 0): 'sum', ('A', 1): 'sum', ('B', 0): 'mean', ('B', 1): 'mean'}
In [42]: g.resample('6H', _)
Out[42]:
A B A B
1 0 0 1
time
2001-01-01 -3.836275 -0.554839 2.030321 -0.367267