Python 每日数据，每3天重新采样，在后续5天内高效计算_Python_Pandas_Numpy

Python 每日数据：每3天重新采样一次，并高效计算过去5天（回溯窗口）内的汇总值

python pandas numpy

Python 每日数据，每3天重新采样，在后续5天内高效计算,python,pandas,numpy,Python,Pandas,Numpy,考虑df tidx = pd.date_range('2012-12-31', periods=11, freq='D') df = pd.DataFrame(dict(A=np.arange(len(tidx))), tidx) df 我想计算连续5天的总和，每3天一次我想是这样的这是经过编辑的我所拥有的是不正确的@伊万·波兹德耶夫和@boud注意到这是一个中心窗口，这不是我的意图。为困惑而道歉。每个人的解决方案都抓住了我所追求的大部分标准我正在寻找能够扩展到大型数据集的智

考虑

df

# Eleven consecutive daily timestamps, the first being 2012-12-31.
tidx = pd.date_range(start='2012-12-31', periods=11, freq='D')
# Single integer column "A" counting up from 0, one row per timestamp.
df = pd.DataFrame({'A': np.arange(tidx.size)}, index=tidx)
df

我想计算连续5天的总和，每3天一次

我想是这样的

这是经过编辑的
我之前给出的期望结果并不正确。@Ivan Pozdeev 和 @boud 指出那是一个居中窗口，而这并非我的本意。给大家造成困惑，深表歉意。
每个人的解决方案都抓住了我所追求的大部分

标准

我正在寻找能够扩展到大型数据集的智能高效解决方案
我会对各个解决方案进行计时，同时也会考虑其优雅程度
解决方案还应适用于各种样本和回溯频率

来自评论

我想要一个通用的解决方案来处理指定频率的回溯，并获取属于该回溯的任何内容。
- 对于上面的示例，回溯是
```
5D
```
  ，可能有4或50个观察值属于该回溯
我希望时间戳是回望期内最后观察到的时间戳

             A
2012-12-31   0
2013-01-01   1
2013-01-02   2
2013-01-03   3
2013-01-04   4
2013-01-05   5
2013-01-06   6
2013-01-07   7
2013-01-08   8
2013-01-09   9
2013-01-10  10

# Trailing 5-row rolling sum; rows without a full 5-row window come out NaN
# and are dropped, then the first remaining value of each 3-day bin is kept.
df.rolling(5,min_periods=5).sum().dropna().resample('3D').first()
Out[36]: 
                 A
2013-01-04 10.0000
2013-01-07 25.0000
2013-01-10 40.0000

np.searchsorted

import pandas as pd
import numpy as np
tidx = pd.date_range('2012-12-31', periods=11, freq='D')
df = pd.DataFrame(dict(A=np.arange(len(tidx))), tidx)
sample_freq = 3  # 天
sample_width = 5  # 天
sample_freq *= 86400  # 每天的秒数
sample_width *= 86400  # 每天的秒数
times = df.index.astype(np.int64)//10**9  # 时间戳数组（unix 时间）
cumsum = np.cumsum(df.A).as_matrix()  # 累积和数组（重叠较大时可省去重复求和）
mat = np.array([times, cumsum])  # 可以省去临时变量 times 和 cumsum
def yieldstep(mat, freq):
    normtime = ((mat[0] - mat[0,0]) / freq).astype(int)  # 表示样本编号的整数
    for i in range(max(normtime)+1):
        yield np.searchsorted(normtime, i)  # 产出窗口起始位置的索引
def sumwindow(mat, i, width):  # i 是 yieldstep 返回的窗口起始位置
    normtime = ((mat[0,i:] - mat[0,i]) / width).astype(int)  # 与之前相同，但按窗口宽度归一化
    j = np.searchsorted(normtime, i, side='right')-1  # 查找窗口的右端
    # 返回窗口最右侧的时间戳（自 unix 纪元起的秒数）以及窗口总和
    return mat[0,j], mat[1,j] - mat[1,i]  # 窗口的和就是 结束 - 开始，因为之前已做过累积求和
windowed_sums = np.array([sumwindow(mat, i, sample_width) for i in yieldstep(mat, sample_freq)])

场景#1：每个日期有多个条目，但没有遗漏日期

def vectorized_app3_v2(df, S=3, W=5):
    """Sum column ``A`` over W consecutive dates, restarting every S dates.

    Rows that share an index value are first collapsed into one total per
    distinct date; the windows then run over those per-date totals.  The
    index must be sorted ascending.  Windows are counted in distinct dates
    present in the index, so they only span W calendar days when no date
    is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a sorted index and a numeric column ``A``.
    S : int
        Step between the starts of consecutive windows, in distinct dates.
    W : int
        Window length, in distinct dates.

    Returns
    -------
    pandas.DataFrame
        One float column ``A`` with one row per *complete* window, labelled
        with the index value of the last date in that window.  Empty when
        fewer than W distinct dates are present.

    Raises
    ------
    ValueError
        If ``S`` or ``W`` is smaller than 1.
    """
    if S < 1 or W < 1:
        raise ValueError("S and W must both be at least 1")

    dt = df.index.values
    if dt.size == 0:
        return pd.DataFrame(np.zeros(0), index=dt, columns=['A'])

    # True on every row that opens a new date, so the running count of the
    # flags is a dense 0-based date id for each row.
    shifts = np.append(False, dt[1:] > dt[:-1])
    c = np.bincount(shifts.cumsum(), df.A.values)  # per-date totals

    if c.size < W:
        return pd.DataFrame(np.zeros(0), index=dt[:0], columns=['A'])

    # Count only windows that fit entirely inside the data.  Deriving the
    # slice bound from this count keeps every strided slice below the same
    # length even when (number of dates - W) is not a multiple of S.
    n_windows = (c.size - W) // S + 1
    stop = (n_windows - 1) * S + 1

    out = c[:stop:S].copy()
    for i in range(1, W):
        out += c[i:i + stop:S]

    # Row position of the first entry of every date (date 0 starts at row 0);
    # each window is labelled by the first row of its last date.
    first_rows = np.append(0, np.nonzero(shifts)[0])
    out_index = dt[first_rows[W - 1::S]]
    return pd.DataFrame(out, index=out_index, columns=['A'])

场景#2：每个日期和缺失日期有多个条目

场景#3：连续日期和每个日期仅一个条目

创建测试数据的建议

df_data

# One random integer in [0, 9) per element of idx0 (idx0 is defined outside
# this excerpt).
df_data = np.random.randint(0,9,(len(idx0)))

>>> n=5   # trailing periods for rolling sum
>>> k=3   # frequency of rolling sum calc

>>> df.rolling(n).sum()[-1::-k][::-1]

               A
2013-01-01   NaN
2013-01-04  10.0
2013-01-07  25.0
2013-01-10  40.0

# Aggregate to one row per calendar day, replacing any NaN total with 0.
df.resample('D').sum().fillna(0)

# Same daily totals, then an n-row trailing sum; [-1::-k][::-1] keeps every
# k-th row counted back from the last one, returned in ascending order.
df.resample('D').sum().fillna(0).rolling(n).sum()[-1::-k][::-1]

# Seed the global RNG so the sampled dates are reproducible.
np.random.seed(12345)
per = 11
# Draw `per` dates from a `per`-day range; the draw may repeat some dates
# and leave others out.
tidx = np.random.choice(
    pd.date_range(start='2012-12-31', periods=per, freq='D'),
    size=per,
)
# Number the draws 0..per-1 in draw order, then order the rows by date.
df = pd.DataFrame({'A': np.arange(len(tidx))}, index=tidx).sort_index()

def rolleach(df, ndays, window):
    """Return centred rolling sums of ``df`` sampled at every ``ndays``-th row.

    Each sum spans ``window`` rows centred on its own row.  The rows kept are
    those at positions ``ndays - 1``, ``2 * ndays - 1``, and so on.
    """
    centred_totals = df.rolling(window, center=True).sum()
    every_nth = slice(ndays - 1, None, ndays)
    return centred_totals[every_nth]

# Centred 5-row sums, keeping every 3rd row starting from the third.
rolleach(df, 3, 5)
Out[95]: 
               A
2013-01-02  10.0
2013-01-05  25.0
2013-01-08  40.0

list（）

~500ms

from __future__ import division
import numpy as np

# The date column is unimportant for the calculations.
# Extracting the numbers' column from the dataframe, and attaching the
# corresponding element of the data column to each result, is left as an
# exercise for the reader.
# Stand-in input: 100000 random integers drawn from [0, 100).
data = np.random.randint(100,size=100000)

def calc_trailing_data_with_interval(data,n,k):
    """Iterate over `data', yielding the sum of the `n' trailing elements
    after every `k'th element.

    One value is yielded per complete group of `k' elements; a final
    partial group is ignored.  The first ``ceil(n/k) - 1`` values are
    warm-up sums: fewer than `n' elements have been read by then, so they
    cover everything read so far.

    @type data: ndarray
    @param n: number of trailing elements to sum up (positive int)
    @param k: interval with which to calculate sums (positive int)
    @raise ValueError: on first iteration, if `n' or `k' is not positive
    """
    if n <= 0 or k <= 0:
        raise ValueError("n and k must be positive")

    lim_index=len(data)-k+1

    # ceil(n/k) in integer arithmetic: one accumulator per window that is
    # open at the same time.
    nsums = -(-n // k)
    sums = np.zeros(nsums,dtype=data.dtype)
    # Every group of k elements splits into a head of Mp elements and a
    # tail of M elements.  The newest window starts at the tail, so the head
    # feeds all accumulators but that one and the tail feeds all of them.
    # When n is a multiple of k the window starts on a group boundary:
    # the whole group is "tail".
    M=n%k or k
    Mp=k-M

    index=0
    currentsum=0

    while index<lim_index:
        for _ in range(Mp):
            #np.take is awkward, requiring a full list of indices to take
            for i in range(currentsum,currentsum+nsums-1):
                sums[i%nsums]+=data[index]
            index+=1
        for _ in range(M):
            sums+=data[index]
            index+=1
        yield sums[currentsum]
        # The finished slot becomes the newest window; clear it so elements
        # of the window just reported do not leak into the next one.
        sums[currentsum]=0
        currentsum=(currentsum+1)%nsums

150ms

np.int

dtype

~11ms

np.empty

~6.5ms

~5.5ms

# Setup input for exactly one entry per date
S = 4  # free to change
W = 7
datasize = 3  # scales the amount of generated data
# (datasize - 1) blocks of S days followed by one block of W days.
tidx = pd.date_range(start='2012-12-31', periods=(datasize - 1) * S + W, freq='D')
df = pd.DataFrame({'A': np.arange(tidx.size)}, index=tidx)

>>> n=5   # trailing periods for rolling sum
>>> k=3   # frequency of rolling sum calc

>>> df.rolling(n).sum()[-1::-k][::-1]

               A
2013-01-01   NaN
2013-01-04  10.0
2013-01-07  25.0
2013-01-10  40.0

def rolling_sum(a, n=5, k=3):
    """Return n-row trailing sums of ``a`` at every k-th row from the end.

    The values of ``a`` are flattened and cumulatively summed; differencing
    the cumulative sum at lag ``n`` gives each trailing window total.  Only
    rows with a full window are considered, and of those every ``k``-th one
    counted back from the last row is kept, in ascending order.  The result
    is a DataFrame with a single default-labelled column and the matching
    index labels of ``a``.
    """
    running = np.cumsum(a.values)
    # running[i] - running[i - n] is the total of the n values ending at i.
    lagged_difference = running[n:] - running[:-n]
    running[n:] = lagged_difference

    first_full = n - 1
    picked_sums = running[first_full:][::-1][::k][::-1]
    picked_rows = a[first_full:][::-1][::k][::-1]
    return pd.DataFrame(picked_sums, index=picked_rows.index)

# 6-row trailing sums at every 4th row from the end.
rolling_sum(df,n=6,k=4)   # default n=5, k=3

# Aggregate to one row per calendar day, replacing any NaN total with 0.
df.resample('D').sum().fillna(0)

# Same daily totals, then an n-row trailing sum; [-1::-k][::-1] keeps every
# k-th row counted back from the last one, returned in ascending order.
df.resample('D').sum().fillna(0).rolling(n).sum()[-1::-k][::-1]

# rolling_sum applied to the daily totals, with its default n and k.
rolling_sum( df.resample('D').sum().fillna(0) )

# Seed the global RNG so the sampled dates are reproducible.
np.random.seed(12345)
per = 11
# Draw `per` dates from a `per`-day range; the draw may repeat some dates
# and leave others out.
tidx = np.random.choice(
    pd.date_range(start='2012-12-31', periods=per, freq='D'),
    size=per,
)
# Number the draws 0..per-1 in draw order, then order the rows by date.
df = pd.DataFrame({'A': np.arange(len(tidx))}, index=tidx).sort_index()

def rolleach(df, ndays, window):
    """Return centred rolling sums of ``df`` sampled at every ``ndays``-th row.

    Each sum spans ``window`` rows centred on its own row.  The rows kept are
    those at positions ``ndays - 1``, ``2 * ndays - 1``, and so on.
    """
    centred_totals = df.rolling(window, center=True).sum()
    every_nth = slice(ndays - 1, None, ndays)
    return centred_totals[every_nth]

# Centred 5-row sums, keeping every 3rd row starting from the third.
rolleach(df, 3, 5)
Out[95]: 
               A
2013-01-02  10.0
2013-01-05  25.0
2013-01-08  40.0

from __future__ import division
import numpy as np

# The date column is unimportant for the calculations.
# Extracting the numbers' column from the dataframe, and attaching the
# corresponding element of the data column to each result, is left as an
# exercise for the reader.
# Stand-in input: 100000 random integers drawn from [0, 100).
data = np.random.randint(100,size=100000)

def calc_trailing_data_with_interval(data,n,k):
    """Iterate over `data', yielding the sum of the `n' trailing elements
    after every `k'th element.

    One value is yielded per complete group of `k' elements; a final
    partial group is ignored.  The first ``ceil(n/k) - 1`` values are
    warm-up sums: fewer than `n' elements have been read by then, so they
    cover everything read so far.

    @type data: ndarray
    @param n: number of trailing elements to sum up (positive int)
    @param k: interval with which to calculate sums (positive int)
    @raise ValueError: on first iteration, if `n' or `k' is not positive
    """
    if n <= 0 or k <= 0:
        raise ValueError("n and k must be positive")

    lim_index=len(data)-k+1

    # ceil(n/k) in integer arithmetic: one accumulator per window that is
    # open at the same time.
    nsums = -(-n // k)
    sums = np.zeros(nsums,dtype=data.dtype)
    # Every group of k elements splits into a head of Mp elements and a
    # tail of M elements.  The newest window starts at the tail, so the head
    # feeds all accumulators but that one and the tail feeds all of them.
    # When n is a multiple of k the window starts on a group boundary:
    # the whole group is "tail".
    M=n%k or k
    Mp=k-M

    index=0
    currentsum=0

    while index<lim_index:
        for _ in range(Mp):
            #np.take is awkward, requiring a full list of indices to take
            for i in range(currentsum,currentsum+nsums-1):
                sums[i%nsums]+=data[index]
            index+=1
        for _ in range(M):
            sums+=data[index]
            index+=1
        yield sums[currentsum]
        # The finished slot becomes the newest window; clear it so elements
        # of the window just reported do not leak into the next one.
        sums[currentsum]=0
        currentsum=(currentsum+1)%nsums

def calc_trailing_data_with_interval(data,n,k):
    """Iterate over `data', yielding the sum of the `n' trailing elements
    after every `k'th element.

    Variant that precomputes its loop ranges and accumulator indices.  One
    value is yielded per complete group of `k' elements; a final partial
    group is ignored.  The first ``ceil(n/k) - 1`` values are warm-up sums:
    fewer than `n' elements have been read by then, so they cover
    everything read so far.

    @type data: ndarray
    @param n: number of trailing elements to sum up (positive int)
    @param k: interval with which to calculate sums (positive int)
    @raise ValueError: on first iteration, if `n' or `k' is not positive
    """
    if n <= 0 or k <= 0:
        raise ValueError("n and k must be positive")

    lim_index=len(data)-k+1

    # ceil(n/k) in integer arithmetic: one accumulator per window that is
    # open at the same time.
    nsums = -(-n // k)
    sums = np.zeros(nsums,dtype=data.dtype)
    # Every group of k elements splits into a head of Mp elements and a
    # tail of M elements.  The newest window starts at the tail, so the head
    # feeds all accumulators but that one and the tail feeds all of them.
    # When n is a multiple of k the window starts on a group boundary:
    # the whole group is "tail".
    M=n%k or k
    Mp=k-M
    RM=range(M)     #cache for efficiency
    RMp=range(Mp)   #cache for efficiency

    index=0
    currentsum=0
    # For each possible value of currentsum, the accumulator slots fed by
    # head elements, already reduced modulo nsums.  The comprehension
    # variables deliberately do not reuse the name `currentsum': under
    # Python 2 a list comprehension leaks its loop variable and would
    # overwrite the starting slot.
    head_slots=[[(first+offset)%nsums for offset in range(nsums-1)]
            for first in range(nsums)]     #cache for efficiency

    while index<lim_index:
        for _ in RMp:
            #np.take is unusable as it allocates another array rather than view
            for i in head_slots[currentsum]:
                sums[i]+=data[index]
            index+=1
        for _ in RM:
            sums+=data[index]
            index+=1
        yield sums[currentsum]
        # The finished slot becomes the newest window; clear it so elements
        # of the window just reported do not leak into the next one.
        sums[currentsum]=0
        currentsum=(currentsum+1)%nsums