Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/303.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 使用最小/最大值应用程序在可变滚动窗口上缓慢执行_Python_Performance_Pandas_Numpy_Max - Fatal编程技术网

Python 使用最小/最大值应用程序在可变滚动窗口上缓慢执行

Python 使用最小/最大值应用程序在可变滚动窗口上缓慢执行,python,performance,pandas,numpy,max,Python,Performance,Pandas,Numpy,Max,我正在计算时间序列上的值(通过MyValue表示)。下面的代码标识事件发生的位置(交叉索引),然后统计最后8个事件(n交叉)。与每行时间相关的第8个交叉点的索引在序列max_lookback中设置 df = pd.DataFrame([myvalues, cross, max_lookback, maxz, minz ] ).transpose() print(df.tail(n=60)) 总代码只需约0.5秒即可设置max_lookback。但是,当我运行pd.apply()以获取从当前索引

我正在计算时间序列上的值(通过MyValue表示)。下面的代码标识事件发生的位置(交叉索引),然后统计最后8个事件(n交叉)。与每行时间相关的第8个交叉点的索引在序列max_lookback中设置

# Assemble the working series into one DataFrame (one column per series) and
# inspect the last 60 rows.
# NOTE(review): myvalues, cross, max_lookback, maxz and minz are defined
# elsewhere in the original program -- not shown in this excerpt.
df = pd.DataFrame([myvalues, cross, max_lookback, maxz, minz ] ).transpose()
print(df.tail(n=60))
总代码只需约0.5秒即可设置max_lookback。但是,当我运行pd.apply()以获取从当前索引到max_回溯的最小值和最大值时,代码运行大约需要22秒

我认为apply()应该比for循环更快地遍历行。为什么代码要花这么长时间才能执行,我怎样才能显著地提高速度

程序输出为

minmax的总时间为22.469秒

总运行时间为22.93秒


以图像为例,对于第19999行,我想在第19981行(max_lookback列)和第19999行之间找到MyValue的min/max,即-95和+97。

apply
通常不是一个非常有效的解决方案,因为它实际上只是引擎盖下的for循环本身

矢量化方法:

import pandas as pd
import numpy as np

# Build a 20,000-row sample: each row has ~10% odds of being a "cross" event
# and a random value in [-100, 100).
indices = pd.Series(range(20000))
sample_from = np.append(np.zeros(9), 1)  # 10% odds of selecting 1
cross = pd.Series(np.random.choice(sample_from, size=indices.size))
myvalues = pd.DataFrame(dict(Random=np.random.randint(-100,
                                                      100,
                                                      size=indices.size)))

n_crosses = 8

# Row positions where a cross occurred.
# Series.nonzero() was removed in pandas 1.0 -- np.flatnonzero is the
# supported replacement.
nonzeros = np.flatnonzero(cross)
# Distance (in rows) back to the (n_crosses-1)-th previous cross; clipped so
# the wrap-around values produced by np.roll at the head become 0.
diffs = (nonzeros - np.roll(nonzeros, n_crosses - 1)).clip(0)
myvalues['lower'] = np.nan
myvalues.loc[nonzeros, 'lower'] = diffs
# Turn distances into absolute window-start indices, forward-filled between
# crosses. fillna(method='ffill') is deprecated -> .ffill(); np.int was
# removed in NumPy 1.24 -> plain int.
myvalues.lower = ((myvalues.index.to_series() - myvalues.lower)
                   .ffill()
                   .fillna(0).astype(int))
# Until n_crosses events have been seen, the lookback window starts at row 0.
myvalues.loc[:(cross.cumsum() < n_crosses).sum() + 1, 'lower'] = 0

# Interleave [start, end) bounds for np.*.reduceat: even slots hold each
# row's window start, odd slots hold row index + 1 (the exclusive end).
reducer = np.empty((myvalues.shape[0] * 2,), dtype=myvalues.lower.dtype)
reducer[::2] = myvalues.lower.values
reducer[1::2] = myvalues.index.values + 1
# Append a sentinel row so the final exclusive end index stays in bounds.
myvalues.loc[myvalues.shape[0]] = [0, 0]
minmax_df = pd.DataFrame(
    {'min': np.minimum.reduceat(myvalues.Random.values, reducer)[::2],
     'max': np.maximum.reduceat(myvalues.Random.values, reducer)[::2]}
)
索引=pd.系列(范围(20000))
样本_from=np.append(np.zero(9),1)#选择1的10%几率
交叉=pd.系列(np.随机选择(样本来源,大小=指数大小))
myvalues=pd.DataFrame(dict(Random=np.Random.randint(-100,
100,                       
大小=索引。大小)
n_=8
非零=交叉。非零()
差异=(非零np.滚动(非零,n_交叉-1)).clip(0)
myvalues['lower']=np.nan
myvalues.loc[非零,'低']=diff
myvalues.lower=((myvalues.index.to_series()-myvalues.lower)
.fillna(方法='ffill')
.fillna(0.astype(np.int))
myvalues.loc[:(cross.cumsum()
这将产生与当前解决方案相同的最小/最大数据帧。基本思想是为
myvalues
中的每个索引生成最小/最大值的界限,然后使用来计算这些最小/最大值


在我的机器上，您当前的解决方案每次运行大约需要8.1秒，而上面的解决方案每次运行大约需要7.9毫秒——约1025倍的加速

这个答案是基于他出色的工作。我在代码中添加了注释,因为我花了大量时间来理解解决方案。我还发现了一些小问题

解决方案取决于numpy的函数

import pandas as pd
import numpy as np

indices = pd.Series(range(20000))
sample_from = np.append(np.zeros(2), 1) #10% odds of selecting 1
cross = pd.Series(np.random.choice(sample_from, size=indices.size))
myvalues = pd.DataFrame(dict(Random=np.random.randint(-100, 
                                                      100,                       
                                                      size=indices.size)))

n_crosses = 3

#eliminate nonzeros to speed up processing
nonzeros = cross.nonzero()[0]

#find the number of rows between each cross
diffs = (nonzeros-np.roll(nonzeros, n_crosses-1)).clip(0)

myvalues['lower'] = np.nan
myvalues.loc[nonzeros, 'lower'] = diffs

#set the index where a cross occurred
myvalues.lower = myvalues.index.to_series() - myvalues.lower

#fill the NA values with the previous cross index
myvalues.lower = myvalues.lower.fillna(method='ffill')
#fill the NaN values at the top of the series with 0
myvalues.lower = myvalues.lower.fillna(0).astype(np.int)

#set lower to 0 where crossses < n_crosses at the head of the Series
myvalues.loc[:(cross.cumsum() < n_crosses).sum()+1, 'lower'] = 0

#create a numpy array that lists the start and end index of events for each
# row in alternating order
reducer = np.empty((myvalues.shape[0]*2,), dtype=myvalues.lower.dtype)
reducer[::2] = myvalues.lower
reducer[1::2] = indices+1
reducer[len(reducer)-1] = indices[len(indices)-1]


myvalues['Cross'] = cross

#use reduceat to dramatically lower total execution time
myvalues['MinZ'] = np.minimum.reduceat( myvalues.iloc[:,0], reducer )[::2]
myvalues['MaxZ'] = np.maximum.reduceat( myvalues.iloc[:,0], reducer )[::2]

lastRow = len(myvalues)-1

#reduceat does not correctly identify the minimumu and maximum on the last row
#if a new min/max occurs on that row. This is a manual override

if myvalues.ix[lastRow,'MinZ'] >= myvalues.iloc[lastRow, 0]:
    myvalues.ix[lastRow,'MinZ'] = myvalues.iloc[lastRow, 0]

if myvalues.ix[lastRow,'MaxZ'] <= myvalues.iloc[lastRow, 0]:
    myvalues.ix[lastRow,'MaxZ'] = myvalues.iloc[lastRow, 0]    

print( myvalues.tail(n=60) )
将熊猫作为pd导入
将numpy作为np导入
指数=pd系列(范围(20000))
样本_from=np.append(np.zero(2),1)#选择1的10%几率
交叉=pd.系列(np.随机选择(样本来源,大小=指数大小))
myvalues=pd.DataFrame(dict(Random=np.Random.randint(-100,
100,                       
大小=索引。大小)
n_=3
#消除非零以加快处理速度
非零=交叉。非零()
#查找每个交叉点之间的行数
差异=(非零np.滚动(非零,n_交叉-1)).clip(0)
myvalues['lower']=np.nan
myvalues.loc[非零,'低']=diff
#设置发生交叉的索引
myvalues.lower=myvalues.index.to_series()-myvalues.lower
#用上一个交叉索引填充NA值
myvalues.lower=myvalues.lower.fillna(方法='ffill')
#用0填充序列顶部的NaN值
myvalues.lower=myvalues.lower.fillna(0).astype(np.int)
#当crossses=myvalues.iloc[lastRow,0]:
myvalues.ix[lastRow,'MinZ']=myvalues.iloc[lastRow,0]

如果myvalues.ix[lastRow,'MaxZ']与您所听到的相反,则行应用几乎从来都不是有效的解决方案。它不是矢量化的,实际上是引擎盖下的一个for循环。另外,你确定现在它工作正常吗?我在质疑你的
max\u lookback
值。。。据我所知,它们不会传播回8个事件之前。用for循环替换apply()会将时间增加到约27秒。我只是通过创建一个小屏幕截图再次检查了max lookback值。我将编辑帖子以确认工作。是的,当然,apply稍微快一点。然而,矢量化解决方案所带来的性能增益将大得多。我只是想拼凑一下你到底在做什么。啊,没关系,我知道你是如何使用
max\u lookback
的。现在看看。
import pandas as pd
import numpy as np

indices = pd.Series(range(20000))
sample_from = np.append(np.zeros(2), 1) #10% odds of selecting 1
cross = pd.Series(np.random.choice(sample_from, size=indices.size))
myvalues = pd.DataFrame(dict(Random=np.random.randint(-100, 
                                                      100,                       
                                                      size=indices.size)))

n_crosses = 3

#eliminate nonzeros to speed up processing
nonzeros = cross.nonzero()[0]

#find the number of rows between each cross
diffs = (nonzeros-np.roll(nonzeros, n_crosses-1)).clip(0)

myvalues['lower'] = np.nan
myvalues.loc[nonzeros, 'lower'] = diffs

#set the index where a cross occurred
myvalues.lower = myvalues.index.to_series() - myvalues.lower

#fill the NA values with the previous cross index
myvalues.lower = myvalues.lower.fillna(method='ffill')
#fill the NaN values at the top of the series with 0
myvalues.lower = myvalues.lower.fillna(0).astype(np.int)

#set lower to 0 where crossses < n_crosses at the head of the Series
myvalues.loc[:(cross.cumsum() < n_crosses).sum()+1, 'lower'] = 0

#create a numpy array that lists the start and end index of events for each
# row in alternating order
reducer = np.empty((myvalues.shape[0]*2,), dtype=myvalues.lower.dtype)
reducer[::2] = myvalues.lower
reducer[1::2] = indices+1
reducer[len(reducer)-1] = indices[len(indices)-1]


myvalues['Cross'] = cross

#use reduceat to dramatically lower total execution time
myvalues['MinZ'] = np.minimum.reduceat( myvalues.iloc[:,0], reducer )[::2]
myvalues['MaxZ'] = np.maximum.reduceat( myvalues.iloc[:,0], reducer )[::2]

lastRow = len(myvalues)-1

#reduceat does not correctly identify the minimumu and maximum on the last row
#if a new min/max occurs on that row. This is a manual override

if myvalues.ix[lastRow,'MinZ'] >= myvalues.iloc[lastRow, 0]:
    myvalues.ix[lastRow,'MinZ'] = myvalues.iloc[lastRow, 0]

if myvalues.ix[lastRow,'MaxZ'] <= myvalues.iloc[lastRow, 0]:
    myvalues.ix[lastRow,'MaxZ'] = myvalues.iloc[lastRow, 0]    

print( myvalues.tail(n=60) )