R 寻找一系列'n'真中第一个真的位置_R_Performance_Count_Boolean

R 寻找一系列'n'真中第一个真的位置

r performance

R 寻找一系列'n'真中第一个真的位置,r,performance,count,boolean,R,Performance,Count,Boolean,从真/假向量 set.seed(1) x = rnorm(1503501) > 0 我正在寻找一种性能（快速）方法，用于获取第一系列nTRUEs的第一个TRUE的位置。我正在处理的向量（x）正好包含1503501元素（除了一些更短的元素）。下面是我目前的解决方案。它使用for循环，但是for循环在R中非常慢。有更好的、特别是更快的解决方案吗 n = 20 count = 0 solution = -1 for (i in 1:length(x)){ if (x[i]){

从真/假向量

set.seed(1)
x = rnorm(1503501) > 0

我正在寻找一种性能（快速）方法，用于获取第一系列
n
TRUEs的第一个TRUE的位置。

我正在处理的向量（

）正好包含

元素（除了一些更短的元素）。下面是我目前的解决方案。它使用for循环，但是for循环在R中非常慢。有更好的、特别是更快的解决方案吗

n = 20

count = 0
solution = -1
for (i in 1:length(x)){
    if (x[i]){
        count = count + 1
        if (count == n){solution = i+1-n; break}
    } else {count = 0}
}
print(solution)
1182796

我在考虑使用向量化函数，然后做一些类似于

y=which（x）

或最终

y=paste（which（x））

的事情，并寻找特定的模式，但我不知道如何做。

你可以获取向量并添加一个FALSE（零）到开头并删除结尾，然后将此增强向量添加到原始向量（作为整数的0/1向量），然后再次执行相同的操作，从先前的增强向量开始再添加一个FALSE（零），删除结尾，然后将其添加到当前的滚动和向量（同样，以整数向量的形式进行加法），直到向量的移位拷贝总数达到n个为止。然后，您可以这样做（sum_x==n），其中sum_x是和向量，并取which（）返回的最小值，然后减去n-1，这将使您开始一行中第一次出现n个TRUE。如果n与向量的长度相比稍小，这将运行得更快。

查看此成绩单（仅使用更小的随机样本）.我认为很明显，编写一个函数可以很容易地找出满足连接条件的第一个位置，并在该点之前的长度上使用cumsum：

> x = rnorm(1500) > 0

> rle(x)
Run Length Encoding
  lengths: int [1:751] 1 1 1 2 1 3 1 2 2 1 ...
  values : logi [1:751] FALSE TRUE FALSE TRUE FALSE TRUE ...
> table( rle(x)$lengths )

  1   2   3   4   5   6   7   8   9 
368 193  94  46  33  10   2   4   1 
> table( rle(x)$lengths , rle(x)$values)

    FALSE TRUE
  1   175  193
  2   100   93
  3    47   47
  4    23   23
  5    21   12
  6     5    5
  7     2    0
  8     3    1
  9     0    1
> which( rle(x)$lengths==8 & rle(x)$values==TRUE)
[1] 542
> which( rle(x)$lengths==7 & rle(x)$values==TRUE)
integer(0)
> which( rle(x)$lengths==6 & rle(x)$values==TRUE)
[1]  12 484 510 720 744

这是我的候选函数：

 tpos <- function(x,pos) { rl <- rle(x); len <- rl$lengths; 
            sum(len[ 1:(which( len == pos & rl$values==TRUE)[1]-1)],1)}
 tpos(x,6)
#[1] 18

您可以使用

Rcpp

：

library(Rcpp)
cppFunction('int fC(LogicalVector x, int n) {
  int xs = x.size();
  int count = 0;
  int solution = -1;
  for (int i = 0; i < xs; ++i) {
    if (x[i]){
      if (++count == n){solution = i+2-n; break;}
    } else {
      count = 0;
    }
  }
  return solution;
}')

库（Rcpp）
cppFunction（'int fC（LogicalVector x，int n）{
int xs=x.size（）；
整数计数=0；
int解=-1；
对于（int i=0；i


以下是一项小型基准研究：
f1 <- function(x,n) {
  count = 0
  solution = -1
  for (i in 1:length(x)){
    if (x[i]){
      count = count + 1
      if (count == n){solution = i+1-n; break}
    } else {count = 0}
  }
  solution
}


set.seed(1)
x = rnorm(150350100) > 0
n = 20

print(f1(x,n)==fC(x,n))
# [1] TRUE


library(rbenchmark)
benchmark(f1(x,n),fC(x,n))
#       test replications elapsed relative user.self sys.self user.child sys.child
# 1 f1(x, n)          100  80.038  180.673    63.300   16.686          0         0
# 2 fC(x, n)          100   0.443    1.000     0.442    0.000          0         0

f1 0
n=20
打印（f1（x，n）=fC（x，n））
#[1]是的
图书馆（rbenchmark）
基准（f1（x，n），fC（x，n））
#测试复制经过相对user.self sys.self user.child sys.child
#1 f1（x，n）100 80.038 180.673 63.300 16.686 0 0
#2 fC（x，n）100 0.443 1.000 0.442 0.000 0

[更新的基准]
# Suggested by BondedDust
tpos <- function(x,pos) { rl <- rle(x); len <- rl$lengths; 
                          sum(len[ 1:(which( len == pos & rl$values==TRUE)[1]-1)],1)}

set.seed(1)
x = rnorm(1503501) > 0
n = 20

print(f1(x,n)==fC(x,n))
# [1] TRUE
print(f1(x,n)==tpos(x,n))
# [1] TRUE


benchmark(f1(x,n),fC(x,n),tpos(x,n),replications = 10)
#         test replications elapsed relative user.self sys.self user.child sys.child
# 1   f1(x, n)           10   4.756  110.605     4.735    0.020          0         0
# 2   fC(x, n)           10   0.043    1.000     0.043    0.000          0         0
# 3 tpos(x, n)           10   2.591   60.256     2.376    0.205          0         0

#BondedDust建议
TPO足够公平的问题（+1），但你刚刚发明了“性能”吗？哈哈。我是ESL。但我使用“性能”这个词并不是完全错误的（请参见）能够突破一定会提高性能。（我们的价值观是一致的。）我有点惊讶tpos比f1好，但rle可能是针对它的功能进行了优化。哦！我不知道有Rcpp这样的东西存在！这是如此强大，能够在R环境中用C快速定义我们的函数。+1非常感谢！这个答案对我来说将有比在pres中提出的问题更广泛的应用ent post.@BondedDust，我也很惊讶。rle
不是一个内部函数，而且非常简单。值得注意的是，compiler:：cmpfun
：cmpf1=compiler:：cmpfun（f1）
；rbenchmark:：benchmark（f1（x，n），fC（x，n），tpos（x，n），cmpf1（x，n），replications=10）
# Suggested by BondedDust
tpos <- function(x,pos) { rl <- rle(x); len <- rl$lengths; 
                          sum(len[ 1:(which( len == pos & rl$values==TRUE)[1]-1)],1)}

set.seed(1)
x = rnorm(1503501) > 0
n = 20

print(f1(x,n)==fC(x,n))
# [1] TRUE
print(f1(x,n)==tpos(x,n))
# [1] TRUE


benchmark(f1(x,n),fC(x,n),tpos(x,n),replications = 10)
#         test replications elapsed relative user.self sys.self user.child sys.child
# 1   f1(x, n)           10   4.756  110.605     4.735    0.020          0         0
# 2   fC(x, n)           10   0.043    1.000     0.043    0.000          0         0
# 3 tpos(x, n)           10   2.591   60.256     2.376    0.205          0         0