如何加速R循环

如何加速R循环,r,R,我有一些数据看起来像 dfr <- data.frame(pos=1:20,val=sample(90:120,20)) pos val 1 1 116 2 2 97 3 3 100 4 4 105 5 5 112 6 6 95 7 7 91 8 8 117 9 9 98 10 10 94 11 11 110 12 12 118 13 13 120 14 14 115 15 15 103 16 16 10

我有一些数据看起来像

dfr <- data.frame(pos=1:20,val=sample(90:120,20))

   pos val
1    1 116
2    2  97
3    3 100
4    4 105
5    5 112
6    6  95
7    7  91
8    8 117
9    9  98
10  10  94
11  11 110
12  12 118
13  13 120
14  14 115
15  15 103
16  16 102
17  17 109
18  18  90
19  19  93
20  20 107
基准测试

library(microbenchmark)
library(ggplot2)

autoplot(microbenchmark("loop"=fn_median(dfr,5),times=1000))


这个代码太慢了。我如何改进它以使其更快?也许使用apply函数系列?

您可以使用
Rcpp
加快循环速度

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
DataFrame nMedianCpp(DataFrame df, int w) {

  NumericVector val = df["val"];
  int l = val.size() / w;
  int ind = w / 2;
  NumericVector res(l);
  NumericVector start(l);
  NumericVector end(l);
  NumericVector temp(l);

  for (int i = 0; i < l; i++) {

    end[i] = (i + 1) * w;
    start[i] = end[i] - w + 1;
    temp = val[Range(start[i] - 1, end[i] - 1)];
    temp.sort();
    if (w % 2 == 0) {
      res[i] = (temp[ind - 1] + temp[ind]) / 2;
    } else {
      res[i] = temp[ind];
    }
  }
  return DataFrame::create(_["start"] = start, _["end"] = end, _["median"] = res);
}

您可以使用
数据。表
并按
pos-1
的整数除以
5
(或其他
n
)进行分组

库(data.table)

fn\u中值
数据。表
使用行组进行汇总的解决方案

样本数据

dt <- fread("pos val
1 116
2  97
3 100
4 105
5 112
6  95
7  91
8 117
9  98
10  94
11 110
12 118
13 120
14 115
15 103
16 102
17 109
18  90
19  93
20 107")

如果你真的想,我想你可以把它放在一个自定义函数中,用
window
作为参数。

经过检查,我的答案基本上与@IceCreamToucan的答案相同……因为有人已经给了a+1,我不会删除它(现在)…如何将此函数分配给R对象?使用扩展名
.cpp
保存上述代码(在RStudio 1.2中,您甚至有“C++文件”模板),然后使用
Rcpp::sourceCpp('nMedianCpp.cpp')
对其进行源代码转换。
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
DataFrame nMedianCpp(DataFrame df, int w) {

  NumericVector val = df["val"];
  int l = val.size() / w;
  int ind = w / 2;
  NumericVector res(l);
  NumericVector start(l);
  NumericVector end(l);
  NumericVector temp(l);

  for (int i = 0; i < l; i++) {

    end[i] = (i + 1) * w;
    start[i] = end[i] - w + 1;
    temp = val[Range(start[i] - 1, end[i] - 1)];
    temp.sort();
    if (w % 2 == 0) {
      res[i] = (temp[ind - 1] + temp[ind]) / 2;
    } else {
      res[i] = temp[ind];
    }
  }
  return DataFrame::create(_["start"] = start, _["end"] = end, _["median"] = res);
}
Unit: microseconds
       expr        min          lq        mean      median         uq        max neval
       Rcpp    586.711    614.9285    784.7421    656.9605   1067.383   1262.981   100
  fn_median 152008.741 153254.4405 158502.5013 154716.9210 158738.811 310708.593   100
 fn_median2   2287.717   2365.5755   2544.5946   2393.2325   2423.802   8331.622   100
library(data.table)
fn_median <- function(df, n){
  setDT(df)
  df[, .(start = pos[1], end = last(pos), median = median(val))
      , by = .(drop = (pos - 1) %/% n)][, -'drop']
}

fn_median(dfr, 5)

#    start end median
# 1:     1   5    105
# 2:     6  10     95
# 3:    11  15    115
# 4:    16  20    102
library(microbenchmark)
dfr <- data.frame(pos = seq_len(1e4), val = sample(1e4))
microbenchmark(fn_median(dfr, 5), fn_median2(dfr, 5), times = 10)
# Unit: milliseconds
#                expr        min         lq       mean     median         uq       max neval
#   fn_median(dfr, 5) 113.324354 131.217695 147.213517 139.283545 167.387556 188.76767    10
#  fn_median2(dfr, 5)   2.896002   3.026053   4.554341   3.448822   3.687797  15.40021    10

dfr <- data.frame(pos = seq_len(1e6), val = sample(1e6))
microbenchmark(fn_median(dfr, 5), fn_median2(dfr, 5), times = 5)
# Unit: milliseconds
#                expr        min         lq      mean     median         uq        max neval
#   fn_median(dfr, 5) 13295.8565 13710.4458 13729.029 13734.9328 13876.7450 14027.1664     5
#  fn_median2(dfr, 5)    97.7186   103.9742   120.471   119.3268   121.1799   160.1556     5
library(data.table)
fn_median2 <- function(df, n){
  setDT(df)
  df[, .(start = pos[1], end = last(pos), median = median(val))
      , by = .(drop = (pos - 1) %/% n)][, -'drop']
}



fn_median <- function(dfr,win=5)
{
  n <- nrow(dfr)
  vec_start <- vector(length=floor(n/win),mode="numeric")
  vec_end <- vector(length=floor(n/win),mode="numeric")
  vec_median <- vector(length=floor(n/win),mode="numeric")
  k <- 1
  i <- 1
  while(i<=n)
  {
    vec_start[k] <- dfr$pos[i]
    vec_end[k] <- dfr$pos[i+(win-1)]
    vec_median[k] <- median(dfr$val[i:(i+(win-1))])
    k <- k+1
    i <- i+win
  }

  return(data.frame(start=vec_start,end=vec_end,median=vec_median))
}
dt <- fread("pos val
1 116
2  97
3 100
4 105
5 112
6  95
7  91
8 117
9  98
10  94
11 110
12 118
13 120
14 115
15 103
16 102
17 109
18  90
19  93
20 107")
window <- 5
#create group-incides of window-length
dt[, group := (pos - 1) %/% window]
#and now you can (by these groups) summarise whatever you want
dt[, list(start = pos[1], end = pos[.N], median = median(val) ), by = group][, group:=NULL][]
#    start end median
# 1:     1   5    105
# 2:     6  10     95
# 3:    11  15    115
# 4:    16  20    102