C++ 如何在C++/Rcpp_C++_R_Rcpp_Armadillo

C++ 如何在C++/Rcpp

c++ r

C++ 如何在C++/Rcpp,c++,r,rcpp,armadillo,C++,R,Rcpp,Armadillo,我有一个大的向量，包含一堆双元素。给定一个百分位向量数组，例如percentile_vec=c（0.90,0.91,0.92,0.93,0.94,0.95）。我目前正在使用Rcppsort函数对大向量进行排序，然后找到相应的百分位值。以下是主要代码： // [[Rcpp::export]] NumericVector sort_rcpp(Rcpp::NumericVector& x) { std::vector<double> tmp = Rcpp::as<std:

我有一个很大的向量，其中包含大量 double 类型的元素。给定一个百分位数向量，例如

percentile_vec = c(0.90, 0.91, 0.92, 0.93, 0.94, 0.95)

。我目前正在使用Rcpp

sort

函数对大向量进行排序，然后找到相应的百分位值。以下是主要代码：

// Returns an ascending-sorted copy of `x`; `x` itself is not modified.
// The values are copied into a std::vector<double>, ordered with std::sort,
// and handed back to R through wrap().
// [[Rcpp::export]]
NumericVector sort_rcpp(Rcpp::NumericVector& x)
{
  // Work on a private copy so the caller's vector keeps its original order.
  std::vector<double> sorted_values(x.begin(), x.end());

  double* const first = sorted_values.data();
  std::sort(first, first + sorted_values.size());

  return wrap(sorted_values);
}

// Nearest-rank percentiles of `x`.
//
// `x` is sorted (via sort_rcpp, defined elsewhere in this file) and, for every
// entry p of `percentile`, the element at 1-based rank round(length(x) * p) is
// returned. p is a fraction, e.g. 0.95 for the 95th percentile.
//
// The rank is clamped to [1, length(x)], so fractions below 0 yield the
// smallest element and fractions above 1 yield the largest one instead of
// indexing outside the sorted vector. If `x` is empty, or a fraction is
// NA/NaN, the corresponding result is NA.
//
// Returns a numeric vector with one value per entry of `percentile`.
// [[Rcpp::export]]
NumericVector percentile_rcpp(Rcpp::NumericVector& x, Rcpp::NumericVector& percentile)
{
  NumericVector tmp_sort = sort_rcpp(x);
  const double n_sorted = static_cast<double>(tmp_sort.size());
  const int n_percentiles = percentile.size();
  NumericVector percentile_vec = no_init(n_percentiles);
  for (int ii = 0; ii < n_percentiles; ii++)
  {
    const double fraction = percentile[ii];
    if (n_sorted < 1.0 || std::isnan(fraction))
    {
      // No data to pick from, or an undefined fraction: report NA rather than
      // reading an arbitrary (or out-of-range) element.
      percentile_vec[ii] = NumericVector::get_na();
      continue;
    }

    // 1-based nearest rank, kept inside [1, n_sorted].
    double rank = std::round(n_sorted * fraction);
    if (rank < 1.0)
    {
      rank = 1.0;
    }
    else if (rank > n_sorted)
    {
      rank = n_sorted;
    }

    // Convert the 1-based rank to a 0-based integer index explicitly.
    percentile_vec[ii] = tmp_sort[static_cast<long>(rank) - 1];
  }
  return percentile_vec;
}

针对 x = runif(1E6) 的测试结果如下所示：
microbenchmark(sub_percentile(x)->aa, percentile_rcpp(x, c(.90, .91, .92, .93, .94, .95))->bb)
#Unit: milliseconds
              expr      min       lq     mean   median       uq       max   neval
  sub_percentile(x) 99.00029 99.24160 99.35339 99.32162 99.41869 100.57160   100
 percentile_rcpp(~) 87.13393 87.30904 87.44847 87.40826 87.51547  88.41893   100

我希望百分位数的计算速度很快，但我认为 std::sort(tmp.begin(), tmp.end())
会拖慢速度。有没有更好的方法，可以用 C++、Rcpp/RcppArmadillo 更快地得到结果？谢谢。
循环中的分支肯定可以优化：改用 std::min/std::max 调用并配合 int 类型即可。
我会用下面的函数把百分位数换算成数组索引：
// Maps a percentile fraction to the nearest 0-based index of a sorted array.
//
// pc   - percentile as a fraction, nominally in [0, 1] (0.95 = 95th percentile).
// size - number of elements in the array.
//
// Returns round(pc * (size - 1)), always inside [0, size - 1]:
//   * pc <= 0 or NaN maps to the first index (0),
//   * pc >= 1 maps to the last index (size - 1),
//   * size == 0 returns 0 (there is no valid index; callers must not
//     dereference it) instead of letting `size - 1` wrap around.
unsigned int PerCentIndex( double pc, unsigned int size )
{
    // `!( pc > 0.0 )` is also true for NaN, which must not reach the cast below.
    if ( size == 0 || !( pc > 0.0 ) )
    {
        return 0;
    }
    if ( pc >= 1.0 )
    {
        return size - 1;
    }
    // Adding 0.5 before truncation rounds to the nearest index.
    return static_cast<unsigned int>( 0.5 + static_cast<double>( size - 1 ) * pc );
}

这样，上述循环体中就只需要这一行：
percentile_vec[ii] 
 = tmp_sort[ PerCentIndex( percentile[ii], tmp_sort.size() ) ];

根据需要计算的百分位数的个数和向量的大小，您可以做得比对整个向量排序（最好也要 O(N*log(N))）好得多（只需 O(N)）。
我必须计算1%的向量（>=160K）元素，所以我做了以下工作：
// Computes the "percent"-th percentile of the `len` values starting at `in`,
// using the linear interpolation scheme of MATLAB's prctile.m: the i-th
// smallest of n values (1-based) is taken as the 100*(i - 0.5)/n percentile,
// values in between are interpolated linearly, and requests outside that span
// are clamped to the smallest / largest element.
//
// in      - input values; partially reordered in place by std::nth_element.
// len     - number of values (dim_t is declared elsewhere; assumed to be an
//           integer type - TODO confirm).
// percent - requested percentile on a 0..100 scale; values outside that range
//           are clamped, NaN yields NaN in range[1].
// range   - output: range[0] receives the minimum of the data, range[1] the
//           percentile. It is grown to two elements if it is shorter.
//
// If `in` is null or `len` is not positive there is nothing to compute and
// `range` is left untouched.
void prctile_stl(double* in, const dim_t &len, const double &percent, std::vector<double> &range) {
    if (!in || len <= 0)
        return;

    // Both slots are written below; grow instead of writing past the end.
    if (range.size() < 2)
        range.resize(2);

    const double n = static_cast<double>(len);
    double r = (percent / 100.) * n;

    if (r != r) {               // NaN request: no position to select
        range[1] = r;
        range[0] = *std::min_element(in, in + len);
        return;
    }

    // Clamp the position so the indices derived from it stay inside [0, len - 1].
    if (r < 0.)
        r = 0.;
    else if (r > n)
        r = n;

    // k is the 1-based rank of the lower neighbour (0 = below the first sample,
    // len = at/after the last one); frac is the offset from k, in [-0.5, 0.5).
    const dim_t k = static_cast<dim_t>(r + 0.5);    // r >= 0, so truncation == floor
    const double frac = r - static_cast<double>(k);

    // 0-based positions (in sorted order) of the two values to blend. They
    // must follow k, otherwise the weights below are applied to the wrong pair.
    const dim_t idx_lo = (k > 0) ? k - 1 : 0;
    const dim_t idx_up = (k < len) ? k : len - 1;

    double lower = 0;
    double upper = 0;

    if (r >= n / 2.) {          // Second half is smaller: select lower, scan the right part
        std::nth_element(in, in + idx_lo, in + len);        // Complexity O(N)
        lower = in[idx_lo];
        if (idx_up > idx_lo)
            upper = *std::min_element(in + idx_lo + 1, in + len);
        else
            upper = lower;
    }
    else {                      // First half is smaller: select upper, scan the left part
        std::nth_element(in, in + idx_up, in + len);        // Complexity O(N)
        upper = in[idx_up];
        if (idx_lo < idx_up)
            lower = *std::max_element(in, in + idx_up);
        else
            lower = upper;
    }

    // Linear interpolation between the two neighbours.
    range[1] = (0.5 - frac) * lower + (0.5 + frac) * upper;

    range[0] = *std::min_element(in, in + len);
}

void prctile\u stl（双*英寸，常数尺寸和长度，常数双和百分比，标准：：向量和范围）{
//计算“百分比”百分位数。
//线性插值的灵感来源于MATLAB中的prctile.m。
双r=（百分比/100.）*len；
双下=0；
双上限=0；
double*min_ptr=NULL；
尺寸k=0；
如果（r>=len/2.）{//则后半部分较小
dim_t idx_lo=最大值（r-1，（双）0.）；
第N个元素（in，in+idx\u-lo，in+len）；//复杂性O（N）
较低=英寸[idx_lo]；
如果（idx_lo0）{
max_ptr=max_元素（in，in+idx_up）；
下限=*最大值；
}
其他的
下=上；
}
//线性插值
k=r+0.5；//隐含楼层
r=r-k；
范围[1]=（0.5-r）*下限+（0.5+r）*上限；
min_ptr=min_元素（in，in+len）；
范围[0]=*min\u ptr；
}

另一种替代方法是《Numerical Recipes》（第 3 版）一书中的 IQAgent 算法。
它最初用于数据流，但您可以通过将大型数据向量拆分为较小的块（例如 10K 个元素）并计算每个块的百分位数（其中使用了对 10K 个元素的块的排序）来“欺骗”它。如果一次处理一个块，每个连续块都会稍微修改百分位数的值，直到最后得到一个非常好的近似值。该算法给出了很好的结果（精确到小数点后 3 或 4 位），但仍然比基于 nth_element 的实现慢。
您可能已经意识到了这一点，但这些函数产生的结果略有不同。好的排序将是O（n log（n）），并且您无法获得比对向量排序更好的结果。之后，您将通过向量进行线性搜索，以找到相应的元素。您可能会从中受益，因为您有一个排序向量。@nurssell您完全正确，我也很好奇R是如何进行百分位数
计算的。我注意到，对于runif（1E6）
，两个结果略有不同，这在我的容许范围内。@NathanOliver感谢您的输入。需要一段时间look@Alvin我相信是基本R的分位数函数的实现。
// Computes the "percent"-th percentile of the `len` values starting at `in`,
// using the linear interpolation scheme of MATLAB's prctile.m: the i-th
// smallest of n values (1-based) is taken as the 100*(i - 0.5)/n percentile,
// values in between are interpolated linearly, and requests outside that span
// are clamped to the smallest / largest element.
//
// in      - input values; partially reordered in place by std::nth_element.
// len     - number of values (dim_t is declared elsewhere; assumed to be an
//           integer type - TODO confirm).
// percent - requested percentile on a 0..100 scale; values outside that range
//           are clamped, NaN yields NaN in range[1].
// range   - output: range[0] receives the minimum of the data, range[1] the
//           percentile. It is grown to two elements if it is shorter.
//
// If `in` is null or `len` is not positive there is nothing to compute and
// `range` is left untouched.
void prctile_stl(double* in, const dim_t &len, const double &percent, std::vector<double> &range) {
    if (!in || len <= 0)
        return;

    // Both slots are written below; grow instead of writing past the end.
    if (range.size() < 2)
        range.resize(2);

    const double n = static_cast<double>(len);
    double r = (percent / 100.) * n;

    if (r != r) {               // NaN request: no position to select
        range[1] = r;
        range[0] = *std::min_element(in, in + len);
        return;
    }

    // Clamp the position so the indices derived from it stay inside [0, len - 1].
    if (r < 0.)
        r = 0.;
    else if (r > n)
        r = n;

    // k is the 1-based rank of the lower neighbour (0 = below the first sample,
    // len = at/after the last one); frac is the offset from k, in [-0.5, 0.5).
    const dim_t k = static_cast<dim_t>(r + 0.5);    // r >= 0, so truncation == floor
    const double frac = r - static_cast<double>(k);

    // 0-based positions (in sorted order) of the two values to blend. They
    // must follow k, otherwise the weights below are applied to the wrong pair.
    const dim_t idx_lo = (k > 0) ? k - 1 : 0;
    const dim_t idx_up = (k < len) ? k : len - 1;

    double lower = 0;
    double upper = 0;

    if (r >= n / 2.) {          // Second half is smaller: select lower, scan the right part
        std::nth_element(in, in + idx_lo, in + len);        // Complexity O(N)
        lower = in[idx_lo];
        if (idx_up > idx_lo)
            upper = *std::min_element(in + idx_lo + 1, in + len);
        else
            upper = lower;
    }
    else {                      // First half is smaller: select upper, scan the left part
        std::nth_element(in, in + idx_up, in + len);        // Complexity O(N)
        upper = in[idx_up];
        if (idx_lo < idx_up)
            lower = *std::max_element(in, in + idx_up);
        else
            lower = upper;
    }

    // Linear interpolation between the two neighbours.
    range[1] = (0.5 - frac) * lower + (0.5 + frac) * upper;

    range[0] = *std::min_element(in, in + len);
}