R 超大字符矩阵循环的性能缩放

R 超大字符矩阵循环的性能缩放,r,performance,matrix,parallel-processing,vectorization,R,Performance,Matrix,Parallel Processing,Vectorization,我在R中有一个非常大的字符矩阵,大约[500000,5],包含名称。每行可能包含重复的名称。我想知道每行有多少不同的名字。据我所知,我不能向量化这个循环中的任何函数,对吗 例如: sampleNames <- c("Bob", "Elliot", "Sarah") # Dimensions [100000, 5] mat <- matrix(sampleNames[round(runif(500000, 1, 3))], ncol = 5) NamesPerRow <-

我在R中有一个非常大的字符矩阵,大约[500000,5],包含名称。每行可能包含重复的名称。我想知道每行有多少不同的名字。据我所知,我不能向量化这个循环中的任何函数,对吗

例如:

sampleNames <- c("Bob", "Elliot", "Sarah")


# Dimensions [100000, 5]
mat <- matrix(sampleNames[round(runif(500000, 1, 3))], ncol = 5)

NamesPerRow <- vector()

startTime <- Sys.time()
for(i in 1:dim(mat)[1]){
  NamesPerRow[i] <- length(unique(mat[i,])) 
}
Sys.time() - startTime

sampleNames使用
apply()
它是为矩阵设计的,可以节省大量时间。但在这里,您也可以通过分配返回向量而不是在循环中构建它来节省大量时间

sampleNames <- c("Bob", "Elliot", "Sarah")
# Dimensions [100000, 5]
mat <- matrix(sampleNames[round(runif(500000, 1, 3))], ncol = 5)
现在使用您当前的方法,我们有:

system.time({
  for(i in seq_along(NamesPerRow)) { ## seq_along() also slightly faster
    NamesPerRow[i] <- length(unique(mat[i,])) 
  }
})
#  user  system elapsed 
# 1.144   0.000   1.127 
检查:

identical(NamesPerRow, a)
# [1] TRUE

因此,只需简单地分配向量就可以节省大量时间。

您还可以使用
矩阵状态
包中的
行列表

# Dimensions [500000, 5]
mat <- matrix(sampleNames[round(runif(2500000, 1, 3))], ncol = 5)
library(matrixStats)
startTime <- Sys.time()
mat1 <- matrix(match(mat, sampleNames), ncol=5)
b <- rowSums(rowTabulates(mat1)!=0)
Sys.time() - startTime
# Time difference of 0.2012889 secs

为了提供第三个建议,您可以使用Rcpp:

library('Rcpp');

sampleNames <- c('Bob','Elliot','Sarah');
set.seed(1); mat <- matrix(sampleNames[round(runif(2500000,1,3))],ncol=5); ## 500000x5
head(mat);
##      [,1]     [,2]     [,3]     [,4]     [,5]
## [1,] "Elliot" "Elliot" "Bob"    "Elliot" "Elliot"
## [2,] "Elliot" "Sarah"  "Elliot" "Sarah"  "Elliot"
## [3,] "Elliot" "Elliot" "Elliot" "Bob"    "Bob"
## [4,] "Sarah"  "Bob"    "Bob"    "Sarah"  "Sarah"
## [5,] "Bob"    "Elliot" "Bob"    "Bob"    "Bob"
## [6,] "Sarah"  "Bob"    "Elliot" "Elliot" "Elliot"
cppFunction('
    IntegerVector distinctByRow(IntegerMatrix mat) {
        IntegerVector res(mat.nrow());
        if (mat.ncol() == 0) return res;
        std::vector<int> buf(mat.ncol());
        for (size_t r = 0; r < mat.nrow(); ++r) {
            IntegerMatrix::Row row = mat.row(r);
            buf.assign(row.begin(),row.end());
            std::sort(buf.begin(),buf.end());
            int count = 1;
            for (size_t c = 1; c < mat.ncol(); ++c)
                if (buf[c] != buf[c-1])
                    ++count;
            res(r) = count;
        }
        return res;
    }
');
res.rcpp <- distinctByRow(matrix(match(mat,sampleNames),nrow(mat)));
head(res.rcpp);
## [1] 2 2 2 2 2 3
最有趣的部分是结尾处的
for
循环;它们实际上是在循环输入中的每个唯一值,并获取行计数!我意识到,如果输入中有许多唯一的值,这可能会导致函数表现出较差的性能,而不像OP的示例数据那样,我们只有三个。因此,我做了另一个性能测试,这次使用了1000个唯一值,我还决定尝试使用更少的行和更多的列。正如你所看到的,结果与我上面得到的相反。这确实说明了算法的行为会因您向它们抛出的数据而异

## data 2 -- more names and columns
rstr <- function(N,charset=letters,lf=function(N) runif(N,trunc(lmin)-0.5,trunc(lmax)+0.5),lmin=1,lmax=10) {
    charset <- as.character(charset);
    len <- sort(as.integer(round(pmin(lmax,pmax(lmin,lf(N))))));
    rl <- rle(len);
    sample(do.call(c,Map(function(len,num) if (len == 0) rep('',num) else do.call(paste0,as.data.frame(matrix(sample(charset,len*num,replace=T),num))), rl$values, rl$lengths )));
};
set.seed(1); N <- 1e3; sampleNames <- rstr(N);
head(sampleNames);
## [1] "wcbzjxq"    "etxjz"      "ompognqack" "eufkli"     "rworpwkk"   "ghw"
mat <- matrix(sample(sampleNames,2500000,replace=T),ncol=500); ## 5000x500
head(mat[,1:6]);
##      [,1]       [,2]       [,3]        [,4]         [,5]      [,6]
## [1,] "qgrb"     "gb"       "pmiula"    "wrx"        "yr"      "kejil"
## [2,] "ivaqaaek" "alen"     "woenvkgkh" "zkocecowl"  "mjgv"    "ejqks"
## [3,] "nvz"      "yr"       "kyxmjjrnn" "vfzc"       "tnm"     "cnw"
## [4,] "ut"       "jgexsepo" "jh"        "ejqks"      "iy"      "galtchwmh"
## [5,] "ppxe"     "bnpqxbj"  "nvz"       "ruulsigdzq" "hpuw"    "rjsofvjev"
## [6,] "bdoxqim"  "qr"       "mgkkku"    "agjdgjhv"   "bdoxqim" "bdoxqim"

## proof of correctness 2
all.equal(f.loop.grow(mat),f.loop.prealloc(mat));
## [1] TRUE
all.equal(f.loop.prealloc(mat),f.apply(mat));
## [1] TRUE
all.equal(f.apply(mat),f.rowtab(mat));
## [1] TRUE
all.equal(f.rowtab(mat),f.rcpp(mat));
## [1] TRUE

## timing 2
microbenchmark(f.loop.grow(mat),f.loop.prealloc(mat),f.apply(mat),f.rowtab(mat),f.rcpp(mat),times=3L);
## Unit: milliseconds
##                  expr       min        lq      mean    median        uq       max neval
##      f.loop.grow(mat)  153.3568  157.6669  167.5521  161.9770  174.6497  187.3223     3
##  f.loop.prealloc(mat)  141.1644  142.8239  144.1546  144.4834  145.6497  146.8159     3
##          f.apply(mat)  166.2976  177.0187  195.1381  187.7397  209.5583  231.3770     3
##         f.rowtab(mat) 2590.8117 2623.3600 2665.5511 2655.9082 2702.9207 2749.9333     3
##           f.rcpp(mat)  197.6206  197.7765  202.5478  197.9324  205.0113  212.0903     3
##数据2——更多名称和列

rstr哇,这将时间从13+分钟减少到14秒!这是一个500000 x 5的矩阵。谢谢@新南威尔士州-请查看我的更新,以及其他新答案
matrix(unclass(factor(mat)),nrow(mat))
占用了
Rcpp
解决方案的大部分时间,否则它应该非常快。谢谢@experiator!我没有意识到我的
factor()
调用对性能有多大的负面影响。在通过窃取您的优秀
匹配(mat,sampleNames)
想法改进我的解决方案后,我现在发现我的Rcpp解决方案在团队中表现最好。谢谢大家!+你的答案是:)
# Dimensions [500000, 5]
mat <- matrix(sampleNames[round(runif(2500000, 1, 3))], ncol = 5)
library(matrixStats)
startTime <- Sys.time()
mat1 <- matrix(match(mat, sampleNames), ncol=5)
b <- rowSums(rowTabulates(mat1)!=0)
Sys.time() - startTime
# Time difference of 0.2012889 secs
startTime <- Sys.time()
a <- apply(mat, 1, function(x) length(unique(x)))
Sys.time() - startTime
# Time difference of 4.231503 secs
all.equal(a, b)
# [1] TRUE
library('Rcpp');

sampleNames <- c('Bob','Elliot','Sarah');
set.seed(1); mat <- matrix(sampleNames[round(runif(2500000,1,3))],ncol=5); ## 500000x5
head(mat);
##      [,1]     [,2]     [,3]     [,4]     [,5]
## [1,] "Elliot" "Elliot" "Bob"    "Elliot" "Elliot"
## [2,] "Elliot" "Sarah"  "Elliot" "Sarah"  "Elliot"
## [3,] "Elliot" "Elliot" "Elliot" "Bob"    "Bob"
## [4,] "Sarah"  "Bob"    "Bob"    "Sarah"  "Sarah"
## [5,] "Bob"    "Elliot" "Bob"    "Bob"    "Bob"
## [6,] "Sarah"  "Bob"    "Elliot" "Elliot" "Elliot"
cppFunction('
    IntegerVector distinctByRow(IntegerMatrix mat) {
        IntegerVector res(mat.nrow());
        if (mat.ncol() == 0) return res;
        std::vector<int> buf(mat.ncol());
        for (size_t r = 0; r < mat.nrow(); ++r) {
            IntegerMatrix::Row row = mat.row(r);
            buf.assign(row.begin(),row.end());
            std::sort(buf.begin(),buf.end());
            int count = 1;
            for (size_t c = 1; c < mat.ncol(); ++c)
                if (buf[c] != buf[c-1])
                    ++count;
            res(r) = count;
        }
        return res;
    }
');
res.rcpp <- distinctByRow(matrix(match(mat,sampleNames),nrow(mat)));
head(res.rcpp);
## [1] 2 2 2 2 2 3
## libs
library('Rcpp');
library('matrixStats');

## funcs
f.loop.grow <- function(mat) { res <- vector(); for (i in seq_len(nrow(mat))) res[i] <- length(unique(mat[i,])); res; };
f.loop.prealloc <- function(mat) { res <- vector('integer',nrow(mat)); for (i in seq_len(nrow(mat))) res[i] <- length(unique(mat[i,])); res; };
f.apply <- function(mat) apply(mat,1,function(x) length(unique(x)));
f.rowtab <- function(mat) rowSums(rowTabulates(matrix(match(mat,sampleNames),nrow(mat))) != 0L);
f.rcpp <- function(mat) distinctByRow(matrix(match(mat,sampleNames),nrow(mat)));

## data
sampleNames <- c('Bob','Elliot','Sarah');
set.seed(1); mat <- matrix(sampleNames[round(runif(2500000,1,3))],ncol=5); ## 500000x5

## proof of correctness
all.equal(f.loop.grow(mat),f.loop.prealloc(mat));
## [1] TRUE
all.equal(f.loop.prealloc(mat),f.apply(mat));
## [1] TRUE
all.equal(f.apply(mat),f.rowtab(mat));
## [1] TRUE
all.equal(f.rowtab(mat),f.rcpp(mat));
## [1] TRUE

## timing
microbenchmark(f.loop.grow(mat),f.loop.prealloc(mat),f.apply(mat),f.rowtab(mat),f.rcpp(mat),times=3L);
## Unit: milliseconds
##                  expr        min         lq        mean      median          uq         max neval
##      f.loop.grow(mat) 96624.4954 99011.9452 100625.0517 101399.3950 102625.3299 103851.2648     3
##  f.loop.prealloc(mat)  3572.0831  3574.6325   3616.9598   3577.1820   3639.3982   3701.6145     3
##          f.apply(mat)  3329.4926  3410.6111   3486.2511   3491.7296   3564.6304   3637.5311     3
##         f.rowtab(mat)   259.8664   288.6030    299.2716    317.3395    318.9742    320.6089     3
##           f.rcpp(mat)   122.1257   124.6957    163.4774    127.2657    184.1532    241.0407     3
rowTabulates <- function(x, values=NULL, ...) {
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Validate arguments
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Argument 'x':
    if (is.integer(x)) {
    } else if (is.raw(x)) {
    } else {
        stop("Argument 'x' is not of type integer or raw: ", class(x)[1]);
    }

    # Argument 'values':
    if (is.null(values)) {
        values <- as.vector(x);
        values <- unique(values);
        if (is.raw(values)) {
            values <- as.integer(values);
            values <- sort(values);
            # WORKAROUND: Cannot use "%#x" because it gives an error OSX with
            # R v2.9.0 devel (2009-01-13 r47593b) at R-forge. /HB 2009-06-20
            names <- sprintf("%x", values);
            names <- paste("0x", names, sep="");
            values <- as.raw(values);
        } else {
            values <- sort(values);
            names <- as.character(values);
        }
    } else {
        if (is.raw(values)) {
            names <- sprintf("%x", as.integer(values));
            names <- paste("0x", names, sep="");
        } else {
            names <- as.character(values);
        }
    }


    nbrOfValues <- length(values);
    counts <- matrix(0L, nrow=nrow(x), ncol=nbrOfValues);
    colnames(counts) <- names;

    for (kk in seq(length=nbrOfValues)) {
        counts[,kk] <- rowCounts(x, value=values[kk], ...);
    }

    counts;
}
## data 2 -- more names and columns
rstr <- function(N,charset=letters,lf=function(N) runif(N,trunc(lmin)-0.5,trunc(lmax)+0.5),lmin=1,lmax=10) {
    charset <- as.character(charset);
    len <- sort(as.integer(round(pmin(lmax,pmax(lmin,lf(N))))));
    rl <- rle(len);
    sample(do.call(c,Map(function(len,num) if (len == 0) rep('',num) else do.call(paste0,as.data.frame(matrix(sample(charset,len*num,replace=T),num))), rl$values, rl$lengths )));
};
set.seed(1); N <- 1e3; sampleNames <- rstr(N);
head(sampleNames);
## [1] "wcbzjxq"    "etxjz"      "ompognqack" "eufkli"     "rworpwkk"   "ghw"
mat <- matrix(sample(sampleNames,2500000,replace=T),ncol=500); ## 5000x500
head(mat[,1:6]);
##      [,1]       [,2]       [,3]        [,4]         [,5]      [,6]
## [1,] "qgrb"     "gb"       "pmiula"    "wrx"        "yr"      "kejil"
## [2,] "ivaqaaek" "alen"     "woenvkgkh" "zkocecowl"  "mjgv"    "ejqks"
## [3,] "nvz"      "yr"       "kyxmjjrnn" "vfzc"       "tnm"     "cnw"
## [4,] "ut"       "jgexsepo" "jh"        "ejqks"      "iy"      "galtchwmh"
## [5,] "ppxe"     "bnpqxbj"  "nvz"       "ruulsigdzq" "hpuw"    "rjsofvjev"
## [6,] "bdoxqim"  "qr"       "mgkkku"    "agjdgjhv"   "bdoxqim" "bdoxqim"

## proof of correctness 2
all.equal(f.loop.grow(mat),f.loop.prealloc(mat));
## [1] TRUE
all.equal(f.loop.prealloc(mat),f.apply(mat));
## [1] TRUE
all.equal(f.apply(mat),f.rowtab(mat));
## [1] TRUE
all.equal(f.rowtab(mat),f.rcpp(mat));
## [1] TRUE

## timing 2
microbenchmark(f.loop.grow(mat),f.loop.prealloc(mat),f.apply(mat),f.rowtab(mat),f.rcpp(mat),times=3L);
## Unit: milliseconds
##                  expr       min        lq      mean    median        uq       max neval
##      f.loop.grow(mat)  153.3568  157.6669  167.5521  161.9770  174.6497  187.3223     3
##  f.loop.prealloc(mat)  141.1644  142.8239  144.1546  144.4834  145.6497  146.8159     3
##          f.apply(mat)  166.2976  177.0187  195.1381  187.7397  209.5583  231.3770     3
##         f.rowtab(mat) 2590.8117 2623.3600 2665.5511 2655.9082 2702.9207 2749.9333     3
##           f.rcpp(mat)  197.6206  197.7765  202.5478  197.9324  205.0113  212.0903     3