R中的性能问题_R_Performance_Match

R中的性能问题

r performance

R中的性能问题,r,performance,match,R,Performance,Match,在R中，我试图在大量数据文件中创建元素计数矩阵： rnames <- c("N","A") mymatrix <- matrix(nrow=2,ncol=0,dimnames=list(rnames)) #loop through hundreds of large files (MB) #make the vector "names" contain all elements within each file for(name in names) { #if name is a

在R中，我试图在大量数据文件中创建元素计数矩阵：

# Empty count matrix: two labelled rows ("N" and "A") and no columns yet.
rnames <- c("N", "A")
mymatrix <- matrix(nrow = 2, ncol = 0, dimnames = list(rnames))

# Loop through hundreds of large files (MB).
# The vector "names" is made to contain all elements within each file.
for (name in names) {
  if (!(name %in% colnames(mymatrix))) {
    # Name not seen yet: add a column with the specified name and
    # start its count (second row) at 1.
    mymatrix <- transform(mymatrix, name)
    mymatrix[2, name] <- 1
  } else {
    # Name already present: increment the second row by 1.
    mymatrix[2, name] <- mymatrix[2, name] + 1
  }
}

我不知道你是如何确定 `match` 是瓶颈的。可能确实如此，但您提供的示例并没有显示这一点：
# Reproducible benchmark of the original approach: start from an empty
# 2 x 0 matrix with rows "N" and "A", then tally 1e6 randomly drawn letters
# while the profiler is running.
rnames <- c("N","A")
mymatrix <- matrix(nrow=2, ncol=0, dimnames=list(rnames))
# Fixed seed so the sampled input is repeatable.
set.seed(21)
# 1e6 draws, with replacement, from the lowercase letters.
names <- sample(letters, 1e6, TRUE)
# Start profiling.
Rprof()
for(name in names) {
  if(name %in% colnames(mymatrix)) {
    # A column for this name already exists: increment its count in row 2.
    mymatrix[2,name] <- mymatrix[2,name] + 1
  } else {
    # First occurrence: call transform(), then set the count in row 2 to 1.
    mymatrix <- transform(mymatrix,name)
    mymatrix[2,name] <- 1
  }
}
# Stop profiling.
Rprof(NULL)

避免调用 `transform`，您的代码将大大加快。而且 `mymatrix2` 实际上是一个矩阵，而 `mymatrix` 是一个 data.frame：
# Faster variant of the counting loop: new columns are appended with cbind(),
# so mymatrix2 is built without any call to transform().
rnames <- c("N","A")
mymatrix2 <- matrix(nrow=2, ncol=0, dimnames=list(rnames))
# Fixed seed so the sampled input is repeatable.
set.seed(21)
names <- sample(letters, 1e6, TRUE)
# Start profiling.
Rprof()
for(name in names) {
  # Membership must be tested against mymatrix2, the matrix being built here;
  # testing against a different object breaks the bookkeeping below.
  if(name %in% colnames(mymatrix2)) {
    mymatrix2[2,name] <- mymatrix2[2,name] + 1
  } else {
    # First occurrence: append a column c(NA, 1) named after the element.
    mymatrix2 <- cbind(mymatrix2, matrix(c(NA,1), 2, 1, dimnames=list(rnames, name)))
  }
}
# Stop profiling and show the head of each profile summary component.
Rprof(NULL)
lapply(summaryRprof(), head)
$by.self
                self.time self.pct total.time total.pct
"match"              1.28    41.83       2.70     88.24
"colnames"           0.78    25.49       1.42     46.41
"is.data.frame"      0.58    18.95       0.58     18.95
"%in%"               0.34    11.11       3.04     99.35
"dimnames"           0.06     1.96       0.06      1.96
"+"                  0.02     0.65       0.02      0.65

$by.total
                total.time total.pct self.time self.pct
"%in%"                3.04     99.35      0.34    11.11
"match"               2.70     88.24      1.28    41.83
"colnames"            1.42     46.41      0.78    25.49
"is.data.frame"       0.58     18.95      0.58    18.95
"dimnames"            0.06      1.96      0.06     1.96
"+"                   0.02      0.65      0.02     0.65
identical(mymatrix2, as.matrix(mymatrix))
[1] TRUE

我在你的代码中没有看到 `names`。那应该是 `rnames` 吗？——不，`names` 是另一个不同于 `rnames` 的向量。我在注释中说明了我是如何填充这个向量的，但是如果你想要源代码，我可以提供它：mydataframe……——你能用最少的输入和预期的输出发布一个可复现的例子吗？如果我们不能运行代码，就很难帮助你。——试试看。——我用一段可复现的代码和预期的输出更新了我的问题。但是，只有在读取较大的文件时才会出现性能问题。——谢谢您的详细回答。我注意到，在处理一个文件夹中的文件时，使用 `cbind` 而不是 `transform`，耗时从 7.46s 降到了 0.92s。下一步我将在整个数据集上试用它。
R> lapply(summaryRprof(), head)
$by.self
                 self.time self.pct total.time total.pct
"[<-.data.frame"     12.02    26.15      25.90     56.35
"[.data.frame"        7.22    15.71      13.32     28.98
"match"               7.20    15.67      11.40     24.80
"%in%"                2.38     5.18      12.34     26.85
"anyDuplicated"       2.22     4.83       3.08      6.70
"names"               2.16     4.70       2.16      4.70

$by.total
                 total.time total.pct self.time self.pct
"[<-"                 27.06     58.88      1.16     2.52
"[<-.data.frame"      25.90     56.35     12.02    26.15
"["                   14.32     31.16      1.00     2.18
"[.data.frame"        13.32     28.98      7.22    15.71
"%in%"                12.34     26.85      2.38     5.18
"match"               11.40     24.80      7.20    15.67

$sample.interval
[1] 0.02

$sampling.time
[1] 45.96

# Faster variant of the counting loop: new columns are appended with cbind(),
# so mymatrix2 is built without any call to transform().
rnames <- c("N","A")
mymatrix2 <- matrix(nrow=2, ncol=0, dimnames=list(rnames))
# Fixed seed so the sampled input is repeatable.
set.seed(21)
names <- sample(letters, 1e6, TRUE)
# Start profiling.
Rprof()
for(name in names) {
  # Membership must be tested against mymatrix2, the matrix being built here;
  # testing against a different object breaks the bookkeeping below.
  if(name %in% colnames(mymatrix2)) {
    mymatrix2[2,name] <- mymatrix2[2,name] + 1
  } else {
    # First occurrence: append a column c(NA, 1) named after the element.
    mymatrix2 <- cbind(mymatrix2, matrix(c(NA,1), 2, 1, dimnames=list(rnames, name)))
  }
}
# Stop profiling and show the head of each profile summary component.
Rprof(NULL)
lapply(summaryRprof(), head)
$by.self
                self.time self.pct total.time total.pct
"match"              1.28    41.83       2.70     88.24
"colnames"           0.78    25.49       1.42     46.41
"is.data.frame"      0.58    18.95       0.58     18.95
"%in%"               0.34    11.11       3.04     99.35
"dimnames"           0.06     1.96       0.06      1.96
"+"                  0.02     0.65       0.02      0.65

$by.total
                total.time total.pct self.time self.pct
"%in%"                3.04     99.35      0.34    11.11
"match"               2.70     88.24      1.28    41.83
"colnames"            1.42     46.41      0.78    25.49
"is.data.frame"       0.58     18.95      0.58    18.95
"dimnames"            0.06      1.96      0.06     1.96
"+"                   0.02      0.65      0.02     0.65
identical(mymatrix2, as.matrix(mymatrix))
[1] TRUE