R中的性能问题
标签:r, performance, match
在R中,我试图在大量数据文件中创建元素计数矩阵:
# Count how often each element name occurs. Counts are kept in the second row
# ("A") of a two-row matrix that gains one column per distinct name.
rnames <- c("N","A")
mymatrix <- matrix(nrow=2,ncol=0,dimnames=list(rnames))
# Loop through hundreds of large (MB-sized) files.
# The vector `names` is expected to hold all elements found in each file; it is
# not defined in this snippet and must be filled in before the loop runs.
for(name in names)
{
# If the name already has a column, increment its count in the second row.
if(name %in% colnames(mymatrix))
{
mymatrix[2,name] = mymatrix[2,name]+1
}
# Otherwise add a column for the new name and start its count at 1.
else
{
# NOTE(review): transform() returns a data.frame, so from here on `mymatrix`
# is no longer a plain matrix and later indexing uses the data.frame methods.
mymatrix <- transform(mymatrix,name)
mymatrix[2,name] = 1
}
}
我不知道你是如何确定 `match` 是瓶颈的。可能确实如此,但您提供的示例并没有显示这一点:
# Reproducible version of the question's transform()-based counting loop,
# run under the sampling profiler.
rnames <- c("N","A")
mymatrix <- matrix(nrow=2, ncol=0, dimnames=list(rnames))
# Fixed seed so the sampled input is the same on every run.
set.seed(21)
# One million letters drawn with replacement stand in for the file contents.
names <- sample(letters, 1e6, TRUE)
# Start profiling.
Rprof()
for(name in names) {
# Known name: increment its count in the second row.
if(name %in% colnames(mymatrix)) {
mymatrix[2,name] <- mymatrix[2,name] + 1
} else {
# New name: transform() turns `mymatrix` into a data.frame, then the
# assignment below creates the column and sets its count to 1.
mymatrix <- transform(mymatrix,name)
mymatrix[2,name] <- 1
}
}
# Stop profiling.
Rprof(NULL)
避免调用 `transform`,您的代码将大大加快。此外,`mymatrix2` 实际上是一个矩阵,而 `mymatrix` 是一个 data.frame:
# Same counting loop as the transform() version, but new columns are appended
# with cbind(), so `mymatrix2` stays a plain numeric matrix throughout.
rnames <- c("N", "A")
mymatrix2 <- matrix(nrow = 2, ncol = 0, dimnames = list(rnames))
# Fixed seed so the sampled input is the same on every run.
set.seed(21)
names <- sample(letters, 1e6, TRUE)
# Start profiling.
Rprof()
for (name in names) {
  # The membership test must look at the matrix being built (`mymatrix2`).
  # Testing against another object either fails when that object does not
  # exist or sends the first occurrence of a name down the increment branch.
  if (name %in% colnames(mymatrix2)) {
    mymatrix2[2, name] <- mymatrix2[2, name] + 1
  } else {
    # First occurrence: append a column with NA in row "N" and 1 in row "A".
    mymatrix2 <- cbind(
      mymatrix2,
      matrix(c(NA, 1), 2, 1, dimnames = list(rnames, name))
    )
  }
}
# Stop profiling.
Rprof(NULL)
lapply(summaryRprof(), head)
$by.self
self.time self.pct total.time total.pct
"match" 1.28 41.83 2.70 88.24
"colnames" 0.78 25.49 1.42 46.41
"is.data.frame" 0.58 18.95 0.58 18.95
"%in%" 0.34 11.11 3.04 99.35
"dimnames" 0.06 1.96 0.06 1.96
"+" 0.02 0.65 0.02 0.65
$by.total
total.time total.pct self.time self.pct
"%in%" 3.04 99.35 0.34 11.11
"match" 2.70 88.24 1.28 41.83
"colnames" 1.42 46.41 0.78 25.49
"is.data.frame" 0.58 18.95 0.58 18.95
"dimnames" 0.06 1.96 0.06 1.96
"+" 0.02 0.65 0.02 0.65
identical(mymatrix2, as.matrix(mymatrix))
[1] TRUE
我在你的代码中没有看到 `names`。那应该是 `rnames` 吗?
不,`names` 是另一个不同于 `rnames` 的向量。我在注释中说明了如何填充这个向量,但如果你想要源代码,我可以提供:mydataframe …
你能用最少的输入和预期的输出发布一个可复现的例子吗?如果我们不能运行代码,就很难帮助你。试试看。
我用一段可复现的代码和预期的输出更新了我的问题。但是,只有在读取较大的文件时才会出现性能问题。
谢谢您的详细解答。我注意到,在处理一个文件夹中的文件时,用 cbind 代替 transform 后,耗时从 7.46s 降到了 0.92s。下一步我将在整个数据集上试用它。
R> lapply(summaryRprof(), head)
$by.self
self.time self.pct total.time total.pct
"[<-.data.frame" 12.02 26.15 25.90 56.35
"[.data.frame" 7.22 15.71 13.32 28.98
"match" 7.20 15.67 11.40 24.80
"%in%" 2.38 5.18 12.34 26.85
"anyDuplicated" 2.22 4.83 3.08 6.70
"names" 2.16 4.70 2.16 4.70
$by.total
total.time total.pct self.time self.pct
"[<-" 27.06 58.88 1.16 2.52
"[<-.data.frame" 25.90 56.35 12.02 26.15
"[" 14.32 31.16 1.00 2.18
"[.data.frame" 13.32 28.98 7.22 15.71
"%in%" 12.34 26.85 2.38 5.18
"match" 11.40 24.80 7.20 15.67
$sample.interval
[1] 0.02
$sampling.time
[1] 45.96
# Counting loop that appends new columns with cbind(), so `mymatrix2` stays a
# plain numeric matrix instead of becoming a data.frame.
rnames <- c("N", "A")
mymatrix2 <- matrix(nrow = 2, ncol = 0, dimnames = list(rnames))
# Fixed seed so the sampled input is the same on every run.
set.seed(21)
names <- sample(letters, 1e6, TRUE)
# Start profiling.
Rprof()
for (name in names) {
  # The membership test must look at the matrix being built (`mymatrix2`).
  # Testing against another object either fails when that object does not
  # exist or sends the first occurrence of a name down the increment branch.
  if (name %in% colnames(mymatrix2)) {
    mymatrix2[2, name] <- mymatrix2[2, name] + 1
  } else {
    # First occurrence: append a column with NA in row "N" and 1 in row "A".
    mymatrix2 <- cbind(
      mymatrix2,
      matrix(c(NA, 1), 2, 1, dimnames = list(rnames, name))
    )
  }
}
# Stop profiling.
Rprof(NULL)
lapply(summaryRprof(), head)
$by.self
self.time self.pct total.time total.pct
"match" 1.28 41.83 2.70 88.24
"colnames" 0.78 25.49 1.42 46.41
"is.data.frame" 0.58 18.95 0.58 18.95
"%in%" 0.34 11.11 3.04 99.35
"dimnames" 0.06 1.96 0.06 1.96
"+" 0.02 0.65 0.02 0.65
$by.total
total.time total.pct self.time self.pct
"%in%" 3.04 99.35 0.34 11.11
"match" 2.70 88.24 1.28 41.83
"colnames" 1.42 46.41 0.78 25.49
"is.data.frame" 0.58 18.95 0.58 18.95
"dimnames" 0.06 1.96 0.06 1.96
"+" 0.02 0.65 0.02 0.65
identical(mymatrix2, as.matrix(mymatrix))
[1] TRUE