R:如何让循环和追加(append)操作更高效
标签:r, for-loop
我有以下代码:
library(dplyr)
library(splitstackshape)

# Build one long table of SNP weights from every *.RDat file in the folder.
# The row names of each loaded object look like "rsid:position:ref:eff";
# they are split on ":" and the rsid, the blup weight and the two alleles
# are kept, together with the name of the file they came from.

# full.names = TRUE so that load() works from any working directory; the dot
# in the pattern is escaped so it matches a literal "." and not any character.
files <- list.files("/WEIGHTS1/Retina", pattern = "\\.RDat",
                    ignore.case = TRUE, full.names = TRUE)

datalist <- list()
for (i in files) {
  a <- get(load(i))
  # data.frame() (instead of cbind()) keeps numeric columns numeric even when
  # `a` is a matrix; cbind() with a character vector would turn every value
  # into a string.
  data <- data.frame(names = rownames(a), a, row.names = NULL,
                     check.names = FALSE, stringsAsFactors = FALSE)
  # na.omit() drops every row containing an NA, e.g. rows whose name has no
  # ":"-separated allele fields.
  dd <- na.omit(
    concat.split.multiple(data = data, split.cols = c("names"), seps = ":")
  )
  dd <- select(dd, names_1, blup, names_3, names_4)
  colnames(dd) <- c("rsid", "weight", "ref_allele", "eff_allele")
  # Tag rows with the bare file name, as before, not the full path.
  dd$WGT <- basename(i)
  datalist[[basename(i)]] <- dd # add it to your list
}
big_data <- do.call(rbind, datalist)
另外,如果我只加载一个 .RDat 文件:
i="retina.ENSG00000135776.wgt.RDat"
a<-get(load(i))
> head(a)
top1 blup lasso enet
rs72763981:228705421:C:G 0.9729755 9.376766e-09 0 0
rs144383755:228705758:A:G -0.2748957 -2.093346e-09 0 0
rs1925716 -0.7398840 -1.993259e-08 0 0
rs1925717:228707734:T:C 0.7258831 1.511376e-08 0 0
rs61827307:228708434:C:A -0.7834896 -1.625302e-08 0 0
rs61827308:228708526:G:C -0.7834896 -1.625302e-08 0 0
i = "retina.ENSG00000135776.wgt.RDat"
a <- get(load(i))
head(a)
top1 blup lasso enet
rs72763981:228705421:C:G 0.9729755 9.376766e-09 0 0
rs144383755:228705758:A:G -0.2748957 -2.093346e-09 0 0
rs1925716 -0.7398840 -1.993259e-08 0 0
rs1925717:228707734:T:C 0.7258831 1.511376e-08 0 0
rs61827307:228708434:C:A -0.7834896 -1.625302e-08 0 0
rs61827308:228708526:G:C -0.7834896 -1.625302e-08 0 0
可以看到,我把 "names" 列拆分成了 3 个单独的列:"rsid"、"ref_allele" 和 "eff_allele"。
这个循环需要很长时间才能执行。有没有办法让它更快?
我正在尝试 @akrun 的代码:
# Attempt at a parallel version: each worker loads one weights file, splits
# its row names on ":" into rsid / ref_allele / eff_allele (dropping the
# second piece), and returns a data.table tagged with the file name.
library(parallel)
library(data.table)
library(foreach)
library(doSNOW)
# One socket worker per detected core, registered as the %dopar% backend.
# NOTE(review): this snippet never calls stopCluster(cl).
n <- parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")
doSNOW::registerDoSNOW(cl)
# Without full.names = TRUE these are bare file names, so load() below
# resolves them against the working directory.
files <- list.files("/WEIGHTS1/Retina", pattern=".RDat", ignore.case=T)
lst_out <- foreach::foreach(i = seq_along(files),
.packages = c("data.table") ) %dopar% {
a <- get(load(files[i]))
names <- rownames(a)
# Files that have a "blup" column also return it, renamed to `weight`.
if("blup" %in% colnames(a)) {
# NOTE(review): a["blup"] selects a column only when `a` is a data.frame.
# If `a` is a matrix this presumably yields a single unnamed value, so the
# table has no `blup` column -- a likely cause of the error shown below.
# TODO confirm against the actual files.
data <- data.table(names, a["blup"])
nm1 <- c("rsid", "ref_allele", "eff_allele")
# tstrsplit() returns one vector per ":"-separated piece; [-2] drops the
# second piece.
data[, (nm1) := tstrsplit(names, ":")[-2]]
out <- data[, .(rsid, weight = blup, ref_allele, eff_allele)][,
WGT := files[i]][]
} else {
data <- data.table(names)
nm1 <- c("rsid", "ref_allele", "eff_allele")
data[, (nm1) := tstrsplit(names, ":")[-2]]
out <- data[, .(rsid, ref_allele, eff_allele)][,
WGT := files[i]][]
}
return(out)
# NOTE(review): the two statements below come after return(out), so they are
# presumably never executed.
rm(data, a)
gc()
}
# The next line is the error message reported when running the code above;
# it is console output, not R code.
Error in { : task 12 failed - "object 'blup' not found"
# Stack the per-file tables into one.
big_data <- rbindlist(lst_out)
library(parallel)
library(data.table)
library(foreach)
library(doSNOW)
有两种方法可以加快这一过程:1)使用 data.table 中的 tstrsplit;2)使用并行处理。
library(parallel)
library(data.table)
library(foreach) # attaches the %dopar% operator used below

# Parse every weights file in parallel and stack the results into one table.
# The row names of each loaded object look like "rsid:position:ref:eff"; they
# are split on ":" and the position is dropped. Files that have a "blup"
# column also contribute it as `weight`.

# One socket worker per detected core, registered as the %dopar% backend.
n <- parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")
doSNOW::registerDoSNOW(cl)

# full.names = TRUE so that load() works from any working directory; the dot
# in the pattern is escaped so it matches a literal "." and not any character.
files <- list.files("/WEIGHTS1/Retina", pattern = "\\.RDat",
                    ignore.case = TRUE, full.names = TRUE)

# tryCatch(finally = ) shuts the workers down even when a task fails.
lst_out <- tryCatch(
  foreach::foreach(i = seq_along(files),
                   .packages = c("data.table")) %dopar% {
    # as.data.frame() makes column access by name work whether the stored
    # object is a matrix or a data.frame.
    a <- as.data.frame(get(load(files[i])))
    data <- data.table(names = rownames(a))
    nm1 <- c("rsid", "ref_allele", "eff_allele")
    # [-2] drops the second ":"-separated piece (the position).
    data[, (nm1) := tstrsplit(names, ":", fixed = TRUE)[-2]]
    if ("blup" %in% colnames(a)) {
      data[, weight := a[["blup"]]]
      out <- data[, .(rsid, weight, ref_allele, eff_allele)]
    } else {
      out <- data[, .(rsid, ref_allele, eff_allele)]
    }
    # Tag rows with the bare file name rather than the full path.
    out[, WGT := basename(files[i])]
    out
  },
  finally = parallel::stopCluster(cl)
)

# fill = TRUE because tables from files without "blup" lack the weight column.
big_data <- rbindlist(lst_out, fill = TRUE)
library(parallel)
library(data.table)
有两种方法可以加快这一过程:1)使用 data.table 中的 tstrsplit;2)使用并行处理。
library(parallel)
library(data.table)
library(foreach) # attaches the %dopar% operator used below

# Parse every weights file in parallel and stack the results into one table.
# The row names of each loaded object look like "rsid:position:ref:eff"; they
# are split on ":" and the position is dropped. Files that have a "blup"
# column also contribute it as `weight`.

# One socket worker per detected core, registered as the %dopar% backend.
n <- parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")
doSNOW::registerDoSNOW(cl)

# full.names = TRUE so that load() works from any working directory; the dot
# in the pattern is escaped so it matches a literal "." and not any character.
files <- list.files("/WEIGHTS1/Retina", pattern = "\\.RDat",
                    ignore.case = TRUE, full.names = TRUE)

# tryCatch(finally = ) shuts the workers down even when a task fails.
lst_out <- tryCatch(
  foreach::foreach(i = seq_along(files),
                   .packages = c("data.table")) %dopar% {
    # as.data.frame() makes column access by name work whether the stored
    # object is a matrix or a data.frame.
    a <- as.data.frame(get(load(files[i])))
    data <- data.table(names = rownames(a))
    nm1 <- c("rsid", "ref_allele", "eff_allele")
    # [-2] drops the second ":"-separated piece (the position).
    data[, (nm1) := tstrsplit(names, ":", fixed = TRUE)[-2]]
    if ("blup" %in% colnames(a)) {
      data[, weight := a[["blup"]]]
      out <- data[, .(rsid, weight, ref_allele, eff_allele)]
    } else {
      out <- data[, .(rsid, ref_allele, eff_allele)]
    }
    # Tag rows with the bare file name rather than the full path.
    out[, WGT := basename(files[i])]
    out
  },
  finally = parallel::stopCluster(cl)
)

# fill = TRUE because tables from files without "blup" lack the weight column.
big_data <- rbindlist(lst_out, fill = TRUE)
library(parallel)
library(data.table)
评论不用于扩展讨论;此对话已结束。
您的代码只需要在加载 a 之后加上 a = as.data.frame(a),我只添加这一行。非常感谢!
library(parallel)
library(data.table)
library(foreach) # attaches the %dopar% operator used below

# Parse every weights file in parallel and stack the results into one table.
# The row names of each loaded object look like "rsid:position:ref:eff"; they
# are split on ":" and the position is dropped. Files that have a "blup"
# column also contribute it as `weight`.

# One socket worker per detected core, registered as the %dopar% backend.
n <- parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")
doSNOW::registerDoSNOW(cl)

# full.names = TRUE so that load() works from any working directory; the dot
# in the pattern is escaped so it matches a literal "." and not any character.
files <- list.files("/WEIGHTS1/Retina", pattern = "\\.RDat",
                    ignore.case = TRUE, full.names = TRUE)

# tryCatch(finally = ) shuts the workers down even when a task fails.
lst_out <- tryCatch(
  foreach::foreach(i = seq_along(files),
                   .packages = c("data.table")) %dopar% {
    # as.data.frame() makes column access by name work whether the stored
    # object is a matrix or a data.frame.
    a <- as.data.frame(get(load(files[i])))
    data <- data.table(names = rownames(a))
    nm1 <- c("rsid", "ref_allele", "eff_allele")
    # [-2] drops the second ":"-separated piece (the position).
    data[, (nm1) := tstrsplit(names, ":", fixed = TRUE)[-2]]
    if ("blup" %in% colnames(a)) {
      data[, weight := a[["blup"]]]
      out <- data[, .(rsid, weight, ref_allele, eff_allele)]
    } else {
      out <- data[, .(rsid, ref_allele, eff_allele)]
    }
    # Tag rows with the bare file name rather than the full path.
    out[, WGT := basename(files[i])]
    out
  },
  finally = parallel::stopCluster(cl)
)

# fill = TRUE because tables from files without "blup" lack the weight column.
big_data <- rbindlist(lst_out, fill = TRUE)
# Toy weights table with the same layout as one loaded .RDat object: four
# columns, with SNP identifiers ("rsid:position:ref:eff" or a bare rsid) as
# row names.
a1 <- data.frame(
  top1 = c(0.9729755, -0.2748957, -0.739884, 0.7258831, -0.7834896,
           -0.7834896),
  blup = c(9.376766e-09, -2.093346e-09, -1.993259e-08, 1.511376e-08,
           -1.625302e-08, -1.625302e-08),
  lasso = rep(0L, 6),
  enet = rep(0L, 6),
  row.names = c(
    "rs72763981:228705421:C:G",
    "rs144383755:228705758:A:G",
    "rs1925716",
    "rs1925717:228707734:T:C",
    "rs61827307:228708434:C:A",
    "rs61827308:228708526:G:C"
  )
)

# Sixteen identical copies stand in for sixteen loaded files ...
lst1 <- rep(list(a1), 16)

# ... and sixteen consecutive gene ids give the matching file names.
file_nm <- sprintf("retina.ENSG00000%d.wgt.RDat", seq(135776L, 135791L))
library(foreach)
library(parallel)
library(data.table)

# Start one socket worker per detected core and register the cluster as the
# backend for %dopar%.
n <- parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")
doSNOW::registerDoSNOW(cl)

# For each toy table: split the row names on ":", drop the second piece,
# keep the blup column as `weight`, and tag the rows with the file name.
lst_out <- foreach::foreach(
  i = seq_along(lst1),
  .packages = c("data.table")
) %dopar% {
  tbl <- lst1[[i]]
  parsed <- data.table(names = rownames(tbl), tbl["blup"])
  parsed[
    ,
    c("rsid", "ref_allele", "eff_allele") := tstrsplit(names, ":")[-2]
  ]
  res <- parsed[, .(rsid, weight = blup, ref_allele, eff_allele)]
  res[, WGT := file_nm[i]]
  res
}

# Release the workers, then stack the per-table results.
parallel::stopCluster(cl)
big_data <- rbindlist(lst_out)
dim(big_data)
#[1] 96 5