How to make loop and append more efficient in R


I have the following code:

library(dplyr)
library(splitstackshape)

datalist = list()
files <- list.files("/WEIGHTS1/Retina", pattern=".RDat", ignore.case=T)

for(i in files) {
  a <- get(load(i))
  names <- rownames(a)
  data <- as.data.frame(cbind(names, a))
  rownames(data) <- NULL
  dd <- na.omit(concat.split.multiple(data = data, split.cols = c("names"), seps = ":"))
  dd <- select(dd, names_1, blup, names_3, names_4)
  colnames(dd) <- c("rsid", "weight", "ref_allele", "eff_allele")
  dd$WGT <- i
  datalist[[i]] <- dd  # add it to your list
}

big_data = do.call(rbind, datalist)

If I load just one .RDat file:

i="retina.ENSG00000135776.wgt.RDat"

a<-get(load(i))

> head(a)
                            top1          blup lasso enet
rs72763981:228705421:C:G   0.9729755  9.376766e-09     0    0
rs144383755:228705758:A:G -0.2748957 -2.093346e-09     0    0
rs1925716                 -0.7398840 -1.993259e-08     0    0
rs1925717:228707734:T:C    0.7258831  1.511376e-08     0    0
rs61827307:228708434:C:A  -0.7834896 -1.625302e-08     0    0
rs61827308:228708526:G:C  -0.7834896 -1.625302e-08     0    0
As you can see, from the "names" column I create 3 separate columns: "rsid", "ref_allele" and "eff_allele". This loop takes a very long time to run. Is there a way to make it faster?
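
For a single rowname, the split the loop performs looks like this (a minimal base-R sketch; the second piece is dropped, matching the select(dd, names_1, blup, names_3, names_4) step above):

parts <- strsplit("rs72763981:228705421:C:G", ":")[[1]]
parts[-2]
# "rs72763981" "C" "G"   -> rsid, ref_allele, eff_allele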

I am trying @akrun's code:

library(parallel)
library(data.table)
library(foreach)
library(doSNOW)

n <-  parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")   
doSNOW::registerDoSNOW(cl)
files <- list.files("/WEIGHTS1/Retina", pattern=".RDat", ignore.case=T)

lst_out <- foreach::foreach(i = seq_along(files), 
              .packages = c("data.table") ) %dopar% {

   a <- get(load(files[i]))
   names <- rownames(a)
   if("blup" %in% colnames(a)) {
     data <- data.table(names, a["blup"])
     nm1 <- c("rsid", "ref_allele", "eff_allele")
     data[,  (nm1) := tstrsplit(names, ":")[-2]]
     out <- data[, .(rsid, weight = blup, ref_allele, eff_allele)][,
               WGT := files[i]][]
    } else {
  
     data <- data.table(names)
     nm1 <- c("rsid", "ref_allele", "eff_allele")
     data[,  (nm1) := tstrsplit(names, ":")[-2]]
     out <- data[, .(rsid,  ref_allele, eff_allele)][,
               WGT := files[i]][]
       }

   return(out)
   rm(data, a)
   gc()
 }

Error in { : task 12 failed - "object 'blup' not found"


big_data <- rbindlist(lst_out)      
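
The failure in task 12 is consistent with that file's loaded object being a matrix rather than a data frame: single-bracket character indexing on a matrix ignores the column names, so a["blup"] returns NA instead of the blup column, and the later .(rsid, weight = blup, ...) step cannot find blup. A minimal sketch of the difference (a toy matrix m standing in for one loaded object):

m <- matrix(c(0.97, 9.4e-09, 0, 0), nrow = 1,
            dimnames = list("rs72763981:228705421:C:G",
                            c("top1", "blup", "lasso", "enet")))
m["blup"]                  # NA: the matrix is indexed like a plain vector, column names are ignored
as.data.frame(m)["blup"]   # a one-column data frame named "blup", as intended

This is the same fix that ends up in the answer below, which wraps the loaded object in as.data.frame().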

There are two ways to make this faster: 1) one option is to use tstrsplit from data.table, and 2) run the per-file work in parallel.
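
As a small illustration of the first point (a sketch using two of the rownames shown above): tstrsplit splits the whole character vector in one call and returns one list element per piece, which data.table can then assign as new columns by reference.

library(data.table)
nms <- c("rs72763981:228705421:C:G", "rs1925716")
tstrsplit(nms, ":")[-2]   # a list of three columns: rsid, ref_allele, eff_allele
# entries with no ":" (like rs1925716) get NA for the allele pieces

The full approach, combining this with parallel execution over the files: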

library(parallel)
library(data.table)
library(foreach)   # needed for the %dopar% operator
n <-  parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")   
doSNOW::registerDoSNOW(cl)
files <- list.files("/WEIGHTS1/Retina", pattern=".RDat", ignore.case=TRUE)

lst_out <- foreach::foreach(i = seq_along(files), 
                  .packages = c("data.table") ) %dopar% {

   tmp <-  as.data.frame(get(load(files[i])))
   a <- data.table::copy(tmp)
   rm(tmp)
   gc()
   
   names <- rownames(a)
   if("blup" %in% colnames(a)) {
     data <- data.table(names, a["blup"])
     nm1 <- c("rsid", "ref_allele", "eff_allele")
     data[,  (nm1) := tstrsplit(names, ":")[-2]]
     out <- data[, .(rsid, weight = blup, ref_allele, eff_allele)][,
               WGT := files[i]][]
    } else {
      
     data <- data.table(names)
     nm1 <- c("rsid", "ref_allele", "eff_allele")
     data[,  (nm1) := tstrsplit(names, ":")[-2]]
     out <- data[, .(rsid,  ref_allele, eff_allele)][,
               WGT := files[i]][]
   }

   return(out)
   rm(data)
   gc()
 }
parallel::stopCluster(cl)

big_data <- rbindlist(lst_out, fill = TRUE)    
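
Because files without a blup column produce list elements with no weight column, fill = TRUE lets rbindlist pad the missing column with NA instead of failing. A minimal sketch (two hypothetical per-file results x and y):

library(data.table)
x <- data.table(rsid = "rs1", weight = 1e-8, ref_allele = "C", eff_allele = "G", WGT = "file1.RDat")
y <- data.table(rsid = "rs2", ref_allele = "A", eff_allele = "T", WGT = "file2.RDat")  # no weight column
rbindlist(list(x, y), fill = TRUE)   # weight is NA in the second row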

Your code just needs a = as.data.frame(a) right after a is loaded; I will just add this line. Thank you so much!

# Reproducible example: sample data taken from the question
a1 <- structure(list(top1 = c(0.9729755, -0.2748957, -0.739884, 0.7258831, 
-0.7834896, -0.7834896), blup = c(9.376766e-09, -2.093346e-09, 
-1.993259e-08, 1.511376e-08, -1.625302e-08, -1.625302e-08), lasso = c(0L, 
0L, 0L, 0L, 0L, 0L), enet = c(0L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c("rs72763981:228705421:C:G", 
"rs144383755:228705758:A:G", "rs1925716", "rs1925717:228707734:T:C", 
"rs61827307:228708434:C:A", "rs61827308:228708526:G:C"))


lst1 <- replicate(16, a1, simplify = FALSE)                         # 16 copies standing in for 16 files
file_nm <- sprintf("retina.ENSG00000%d.wgt.RDat", 135776:135791)   # matching file names

library(foreach)
library(parallel)
library(data.table)
n <-  parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")   
doSNOW::registerDoSNOW(cl)


# The same split/rename steps as in the answer, run in parallel over the simulated list
lst_out <- foreach::foreach(i = seq_along(lst1), 
                  .packages = c("data.table") ) %dopar% {

   a <- lst1[[i]]
   names <- rownames(a)
   data <- data.table(names, a["blup"])
   nm1 <- c("rsid", "ref_allele", "eff_allele")
   data[,  (nm1) := tstrsplit(names, ":")[-2]]
   return(data[, .(rsid, weight = blup, ref_allele, eff_allele)][,
               WGT := file_nm[i]][])
 }
parallel::stopCluster(cl)

big_data <- rbindlist(lst_out) 

dim(big_data)
#[1] 96  5
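
One behavioural difference worth noting (an observation, not part of the original exchange): rownames without ":" separators, such as rs1925716, are kept here with NA alleles because tstrsplit pads short splits with NA, whereas the na.omit() around concat.split.multiple() in the question dropped those rows. If the old behaviour is wanted, they can be filtered out afterwards:

big_data[is.na(ref_allele)]                                          # rows such as rs1925716, kept with NA alleles
big_data <- na.omit(big_data, cols = c("ref_allele", "eff_allele"))  # drop them, as the original na.omit() did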