R 扩展数据框以将行名与名称列表匹配的最快方法

R 扩展数据框以将行名与名称列表匹配的最快方法,r,performance,dataframe,R,Performance,Dataframe,我的目标是通过以下方式扩展具有20万列的大型数据帧: 我有一个按特定顺序排列的名称列表,我想扩展我的原始数据框,它有一些与此名称列表相同的行名,这样扩展的数据框有一个列,其中的条目与名称列表中的条目完全相同,并且在数据框中缺少值的地方,它应该有零 最小可复制示例: entities<-c("C1","C3","C4","C5","E1","E2") list_of_nam

我的目标是通过以下方式扩展具有20万列的大型数据帧:

我有一个按特定顺序排列的名称列表,我想扩展我的原始数据框,它有一些与此名称列表相同的行名,这样扩展的数据框有一个列,其中的条目与名称列表中的条目完全相同,并且在数据框中缺少值的地方,它应该有零

最小可复制示例:

# Minimal reproducible example: the ordered target names, and a 3 x 10 data
# frame whose row names cover only three of them.
entities <- c("C1", "C3", "C4", "C5", "E1", "E2")
list_of_names <- data.frame(entities = entities)
df <- data.frame(
  matrix(runif(3 * 10), nrow = 3, ncol = 10),
  row.names = c("E1", "C1", "C4")
)


> df
          X1        X2         X3        X4        X5           X6        X7        X8         X9        X10
E1 0.6228159 0.0744723 0.34010018 0.5720812 0.3179460 0.2405663696 0.6198333 0.9367545 0.95729614 0.10671495
C1 0.2435715 0.4046881 0.01714992 0.7806331 0.1113826 0.0003340805 0.1747583 0.5234170 0.07591773 0.76162701
C4 0.3804787 0.8364844 0.85255672 0.8909652 0.0665782 0.7447360640 0.3022620 0.2111203 0.72338528 0.04510692

我建议采用下面的
dplyr
方法。众所周知,
merge()
可能很慢,但用于合并的
dplyr
函数速度更快。代码如下:

library(dplyr)

# Data: the ordered target names, plus a 3 x 10 frame that carries its row
# labels in an `entities` column so that it can be joined on that key.
entities <- c("C1", "C3", "C4", "C5", "E1", "E2")
list_of_names <- data.frame(entities)
df <- data.frame(matrix(runif(30), nrow = 3, ncol = 10))
df$entities <- c("E1", "C1", "C4")

# dplyr: left-join onto the name list so that its order is kept. The key is
# named explicitly, which avoids the "Joining with `by = ...`" message and an
# accidental join on any other column the two frames might share. Names with
# no match come back as NA and are then replaced by zeros.
df_extended <- list_of_names %>%
  left_join(df, by = "entities") %>%
  replace(is.na(.), 0)
此外,我使用更大的数据框(10,000 列)进行了测试,以下是性能结果(包括填充零的步骤):


也许您可以尝试先初始化矩阵
v
,然后根据匹配的行名将
df
中的行分配给矩阵,例如

# Pre-allocate a zero matrix: one row per requested name, one column per
# column of `df`.
v <- matrix(0, nrow = nrow(list_of_names), ncol = ncol(df))
# Target row in `v` for every row of `df`; NA when the row name is not listed.
inds <- match(row.names(df), list_of_names$entities)
# Copy matched rows only: an NA index would abort the assignment with
# "NAs are not allowed in subscripted assignments".
v[inds[!is.na(inds)], ] <- as.matrix(df)[!is.na(inds), , drop = FALSE]
# Put the name column in front and restore the column names of `df`.
df_extended <- cbind(list_of_names, `colnames<-`(v, names(df)))

性能测试

给定
2e6
列的
df

# Benchmark fixture: the same six target names, but `df` now has 2e6 columns
# (the 30 random draws are recycled to fill the 3 x 2e6 matrix).
entities<-c("C1","C3","C4","C5","E1","E2")
list_of_names<-data.frame(entities)
df<-data.frame(matrix(runif(30), nrow = 3, ncol = 2e6))
# Only three of the six names are present as row names.
row.names(df)<-c("E1", "C1","C4")
然后我们再看看

> system.time(f())
   user  system elapsed 
  13.67    1.25   14.93

你能否把数据当作矩阵来处理,并把它填入一个预先置零的“外壳”(shell)容器?在我的机器上,20 万列大约需要 8.6 毫秒。

entities <- c("C1", "C3", "C4", "C5", "E1", "E2")

# Container of zeros: one row per entity (labelled by name), ten columns
shell <- matrix(
  0,
  nrow = length(entities),
  ncol = 10,
  dimnames = list(entities, NULL)
)

# Values to copy in, labelled with the rows they belong to
m <- matrix(
  runif(30),
  nrow = 3,
  ncol = 10,
  dimnames = list(c("E1", "C1", "C4"), NULL)
)

# Write each row of `m` into the container row that has the same name
shell[rownames(m), ] <- m

shell

使用
setdiff

# Append an all-zero row for every name that `df` does not have yet ...
df[setdiff(entities, row.names(df)), ] <- 0
# ... then reindex by `entities`: new rows are otherwise left at the bottom,
# so the result would not follow the order of the name list.
df <- df[entities, , drop = FALSE]
df
#          X1         X2        X3        X4         X5        X6        X7         X8        X9       X10
#C1 0.2529649 0.04766363 0.4089440 0.2825283 0.68637508 0.4778454 0.9156835 0.54208037 0.3971520 0.1528872
#C3 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000
#C4 0.2162548 0.70085309 0.8209513 0.9611048 0.05284394 0.5602533 0.6183512 0.05847849 0.1977447 0.8034185
#C5 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000
#E1 0.4820426 0.67437639 0.3518886 0.9188573 0.72839443 0.3952201 0.6982616 0.42842151 0.2608569 0.8319276
#E2 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000
df[setdiff(entities, row.names(df)),] <- 0
# Benchmark fixture: the same six target names, but `df` now has 2e6 columns
# (the 30 random draws are recycled to fill the 3 x 2e6 matrix).
entities<-c("C1","C3","C4","C5","E1","E2")
list_of_names<-data.frame(entities)
df<-data.frame(matrix(runif(30), nrow = 3, ncol = 2e6))
# Only three of the six names are present as row names.
row.names(df)<-c("E1", "C1","C4")
#' Expand `data` so that it has one row per listed name, in list order.
#'
#' @param data Data frame whose row names identify its rows. Defaults to the
#'   global `df`, so the original zero-argument call `f()` still works.
#' @param names_df Data frame with an `entities` column giving the target
#'   names in order. Defaults to the global `list_of_names`.
#' @return `names_df` column-bound with the values of `data`: rows whose name
#'   is missing from `data` are zero, and rows of `data` whose name is not
#'   listed are dropped.
f <- function(data = df, names_df = list_of_names) {
  # Zero matrix: one row per listed name, one column per column of `data`.
  v <- matrix(0, nrow = nrow(names_df), ncol = ncol(data))
  # Target row in `v` for each row of `data`; NA when the name is not listed.
  inds <- match(row.names(data), names_df$entities)
  # Copy matched rows only: an NA index would abort the assignment with
  # "NAs are not allowed in subscripted assignments".
  keep <- !is.na(inds)
  v[inds[keep], ] <- as.matrix(data)[keep, , drop = FALSE]
  cbind(names_df, `colnames<-`(v, names(data)))
}
> system.time(f())
   user  system elapsed 
  13.67    1.25   14.93
entities <- c("C1", "C3", "C4", "C5", "E1", "E2")

# Container of zeros: one row per entity (labelled by name), ten columns
shell <- matrix(
  0,
  nrow = length(entities),
  ncol = 10,
  dimnames = list(entities, NULL)
)

# Values to copy in, labelled with the rows they belong to
m <- matrix(
  runif(30),
  nrow = 3,
  ncol = 10,
  dimnames = list(c("E1", "C1", "C4"), NULL)
)

# Write each row of `m` into the container row that has the same name
shell[rownames(m), ] <- m

shell
# Append an all-zero row for every name that `df` does not have yet ...
df[setdiff(entities, row.names(df)), ] <- 0
# ... then reindex by `entities`: new rows are otherwise left at the bottom,
# so the result would not follow the order of the name list.
df <- df[entities, , drop = FALSE]
df
#          X1         X2        X3        X4         X5        X6        X7         X8        X9       X10
#C1 0.2529649 0.04766363 0.4089440 0.2825283 0.68637508 0.4778454 0.9156835 0.54208037 0.3971520 0.1528872
#C3 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000
#C4 0.2162548 0.70085309 0.8209513 0.9611048 0.05284394 0.5602533 0.6183512 0.05847849 0.1977447 0.8034185
#C5 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000
#E1 0.4820426 0.67437639 0.3518886 0.9188573 0.72839443 0.3952201 0.6982616 0.42842151 0.2608569 0.8319276
#E2 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000