R 扩展数据框以将行名与名称列表匹配的最快方法
我的目标是以如下方式扩展一个有 20 万列的大型数据框:我有一个按特定顺序排列的名称列表,而原始数据框的部分行名出现在这个列表中。我希望得到一个扩展后的数据框,其中有一列的条目与名称列表中的条目完全相同(顺序也一致),并且原数据框中缺少的行用零填充。(标签:r, performance, dataframe)
最小可复现示例:
# Ordered list of every entity the expanded result must contain.
entities <- c("C1", "C3", "C4", "C5", "E1", "E2")
list_of_names <- data.frame(entities)
# 3 x 10 frame of uniform random values; its row names are a subset of
# `entities` and appear in a different order.
df <- data.frame(
  matrix(runif(30), nrow = 3, ncol = 10),
  row.names = c("E1", "C1", "C4")
)
> df
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
E1 0.6228159 0.0744723 0.34010018 0.5720812 0.3179460 0.2405663696 0.6198333 0.9367545 0.95729614 0.10671495
C1 0.2435715 0.4046881 0.01714992 0.7806331 0.1113826 0.0003340805 0.1747583 0.5234170 0.07591773 0.76162701
C4 0.3804787 0.8364844 0.85255672 0.8909652 0.0665782 0.7447360640 0.3022620 0.2111203 0.72338528 0.04510692
我建议使用下面的 dplyr 方法。众所周知,merge() 可能很慢,
而 dplyr 中用于连接(join)的函数速度更快。
代码如下:
library(dplyr)
# Data: the ordered name list and a 3 x 10 frame of random values.
entities <- c("C1", "C3", "C4", "C5", "E1", "E2")
list_of_names <- data.frame(entities)
df <- data.frame(matrix(runif(30), nrow = 3, ncol = 10))
# The join needs the key as a regular column rather than as row names.
df$entities <- c("E1", "C1", "C4")
# dplyr: keep every name in list order, attach the matching rows of df, then
# turn the NAs produced for unmatched names into zeros. The key is given
# explicitly so the join never silently picks up other shared column names.
df_extended <- list_of_names %>%
  left_join(df, by = "entities") %>%
  replace(is.na(.), 0)
此外,我还用更大的数据框(10,000 列)做了测试,以下是耗时(包括填充零的步骤):
也许可以先初始化一个全零矩阵 v,
然后按匹配到的行名,把 df 中的各行赋值到矩阵的对应行,
例如:
# Zero-filled result: one row per listed name, one column per column of df.
v <- matrix(0, nrow = nrow(list_of_names), ncol = ncol(df))
# Position of each row of df within the ordered name list (NA if not listed).
inds <- match(row.names(df), list_of_names$entities)
# Copy only the rows whose name was found: an NA index is not allowed in a
# subscripted assignment, so unmatched rows of df are skipped.
v[inds[!is.na(inds)], ] <- as.matrix(df)[!is.na(inds), , drop = FALSE]
# Attach the names column and restore the original column names.
df_extended <- cbind(list_of_names, `colnames<-`(v, names(df)))
性能测试
给定一个有 2e6 列的
df:
# Benchmark fixture: the ordered name list that the result must follow.
entities<-c("C1","C3","C4","C5","E1","E2")
list_of_names<-data.frame(entities)
# 3 rows x 2e6 columns; matrix() recycles the 30 runif() draws to fill every
# cell, so the values repeat across columns.
df<-data.frame(matrix(runif(30), nrow = 3, ncol = 2e6))
# Row names are a subset of `entities`, in a different order.
row.names(df)<-c("E1", "C1","C4")
然后我们来看一下运行时间:
> system.time(f())
user system elapsed
13.67 1.25 14.93
能否把数据当作矩阵处理,并把它填入一个预先建好的全零“外壳”(shell)矩阵?在我的机器上,20 万列大约需要 8.6 ms。
entities <- c("C1", "C3", "C4", "C5", "E1", "E2")
# Zero-filled container: one named row per entity, ten columns.
shell <- matrix(
  0,
  nrow = length(entities),
  ncol = 10,
  dimnames = list(entities, NULL)
)
# Data to drop into the container, keyed by row name.
m <- matrix(
  runif(30),
  nrow = 3,
  ncol = 10,
  dimnames = list(c("E1", "C1", "C4"), NULL)
)
# Fill the container: rows are addressed by name, so order does not matter.
shell[rownames(m), ] <- m
shell
使用 setdiff:
# Append an all-zero row for every entity that df does not contain yet.
df[setdiff(entities, row.names(df)), ] <- 0
# New rows are appended at the bottom, so reorder to follow `entities`;
# the goal is a result whose rows match the name list exactly, in order.
df <- df[entities, , drop = FALSE]
df
# X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
#C1 0.2529649 0.04766363 0.4089440 0.2825283 0.68637508 0.4778454 0.9156835 0.54208037 0.3971520 0.1528872
#C3 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000
#C4 0.2162548 0.70085309 0.8209513 0.9611048 0.05284394 0.5602533 0.6183512 0.05847849 0.1977447 0.8034185
#C5 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000
#E1 0.4820426 0.67437639 0.3518886 0.9188573 0.72839443 0.3952201 0.6982616 0.42842151 0.2608569 0.8319276
#E2 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000
df[setdiff(entities, row.names(df)),]
# Benchmark fixture: the ordered name list that the result must follow.
entities<-c("C1","C3","C4","C5","E1","E2")
list_of_names<-data.frame(entities)
# 3 rows x 2e6 columns; matrix() recycles the 30 runif() draws to fill every
# cell, so the values repeat across columns.
df<-data.frame(matrix(runif(30), nrow = 3, ncol = 2e6))
# Row names are a subset of `entities`, in a different order.
row.names(df)<-c("E1", "C1","C4")
# Expand a data frame so that it has one row per listed name, in list order,
# with zeros for names that have no row in the input.
#
# values:     data frame whose row names identify its rows; defaults to the
#             global `df`, so a bare `f()` call behaves as before.
# name_table: data frame with an `entities` column giving the target names in
#             the desired order; defaults to the global `list_of_names`.
#
# Returns (invisibly, as the original did) `name_table` with the columns of
# `values` bound to it. Rows of `values` whose name is not listed are dropped.
f <- function(values = df, name_table = list_of_names) {
  # Zero-filled result: one row per listed name, one column per input column.
  filled <- matrix(0, nrow = nrow(name_table), ncol = ncol(values))
  # Position of each input row within the name list (NA when not listed).
  pos <- match(row.names(values), name_table$entities)
  # An NA index is not allowed in a subscripted assignment, so only rows
  # whose name was found are copied.
  found <- !is.na(pos)
  filled[pos[found], ] <- as.matrix(values)[found, , drop = FALSE]
  colnames(filled) <- names(values)
  # Invisible so that a bare f() at top level does not print a huge frame.
  invisible(cbind(name_table, filled))
}
> system.time(f())
user system elapsed
13.67 1.25 14.93
entities <- c("C1", "C3", "C4", "C5", "E1", "E2")
# Zero-filled container: one named row per entity, ten columns.
shell <- matrix(
  0,
  nrow = length(entities),
  ncol = 10,
  dimnames = list(entities, NULL)
)
# Data to drop into the container, keyed by row name.
m <- matrix(
  runif(30),
  nrow = 3,
  ncol = 10,
  dimnames = list(c("E1", "C1", "C4"), NULL)
)
# Fill the container: rows are addressed by name, so order does not matter.
shell[rownames(m), ] <- m
shell
# Append an all-zero row for every entity that df does not contain yet.
df[setdiff(entities, row.names(df)), ] <- 0
# New rows are appended at the bottom, so reorder to follow `entities`;
# the goal is a result whose rows match the name list exactly, in order.
df <- df[entities, , drop = FALSE]
df
# X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
#C1 0.2529649 0.04766363 0.4089440 0.2825283 0.68637508 0.4778454 0.9156835 0.54208037 0.3971520 0.1528872
#C3 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000
#C4 0.2162548 0.70085309 0.8209513 0.9611048 0.05284394 0.5602533 0.6183512 0.05847849 0.1977447 0.8034185
#C5 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000
#E1 0.4820426 0.67437639 0.3518886 0.9188573 0.72839443 0.3952201 0.6982616 0.42842151 0.2608569 0.8319276
#E2 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000