R函数按列分类?
我想写一个函数,它接受一个数据帧,计算多个列的出现次数,然后根据列名称的出现情况为行分配一个“类别” 以该df为例:R函数按列分类?,r,function,R,Function,我想写一个函数,它接受一个数据帧,计算多个列的出现次数,然后根据列名称的出现情况为行分配一个“类别” 以该df为例: df <- data.frame(k1 = c(0,0,3,4,5,1), k2 = c(1,0,0,4,5,0), k3 = c(0,0,0,8,0,0), k4 = c(2,5,0,3,4,5)) df 结果: k1 k2 k3 k4 Categor
df <- data.frame(k1 = c(0,0,3,4,5,1),
k2 = c(1,0,0,4,5,0),
k3 = c(0,0,0,8,0,0),
k4 = c(2,5,0,3,4,5))
df
结果:
k1 k2 k3 k4 Category
1 0 1 0 2 k2_k4_
2 0 0 0 5 k4_
3 3 0 0 0 k1_
4 4 4 8 3 k1_k2_k3_k4_
5 5 5 0 4 k1_k2_k4_
6 1 0 0 5 k1_k4_
也许有更有效的方法。我太新手了。使用数据。表
:
library(data.table)
setDT(df)
df[ , I := .I]
df[melt(df, id.vars = "I")[value != 0,
paste(variable, collapse = "_"),
keyby = I],
Category := i.V1, on = "I"][]
# k1 k2 k3 k4 I Category
# 1: 0 1 0 2 1 k2_k4
# 2: 0 0 0 5 2 k4
# 3: 3 0 0 0 3 k1
# 4: 4 4 8 3 4 k1_k2_k3_k4
# 5: 5 5 0 4 5 k1_k2_k4
# 6: 1 0 0 5 6 k1_k4
进近示意图:
- 添加行ID以跟踪它
melt
数据——让我们可以“按行”操作,而无需转换为矩阵
- 消除“空”行/列组合
- 在每个行ID内,将所有剩余列名粘贴在一起
- 将其合并回原始数据
您可以使用data.table::transpose()
函数使每一行成为一个向量,然后使用sapply
在列表中循环并在值不为零的位置粘贴相应的列名:
df$category = sapply(data.table::transpose(df),
function(r) paste0(names(df)[r != 0], collapse = "_"))
df
# k1 k2 k3 k4 category
#1 0 1 0 2 k2_k4
#2 0 0 0 5 k4
#3 3 0 0 0 k1
#4 4 4 8 3 k1_k2_k3_k4
#5 5 5 0 4 k1_k2_k4
#6 1 0 0 5 k1_k4
在base R中,有很多选项。一:
df$Category <- apply(df > 0, 1, function(x){toString(names(df)[x])})
df
## k1 k2 k3 k4 Category
## 1 0 1 0 2 k2, k4
## 2 0 0 0 5 k4
## 3 3 0 0 0 k1
## 4 4 4 8 3 k1, k2, k3, k4
## 5 5 5 0 4 k1, k2, k4
## 6 1 0 0 5 k1, k4
我们可以在baser
中以矢量化的形式执行此操作(不使用包)
功能
基准输出-微基准
讨论/评论
@Eric的方法是最快的,但是当列的数量更多时,嵌套的ifelse
语句也会更多。data.table方法比其他一些方法更快。另外,请参阅更多备选方法
df$category = sapply(data.table::transpose(df),
function(r) paste0(names(df)[r != 0], collapse = "_"))
df
# k1 k2 k3 k4 category
#1 0 1 0 2 k2_k4
#2 0 0 0 5 k4
#3 3 0 0 0 k1
#4 4 4 8 3 k1_k2_k3_k4
#5 5 5 0 4 k1_k2_k4
#6 1 0 0 5 k1_k4
df$Category <- apply(df > 0, 1, function(x){toString(names(df)[x])})
df
## k1 k2 k3 k4 Category
## 1 0 1 0 2 k2, k4
## 2 0 0 0 5 k4
## 3 3 0 0 0 k1
## 4 4 4 8 3 k1, k2, k3, k4
## 5 5 5 0 4 k1, k2, k4
## 6 1 0 0 5 k1, k4
df$Category <- apply(df > 0, 1, function(x){paste(names(df)[x], collapse = '_')})
df
## k1 k2 k3 k4 Category
## 1 0 1 0 2 k2_k4
## 2 0 0 0 5 k4
## 3 3 0 0 0 k1
## 4 4 4 8 3 k1_k2_k3_k4
## 5 5 5 0 4 k1_k2_k4
## 6 1 0 0 5 k1_k4
library(purrr)
df %>% by_row(~toString(names(.)[.x > 0]), .collate = 'cols', .to = 'Category')
## # A tibble: 6 × 5
## k1 k2 k3 k4 Category
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 0 1 0 2 k2, k4
## 2 0 0 0 5 k4
## 3 3 0 0 0 k1
## 4 4 4 8 3 k1, k2, k3, k4
## 5 5 5 0 4 k1, k2, k4
## 6 1 0 0 5 k1, k4
df$category <- gsub('^NA_|NA_+|_NA', '', do.call(paste,
c(as.data.frame(`dim<-`(names(df)[(NA^!df)*col(df)], dim(df))), sep="_")))
df$category
#[1] "k2_k4" "k4" "k1" "k1_k2_k3_k4" "k1_k2_k4" "k1_k4"
set.seed(24)
df <- data.frame(k1 = sample(0:5, 1e6, replace=TRUE),
k2 = sample(0:7, 1e6, replace = TRUE),
k3 = sample(0:8, 1e6, replace=TRUE),
k4 = sample(0:4, 1e6, replace = TRUE))
df2 <- copy(df)
setDT(df2)
psidom <- function(){
sapply(data.table::transpose(df),
function(r) paste0(names(df)[r != 0], collapse = "_"))}
akrun <- function(){
gsub('^NA_|NA_+|_NA', '', do.call(paste,
c(as.data.frame(`dim<-`(names(df)[(NA^!df)*col(df)], dim(df))), sep="_")))
}
ae <- function(){
apply(df > 0, 1, function(x){toString(names(df)[x])})}
ae2 <- function(){
df %>%
by_row(~toString(names(.)[.x > 0]),
.collate = 'cols', .to = 'Category')
}
MC <- function(){
df2[ , I := .I]
df2[melt(df2, id.vars = "I")[value != 0,
paste(variable, collapse = "_"),
keyby = I],
Category := i.V1, on = "I"][]
}
Eric <- function() {
paste(ifelse(df$k1>0, 'k1_',''),
ifelse(df$k2>0, 'k2_',''),
ifelse(df$k3>0, 'k3_',''),
ifelse(df$k4>0, 'k4_',''), sep='')
}
system.time(psidom())
# user system elapsed
# 7.91 0.06 7.97
system.time(ae())
# user system elapsed
# 10.22 0.00 10.22
system.time(ae2())
# user system elapsed
# 100.60 0.27 101.44
system.time(MC())
# user system elapsed
# 4.22 0.03 4.25
system.time(Eric())
# user system elapsed
# 1.40 0.00 1.41
system.time(akrun())
# user system elapsed
# 1.53 0.00 1.53
library(microbenchmark)
microbenchmark(psidom(), akrun(), ae(), ae2(), MC(), Eric(), unit = "relative",
times = 10)
#Unit: relative
# expr min lq mean median uq max neval
# psidom() 4.0824126 4.1283338 3.9332463 4.237229 3.4060509 4.2147045 10
# akrun() 1.0000000 1.0000000 1.0000000 1.000000 1.0000000 1.0000000 10
# ae() 6.7507093 6.9175511 6.0683960 6.725867 5.1087104 5.1901925 10
# ae2() 62.4294377 61.4709644 53.7637154 59.873279 44.9316386 44.9233634 10
# MC() 3.1439541 3.4666872 3.1479070 3.559120 2.7554062 2.8741309 10
# Eric() 0.9091862 0.9628939 0.9702425 1.042875 0.9878793 0.9686051 10