R Data.table:合并列(聚合的单个函数)
我有一些类似的数据R Data.table:合并列(聚合的单个函数),r,data.table,R,Data.table,我有一些类似的数据 foo另一种方法是: foo1 <- CJ(var1 = c(T,F), var2 = c(T,F))[, var3 := c('z', 'y', 'x', 'w')] setkey(foo, var1, var2) foo[foo1, var3 :=i.var3][order(uid)][, c(1L, 4L)] # uid var3 #1: a x #2: b y #3: c z foo1这可能也没有经过优化,但我发现它比@Da
另一种方法是:
# Lookup table with one row per (var1, var2) combination and its var3 label.
# CJ() sorts its result by default (FALSE before TRUE), so the labels are
# listed for (F, F), (F, T), (T, F), (T, T) in that order.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
foo1 <- CJ(var1 = c(TRUE, FALSE), var2 = c(TRUE, FALSE))[
  , var3 := c("z", "y", "x", "w")
]
# Key foo on the join columns so that foo[foo1, ...] joins on (var1, var2).
setkey(foo, var1, var2)
# Update join: copy var3 from the lookup table into foo by reference, restore
# the uid order (setkey() re-sorted the rows), then keep the result columns.
# Columns are selected by name rather than by position so the result does not
# depend on how many columns foo has or on the data.table version.
foo[foo1, var3 := i.var3][order(uid)][, .(uid, var3)]
# uid var3
#1: a x
#2: b y
#3: c z
这可能也没有经过优化,但我发现它比 @David Arenburg 的解决方案更容易阅读:
# Label each row from its two flags: both TRUE -> "w", only var1 -> "x",
# only var2 -> "y", neither -> "z". The flag columns are dropped afterwards.
foo[, var3 := ifelse(
  var1 & var2, "w",
  ifelse(var1, "x", ifelse(var2, "y", "z"))
)][, c("var1", "var2") := NULL]
foo
# uid var3
# 1: a x
# 2: b y
# 3: c z
我个人不喜欢 ifelse,尤其是嵌套的 ifelse :)。我想,在这种情况下我们可以不用它,也许可以改用下面这样的写法:
# Encode the flag pair as the code 2 * var1 + var2 (3 = both TRUE,
# 0 = both FALSE) and map the codes 3, 2, 1, 0 to "w", "x", "y", "z".
# The flag columns are dropped once var3 has been added.
foo[, var3 := factor(
  2 * var1 + var2,
  levels = 3:0,
  labels = c("w", "x", "y", "z")
)][, c("var2", "var1") := NULL]
# uid var3
# 1: a x
# 2: b y
# 3: c z
我做了一些基准测试,@akrun的解决方案似乎是更大数据集的最快解决方案。为了使结果具有可比性,必须进行一些更改(在代码中对所有更改进行注释),但这不会对性能产生太大影响
# setup of data
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a less clear message.
library(data.table)
library(microbenchmark)
set.seed(1)
Nsims <- 1e4 # size of dataset
foo <- data.table(
  uid = seq_len(Nsims),
  var1 = sample(c(TRUE, FALSE), Nsims, replace = TRUE),
  var2 = sample(c(TRUE, FALSE), Nsims, replace = TRUE)
)
# benchmarktest
# Each expression works on its own copy of foo because `:=` modifies a
# data.table by reference. The expressions are named so that every row of the
# microbenchmark output can be attributed to a solution.
microbenchmark(
  shadow = { # @shadow: nested ifelse()
    foo1 <- copy(foo)
    foo1[, `:=`(
      var3 = ifelse(
        var1 & var2, "w",
        ifelse(var1, "x", ifelse(var2, "y", "z"))
      ),
      var1 = NULL, var2 = NULL
    )]
  },
  Arun = { # @Arun: factor on the code 2 * var1 + var2
    foo2 <- copy(foo)
    foo2[, `:=`(
      # used as.character to give same result as other solutions
      var3 = as.character(factor(
        2 * var1 + var2,
        levels = 3:0, labels = c("w", "x", "y", "z")
      )),
      var2 = NULL, var1 = NULL
    )]
  },
  akrun = { # @akrun: keyed update join against a CJ() lookup table
    foo3 <- copy(foo)
    # TRUE/FALSE instead of T/F, which are ordinary reassignable names
    foo.index <- CJ(var1 = c(TRUE, FALSE), var2 = c(TRUE, FALSE))[
      , var3 := c("z", "y", "x", "w")
    ]
    setkey(foo3, var1, var2)
    # assigned to foo3 to get same result as other solutions and used
    # var1 := NULL, etc. to achieve OP's requirement "Moreover, any additional
    # column that foo has besides (var1, var2) should be taken over into the
    # new data.table"
    foo3 <- foo3[foo.index, var3 := i.var3][
      , `:=`(var1 = NULL, var2 = NULL)
    ][order(uid)]
  }
)
# Recorded timings from the original post (one row per expression, presumably
# in the order shadow, Arun, akrun; the unit line was not preserved):
# min lq median uq max neval
# 19.635460 19.801922 19.93224 20.814533 22.57868 100
# 12.611448 12.762514 12.79219 12.864043 48.10415 100
# 4.691303 4.945683 4.98808 5.084922 7.21636 100
#
# making sure they give the same solutions
all.equal(foo1, foo2)
# [1] TRUE
all.equal(foo1, foo3)
# [1] TRUE
# setup of data
require(data.table)
require(microbenchmark)
set.seed(1)
Nsims
@eddi:谢谢你的编辑。我尝试了 CJ 方法,但不知何故,var3 列没有正确排序。
CJ 和 expand.grid 的排序不同——这可能就是您遇到问题的地方。
我第一次看到在 R 中使用中间“值矩阵”,这是一个很好的技巧。
@davidernburg 不是真的,但我确实发现这种方法非常有趣 :)
@shadow 的解决方案是您应该将集与之进行比较的解决方案。
# setup of data
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a less clear message.
library(data.table)
library(microbenchmark)
set.seed(1)
Nsims <- 1e4 # size of dataset
foo <- data.table(
  uid = seq_len(Nsims),
  var1 = sample(c(TRUE, FALSE), Nsims, replace = TRUE),
  var2 = sample(c(TRUE, FALSE), Nsims, replace = TRUE)
)
# benchmarktest
# Each expression works on its own copy of foo because `:=` modifies a
# data.table by reference. The expressions are named so that every row of the
# microbenchmark output can be attributed to a solution.
microbenchmark(
  shadow = { # @shadow: nested ifelse()
    foo1 <- copy(foo)
    foo1[, `:=`(
      var3 = ifelse(
        var1 & var2, "w",
        ifelse(var1, "x", ifelse(var2, "y", "z"))
      ),
      var1 = NULL, var2 = NULL
    )]
  },
  Arun = { # @Arun: factor on the code 2 * var1 + var2
    foo2 <- copy(foo)
    foo2[, `:=`(
      # used as.character to give same result as other solutions
      var3 = as.character(factor(
        2 * var1 + var2,
        levels = 3:0, labels = c("w", "x", "y", "z")
      )),
      var2 = NULL, var1 = NULL
    )]
  },
  akrun = { # @akrun: keyed update join against a CJ() lookup table
    foo3 <- copy(foo)
    # TRUE/FALSE instead of T/F, which are ordinary reassignable names
    foo.index <- CJ(var1 = c(TRUE, FALSE), var2 = c(TRUE, FALSE))[
      , var3 := c("z", "y", "x", "w")
    ]
    setkey(foo3, var1, var2)
    # assigned to foo3 to get same result as other solutions and used
    # var1 := NULL, etc. to achieve OP's requirement "Moreover, any additional
    # column that foo has besides (var1, var2) should be taken over into the
    # new data.table"
    foo3 <- foo3[foo.index, var3 := i.var3][
      , `:=`(var1 = NULL, var2 = NULL)
    ][order(uid)]
  }
)
# Recorded timings from the original post (one row per expression, presumably
# in the order shadow, Arun, akrun; the unit line was not preserved):
# min lq median uq max neval
# 19.635460 19.801922 19.93224 20.814533 22.57868 100
# 12.611448 12.762514 12.79219 12.864043 48.10415 100
# 4.691303 4.945683 4.98808 5.084922 7.21636 100
#
# making sure they give the same solutions
all.equal(foo1, foo2)
# [1] TRUE
all.equal(foo1, foo3)
# [1] TRUE