R data.table中的重塑难题
R data.table中的重塑难题,r,data.table,reshape,R,Data.table,Reshape,data.table set.seed(1234) DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12)) # x y v # 1: 1 A 12 # 2: 1 B 62 ... #11: 3 A 63 #12: 3 B 49 编辑:这是我糟糕的解决方案,显然过于复杂 #first step is to create cumsum columns colNames <- c("x"
data.table
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
# x y v
# 1: 1 A 12
# 2: 1 B 62
...
#11: 3 A 63
#12: 3 B 49
编辑:这是我糟糕的解决方案,显然过于复杂
#first step is to create cumsum columns
colNames <- c("x","v"); newColNames <- paste0("SUM.",colNames)
DT[, newColNames:=lapply(.SD,cumsum) ,by=y, .SDcols=colNames, with=F];
#now we need to reshape each SUM.* to SUM.*.{yvalue}
DT[,N:=.I]; setattr(DT,"sorted","N")
g <- function(DT,SD){
cols <- c('N',grep('SUM',colnames(SD), value=T));
Yval <- unique(SD[,y]);
merge(DT, SD[,cols, with=F], suffixe=c('',paste0('.',Yval)), all.x=T);
}
DT <- Reduce(f=g,init=DT,x=split(DT,DT$y));
locf = function(x) {
ind = which(!is.na(x))
if(is.na(x[1])) ind = c(1,ind)
rep(x[ind], times = diff( c(ind, length(x) + 1) ))
}
newColNames <- grep('SUM',colnames(DT),value=T);
DT <- DT[, (newColNames):=lapply(.SD, locf), .SDcols=newColNames]
#第一步是创建cumsum列
colNames不确定这是最佳解决方案,但您可以执行以下操作
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
DT[, id := seq_len(nrow(DT))]
setkey(DT, y)
uniqY <- unique(DT$y)
for(jj in uniqY){
nc <- do.call(paste, c(expand.grid('Sum', c('x','v'),jj), sep ='.'))
DT[.(jj), (nc) := list(cumsum(x), cumsum(v))]
}
setkey(DT, id)
DT[, 5:8 := lapply(.SD, function(x) {
xn <- is.na(x)
x[xn] <- -Inf
xx <- cummax(x)
# deal with leading NA values
if(xn[1]){
xn1 <- which(xn)[1]
xx[seq_len(xn1)] <- NA}
xx }), .SDcols = 5:8]
set.seed(1234)
DT还有另一种方法:
ys <- unique(DT$y)
sdcols <- c("x", "v")
cols <- paste0("SUM.", sdcols)
DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
for( i in seq_along(ys)) {
cols <- paste0("SUM.", sdcols, ".", ys[i])
DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
DT[, c("v1", "v2") := list(c(rep(NA_integer_, (i-1)), v1)[seq_len(.N)],
c(rep(NA_integer_, (i-1)), v2)[seq_len(.N)])]
setnames(DT, c("v1", "v2"), cols)
}
结果数据表“m”(mnel)
结果数据。表“s”(statquant's)
结果数据表“g”(grothendieck's)
试试这个:
cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
DT2 <- DT[, {SUM.<-y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0)}]
setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))
因此,如果这是足够的,最后一行可以删除,因为它的唯一目的是使名称与问题中的名称完全相同
结果(没有简化)是:
是的,它比我的要好得多,我总是忘了对于那项工作的王,循环,最后的5:8
很容易重放。还有,为什么不使用na.locf呢?看看na.locf的源代码,它看起来比你需要做的工作多得多(虽然没有速度检查),也不使用不必加载ZooFor循环的方法,如果实现的话,就不需要了。cumsum(.SD)是漂亮的语法,但至少创建了.SD对象的三个副本。@mnel和Arun,我用一个基准来回答,删除了NA作业,并用mnel建议进行了更新……您可以替换SUM。请注意3.0.0新闻中的这一点:为了支持选项(stringsAsFactors=FALSE),model.frame()、model.matrix()和replications()现在自动将字符向量转换为因子,而无需发出警告。我认为这个问题非常具体。在SO中有一些使用数据.table
的重塑解决方案(可能是您发布的?),但在这一点上,我不确定重塑是否是数据的最佳使用。table
没有进一步的开发。@G.Grothendieck我试图将您的内容包装在一个函数中,并使用:=
而不是创建DT2
,但是,DT
不会得到更新。看起来DT
在lappy(data.table(model.matrix(~SUM.:x+SUM.:v+0)),cumsum0)中被修改了
我对吗?试试.Internal(inspect(DT));DT0
arun <- function(DT) {
ys <- unique(DT$y)
sdcols <- c("x", "v")
cols <- paste0("SUM.", sdcols)
DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
for( i in seq_along(ys)) {
cols <- paste0("SUM.", sdcols, ".", ys[i])
DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
DT[, c("v1", "v2") := list(c(rep(NA_integer_, (i-1)), v1)[seq_len(.N)],
c(rep(NA_integer_, (i-1)), v2)[seq_len(.N)])]
setnames(DT, c("v1", "v2"), cols)
}
DT
}
mnel <- function(DT) {
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
DT[, id := seq_len(nrow(DT))]
setkey(DT, y)
uniqY <- unique(DT$y)
for(jj in uniqY){
nc <- do.call(paste, c(expand.grid('Sum', c('x','v'),jj), sep ='.'))
DT[.(jj), (nc) := list(cumsum(x), cumsum(v))]
}
setkey(DT, id)
DT[, 5:8 := lapply(.SD, function(x) {
xn <- is.na(x)
x[xn] <- -Inf
xx <- cummax(x)
# deal with leading NA values
if(xn[1]){
xn1 <- which(xn)[1]
xx[seq_len(xn1)] <- NA}
xx }), .SDcols = 5:8]
}
statquant <- function(DT){
#first step is to create cumsum columns
colNames <- c("x","v")
DT[, paste0("SUM.",colNames):=lapply(.SD,cumsum) ,by=y, .SDcols=colNames];
#now we need to reshape each SUM.* to SUM.*.{yvalue}
DT[,N:=.I]; setattr(DT,"sorted","N")
g <- function(DT,SD){
cols <- c('N',grep('SUM',colnames(SD), value=T));
Yval <- unique(SD[,y]);
merge(DT, SD[,cols, with=F], suffix=c('',paste0('.',Yval)), all.x=T);
}
DT <- Reduce(f=g,init=DT,x=split(DT,DT$y));
locf = function(x) {
ind = which(!is.na(x))
if(is.na(x[1])) ind = c(1,ind)
rep(x[ind], times = diff( c(ind, length(x) + 1) ))
}
newColNames <- grep('SUM',colnames(DT),value=T);
DT <- DT[, (newColNames):=lapply(.SD, locf), .SDcols=newColNames]
DT
}
grothendieck <- function(DT) {
cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
DT2 <- DT[, {SUM.<-y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0)}]
setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))
DT2
}
library(data.table)
library(zoo)
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
library(microbenchmark)
microbenchmark( s <- statquant(copy(DT)), g <- grothendieck(copy(DT)),
m <- mnel(copy(DT)), a <- arun(copy(DT)), times = 1e3)
# Unit: milliseconds
# expr min lq median uq max neval
# s <- statquant(copy(DT)) 13.041125 13.674083 14.493870 17.273151 144.74186 1000
# g <- grothendieck(copy(DT)) 3.634120 3.859143 4.006085 4.443388 80.01984 1000
# m <- mnel(copy(DT)) 7.819286 8.234178 8.596090 10.423668 87.07668 1000
# a <- arun(copy(DT)) 6.925419 7.369286 7.703003 9.262719 53.39823 1000
# x y v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
# 1: 1 A 12 1 12 1 12 NA NA
# 2: 1 B 62 1 62 1 12 1 62
# 3: 1 A 60 2 72 2 72 1 62
# 4: 1 B 61 2 123 2 72 2 123
# 5: 2 A 83 4 155 4 155 2 123
# 6: 2 B 97 4 220 4 155 4 220
# 7: 2 A 1 6 156 6 156 4 220
# 8: 2 B 22 6 242 6 156 6 242
# 9: 3 A 99 9 255 9 255 6 242
# 10: 3 B 47 9 289 9 255 9 289
# 11: 3 A 63 12 318 12 318 9 289
# 12: 3 B 49 12 338 12 318 12 338
# x y v id Sum.x.A Sum.v.A Sum.x.B Sum.v.B
# 1: 1 A 12 1 1 12 NA NA
# 2: 1 B 62 2 1 12 1 62
# 3: 1 A 60 3 2 72 1 62
# 4: 1 B 61 4 2 72 2 123
# 5: 2 A 83 5 4 155 2 123
# 6: 2 B 97 6 4 155 4 220
# 7: 2 A 1 7 6 156 4 220
# 8: 2 B 22 8 6 156 6 242
# 9: 3 A 99 9 9 255 6 242
# 10: 3 B 47 10 9 255 9 289
# 11: 3 A 63 11 12 318 9 289
# 12: 3 B 49 12 12 318 12 338
# N x y v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
# 1: 1 1 A 12 1 12 1 12 NA NA
# 2: 2 1 B 62 1 62 1 12 1 62
# 3: 3 1 A 60 2 72 2 72 1 62
# 4: 4 1 B 61 2 123 2 72 2 123
# 5: 5 2 A 83 4 155 4 155 2 123
# 6: 6 2 B 97 4 220 4 155 4 220
# 7: 7 2 A 1 6 156 6 156 4 220
# 8: 8 2 B 22 6 242 6 156 6 242
# 9: 9 3 A 99 9 255 9 255 6 242
# 10: 10 3 B 47 9 289 9 255 9 289
# 11: 11 3 A 63 12 318 12 318 9 289
# 12: 12 3 B 49 12 338 12 318 12 338
# SUM.x.A SUM.x.B SUM.v.A SUM.v.B
# 1: 1 NA 12 NA
# 2: 1 1 12 62
# 3: 2 1 72 62
# 4: 2 2 72 123
# 5: 4 2 155 123
# 6: 4 4 155 220
# 7: 6 4 156 220
# 8: 6 6 156 242
# 9: 9 6 255 242
# 10: 9 9 255 289
# 11: 12 9 318 289
# 12: 12 12 318 338
cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
DT2 <- DT[, {SUM.<-y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0)}]
setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))
> names(DT2)
[1] "SUM.A:x" "SUM.B:x" "SUM.A:v" "SUM.B:v"
> DT2
SUM.x.A SUM.x.B SUM.v.A SUM.v.B
1: 1 NA 12 NA
2: 1 1 12 62
3: 2 1 72 62
4: 2 2 72 123
5: 4 2 155 123
6: 4 4 155 220
7: 6 4 156 220
8: 6 6 156 242
9: 9 6 255 242
10: 9 9 255 289
11: 12 9 318 289
12: 12 12 318 338