在 R 中按组添加带空行的汇总数据列

按组添加包含R的空行的汇总数据列,r,R,我试图向数据集中添加一列,该列显示另一列中每个组id在一列中的数据总和。sum或total列将有空行,每组有一个sum aggregate(Diff~Group,data.set,sum)给出了正确的总和,但去掉了所有其他行。而类似于: data.set$Total试试这个。我们首先聚合,然后合并到现有的数据集 result <- merge(data.set,setNames(aggregate(Diff ~ Group, data.set, sum),c("Group","Total"

我想向数据集中添加一列，用来显示某一列（Diff）按另一列中每个组 ID（Group）汇总的总和。这个 sum/total 列的大部分行为空，每组只有一行显示该组的总和。

# Sum Diff within each Group; the result has one row per group.
aggregate(Diff ~ Group, data = data.set, FUN = sum)
给出了正确的总和,但去掉了所有其他行。而类似于:
data.set$Total …

试试这个：我们先按组聚合，然后把结果合并回现有的数据集。

# Aggregate Diff per Group, label the two result columns "Group" and "Total",
# then merge those totals onto the original rows (all.x = TRUE keeps every
# row of data.set).
result <- merge(
  data.set,
  setNames(
    aggregate(Diff ~ Group, data.set, sum),
    c("Group", "Total")
  ),
  all.x = TRUE
)

result

如果 34908 出现在每个“1”前面，而不只是出现在最后一个“1”前面，这样可以吗？

如果不可以，您可以使用“sqldf”库中的写法：

此外，如果您确实希望像示例中那样在其余行填入 NA，可以添加以下代码：

 # Blank out Total on every row that is immediately followed by a row with the
 # same `groupe`, so only the last row of each run of equal values keeps its
 # total. Only adjacent rows are compared, so rows of a group must be contiguous.
 # seq_len() gives an empty sequence for tables with fewer than two rows, where
 # 1:(nrow - 1) would count down (1, 0) and index past the end of the data.
 for (i in seq_len(max(nrow(new_data) - 1L, 0L))) {
   if (new_data[i, "groupe"] == new_data[i + 1, "groupe"]) {
     new_data[i, "Total"] <- NA
   }
 }

假设您的
Diff
列是数字,而这些空格实际上是
NA
,您可以执行以下操作:

library(data.table)

# Example data: Diff is numeric and the blank cells are real NA values.
dt <- data.table(
  Group = c(1, 1, 1, 1, 1, 2, 2, 2, 3, 3),
  Diff = c(NA, -16055, -1313, 45707, 6569, NA, -7249, 2, NA, -384724)
)

# Per group: the last row (position .N) gets the group sum, every other row NA.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
dt[
  ,
  total := ifelse(seq_len(.N) == .N, sum(Diff, na.rm = TRUE), NA),
  by = Group
]

 #   Group    Diff   total
 #1:     1      NA      NA
 #2:     1  -16055      NA
 #3:     1   -1313      NA
 #4:     1   45707      NA
 #5:     1    6569   34908
 #6:     2      NA      NA
 #7:     2   -7249      NA
 #8:     2       2   -7247
 #9:     3      NA      NA
#10:     3 -384724 -384724

耗时比较：

为了比较 @Frank 的合并方案和我最初的方案的耗时，我改变了组的数量和观测值的数量（两者都分高、低两档），并在 4 个独立的数据集上运行了 microbenchmark。结果如下：看起来在所有情况下，Frank 的合并方案都是最快的。我认为我的方案的瓶颈在于
ifelse
；如果能去掉它，可能会更快，不过具体能快多少还不确定。

set.seed(1)

# Group-id pools and row counts for the "high" and "low" scenarios.
high_grp <- 1:10000
high_obs <- 1000000
low_grp <- 1:100
low_obs <- 50000

# One random table per (number of groups, number of observations) combination.
# The tables are created in a fixed order so the seeded draws are reproducible.
low_grp_high_obs <- data.table(
  Group = sample(low_grp, high_obs, replace = TRUE),
  Diff = sample(-60000:60000, high_obs, replace = TRUE)
)
high_grp_high_obs <- data.table(
  Group = sample(high_grp, high_obs, replace = TRUE),
  Diff = sample(-60000:60000, high_obs, replace = TRUE)
)
low_grp_low_obs <- data.table(
  Group = sample(low_grp, low_obs, replace = TRUE),
  Diff = sample(-60000:60000, low_obs, replace = TRUE)
)
high_grp_low_obs <- data.table(
  Group = sample(high_grp, low_obs, replace = TRUE),
  Diff = sample(-60000:60000, low_obs, replace = TRUE)
)

comparison_sets <- list(
  "Low Group; High Obs" = low_grp_high_obs,
  "High Group; High Obs" = high_grp_high_obs,
  "Low Group; Low Obs" = low_grp_low_obs,
  "High Group; Low Obs" = high_grp_low_obs
)

# Time both approaches on every data set:
#   orig  - grouped ifelse() that puts the sum on the last row of each group
#   merge - update join of the per-group sums onto the last row of each group
comparison <- lapply(comparison_sets, function(dt) {
  microbenchmark::microbenchmark(
    orig = dt[
      ,
      total := ifelse(seq_len(.N) == .N, sum(Diff, na.rm = TRUE), NA),
      by = Group
    ],
    merge = dt[
      dt[, sum(Diff, na.rm = TRUE), by = Group],
      on = .(Group),
      mult = "last",
      total := i.V1
    ]
  )
})


comparison

#$`Low Group; High Obs`
#Unit: milliseconds
#  expr      min       lq     mean   median       uq      max neval
#  orig 53.16160 58.00227 69.93443 60.08673 62.57489 191.1628   100
# merge 12.93931 15.15634 17.90187 15.56495 18.33738 147.9433   100
#
#$`High Group; High Obs`
#Unit: milliseconds
#  expr       min       lq      mean    median       uq      max neval
#  orig 143.60222 151.8497 161.65825 154.85638 158.2183 281.2311   100
# merge  23.18698  23.7380  29.20126  24.86465  29.9832 153.7919   100
#
#$`Low Group; Low Obs`
#Unit: milliseconds
#  expr      min       lq     mean   median       uq      max neval
#  orig 3.047569 3.190157 3.957012 3.378145 3.692857 8.087345   100
# merge 1.685882 1.808594 1.928094 1.846520 1.953369 5.998864   100
#
#$`High Group; Low Obs`
#Unit: milliseconds
#  expr       min        lq      mean    median        uq       max neval
#  orig 65.903991 68.727469 69.861163 69.857406 70.950330 76.351860   100
# merge  3.418077  3.595673  3.831805  3.855684  3.952869  5.069314   100
set.seed(1)

high_grp …

另一种可能的方法是：


DF …

您的
Diff
列中那些空白的位置实际上是
NA
吗？还是说它不是一个数值变量？它们是空的。每行中还有另一列，称为“排列a值”，而 Diff 是用值 2 减去值 1、用值 3 减去值 2，依此类推得到的结果。计算如下：data.set$Diff …
dt[dt[, sum(Diff, na.rm=TRUE), by=Group], on=.(Group), mult="last", total := i.V1]
与这里的做法有些相似。@Frank 谢谢！我没有想过使用合并。我将更新我的答案以包含这个额外的方案。好的，很好。FYI，在这类场景的基准测试中，不仅要考虑行数，也要考虑组的数量，这一点很重要。Mike，你能告诉我在哪里可以学到 .N 这个符号吗？或者它叫什么？它很难搜索，我在 data.table 的聚合教程里也没有找到。错误是我自己造成的：我没有先调用 library(data.table)。此解决方案通常有效，但当某个组只有一行（且为空）时，它会把整行完全删除。能否修改一下，使得在组只有一行、Diff 可能为空的情况下，只插入 NA 并保留该行？我向 merge 函数添加了
all.x=TRUE
,因此保留了所有data.set,它现在应该可以使用此解决方案。这并不影响数学计算,但当我在另一列(PriceDiff/PriceTotal)上使用相同的解决方案时,原始Diff列中总计的位置会转移到组中看似随机的一行,而不是停留在最后一行。同样,这实际上并不影响结果。我只是好奇你是否知道为什么会这样。新PriceTotal列中的结果总是如预期的那样位于组的最后一行。这真的很奇怪,我看不出一个值如何可以跳转行,如果你能给我一个可复制的示例,我将研究它
 # Blank out Total on every row that is immediately followed by a row with the
 # same `groupe`, so only the last row of each run of equal values keeps its
 # total. Only adjacent rows are compared, so rows of a group must be contiguous.
 # seq_len() gives an empty sequence for tables with fewer than two rows, where
 # 1:(nrow - 1) would count down (1, 0) and index past the end of the data.
 for (i in seq_len(max(nrow(new_data) - 1L, 0L))) {
   if (new_data[i, "groupe"] == new_data[i + 1, "groupe"]) {
     new_data[i, "Total"] <- NA
   }
 }
library(data.table)

# Example data: Diff is numeric and the blank cells are real NA values.
dt <- data.table(
  Group = c(1, 1, 1, 1, 1, 2, 2, 2, 3, 3),
  Diff = c(NA, -16055, -1313, 45707, 6569, NA, -7249, 2, NA, -384724)
)

# Per group: the last row (position .N) gets the group sum, every other row NA.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
dt[
  ,
  total := ifelse(seq_len(.N) == .N, sum(Diff, na.rm = TRUE), NA),
  by = Group
]

 #   Group    Diff   total
 #1:     1      NA      NA
 #2:     1  -16055      NA
 #3:     1   -1313      NA
 #4:     1   45707      NA
 #5:     1    6569   34908
 #6:     2      NA      NA
 #7:     2   -7249      NA
 #8:     2       2   -7247
 #9:     3      NA      NA
#10:     3 -384724 -384724
# Update join: compute one sum per Group (the unnamed result column is V1),
# join it back onto dt by Group, and assign it to `total` with mult = "last"
# selecting the last matching row of each group.
dt[
  dt[, sum(Diff, na.rm = TRUE), by = Group],
  on = .(Group),
  mult = "last",
  total := i.V1
]
set.seed(1)

# Group-id pools and row counts for the "high" and "low" scenarios.
high_grp <- 1:10000
high_obs <- 1000000
low_grp <- 1:100
low_obs <- 50000

# One random table per (number of groups, number of observations) combination.
# The tables are created in a fixed order so the seeded draws are reproducible.
low_grp_high_obs <- data.table(
  Group = sample(low_grp, high_obs, replace = TRUE),
  Diff = sample(-60000:60000, high_obs, replace = TRUE)
)
high_grp_high_obs <- data.table(
  Group = sample(high_grp, high_obs, replace = TRUE),
  Diff = sample(-60000:60000, high_obs, replace = TRUE)
)
low_grp_low_obs <- data.table(
  Group = sample(low_grp, low_obs, replace = TRUE),
  Diff = sample(-60000:60000, low_obs, replace = TRUE)
)
high_grp_low_obs <- data.table(
  Group = sample(high_grp, low_obs, replace = TRUE),
  Diff = sample(-60000:60000, low_obs, replace = TRUE)
)

comparison_sets <- list(
  "Low Group; High Obs" = low_grp_high_obs,
  "High Group; High Obs" = high_grp_high_obs,
  "Low Group; Low Obs" = low_grp_low_obs,
  "High Group; Low Obs" = high_grp_low_obs
)

# Time both approaches on every data set:
#   orig  - grouped ifelse() that puts the sum on the last row of each group
#   merge - update join of the per-group sums onto the last row of each group
comparison <- lapply(comparison_sets, function(dt) {
  microbenchmark::microbenchmark(
    orig = dt[
      ,
      total := ifelse(seq_len(.N) == .N, sum(Diff, na.rm = TRUE), NA),
      by = Group
    ],
    merge = dt[
      dt[, sum(Diff, na.rm = TRUE), by = Group],
      on = .(Group),
      mult = "last",
      total := i.V1
    ]
  )
})


comparison

#$`Low Group; High Obs`
#Unit: milliseconds
#  expr      min       lq     mean   median       uq      max neval
#  orig 53.16160 58.00227 69.93443 60.08673 62.57489 191.1628   100
# merge 12.93931 15.15634 17.90187 15.56495 18.33738 147.9433   100
#
#$`High Group; High Obs`
#Unit: milliseconds
#  expr       min       lq      mean    median       uq      max neval
#  orig 143.60222 151.8497 161.65825 154.85638 158.2183 281.2311   100
# merge  23.18698  23.7380  29.20126  24.86465  29.9832 153.7919   100
#
#$`Low Group; Low Obs`
#Unit: milliseconds
#  expr      min       lq     mean   median       uq      max neval
#  orig 3.047569 3.190157 3.957012 3.378145 3.692857 8.087345   100
# merge 1.685882 1.808594 1.928094 1.846520 1.953369 5.998864   100
#
#$`High Group; Low Obs`
#Unit: milliseconds
#  expr       min        lq      mean    median        uq       max neval
#  orig 65.903991 68.727469 69.861163 69.857406 70.950330 76.351860   100
# merge  3.418077  3.595673  3.831805  3.855684  3.952869  5.069314   100
# Example data: three groups (5, 3 and 2 rows); the first Diff of each group
# is missing.
DF <- data.frame(
  Group = rep(c(1, 2, 3), times = c(5, 3, 2)),
  Diff = c(
    NA, -16055, -1313, 45707, 6569,
    NA, -7249, 2,
    NA, -384724
  )
)

#' Put a vector's total on its last position
#'
#' @param x A numeric vector (one group's values); may contain `NA`.
#' @return A vector of the same length as `x` that is `NA` everywhere except
#'   the last element, which holds `sum(x, na.rm = TRUE)`. A zero-length `x`
#'   is returned as a zero-length vector.
customSum <- function(x) {
  # Start from a copy of `x` so the result keeps its length and attributes.
  out <- x
  out[] <- NA
  # TRUE is spelled out: `T` is an ordinary variable and can be reassigned,
  # which would silently turn NA removal off.
  out[length(out)] <- sum(x, na.rm = TRUE)
  out
}
# Split Diff by Group, apply customSum to each piece, and reassemble the
# pieces in the original row order.
DF$Total <- unsplit(
  lapply(split(DF$Diff, DF$Group), customSum),
  DF$Group
)

> DF
   Group    Diff   Total
1      1      NA      NA
2      1  -16055      NA
3      1   -1313      NA
4      1   45707      NA
5      1    6569   34908
6      2      NA      NA
7      2   -7249      NA
8      2       2   -7247
9      3      NA      NA
10     3 -384724 -384724