R 在水平方向与垂直方向上进行变异

R 在水平方向与垂直方向上进行变异,r,dplyr,tidyverse,cumsum,R,Dplyr,Tidyverse,Cumsum,我试图使用mutate_at仅在某些列上应用函数 数据如下: structure(list(LoB = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1", "2", "3", "4"), class = "factor"), AY = c(1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005), R_0

我试图使用mutate_at仅在某些列上应用函数

数据如下:

structure(list(LoB = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L), .Label = c("1", "2", "3", "4"), class = "factor"), 
AY = c(1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 
2003, 2004, 2005), R_0 = c(50135, 46530, 38295, 12033, 13332, 
35064, 15695, 41227, 88360, 29500, 30158, 47589), R_1 = c(76631, 
4908, 30427, 4268, 1994, 48426, 4585, 15578, 8112, 30945, 
8141, 11594), R_2 = c(28763, 2634, 374, 0, 216, 0, 555, 0, 
7161, 2192, 0, 772), R_3 = c(0, 1409, 470, 0, 203, 0, 0, 
0, 0, 1556, 0, 675), R_4 = c(16433, 0, 436, 0, 202, 2115, 
0, 0, 0, 1271, 0, 535), R_5 = c(6301, 0, 0, 0, 179, 0, 0, 
0, 183, 1052, 0, 0), R_6 = c(0, 0, 0, 0, 147, 0, 0, 0, 0, 
982, 0, 0), R_7 = c(0, 0, 0, 0, 135, 0, 0, 0, 0, 907, 2356, 
0), R_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 902, 0, 0), R_9 = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 833, 0, 0), R_10 = c(0, 0, 0, 0, 
0, 0, 0, 0, 0, 800, 0, 0), R_11 = c(0, 0, 0, 0, 0, 0, 0, 
0, 0, 684, 0, 0)), row.names = c(NA, -12L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), vars = "LoB", drop = TRUE, indices = list(
0:11), group_sizes = 12L, biggest_group_size = 12L, labels = structure(list(
LoB = structure(1L, .Label = c("1", "2", "3", "4"), class = "factor")), row.names = c(NA, 
-1L), class = "data.frame", vars = "LoB", drop = TRUE))
如下所示:

# A tibble: 12 x 14
# Groups:   LoB [1]
  LoB      AY   R_0   R_1   R_2   R_3   R_4   R_5   R_6   R_7   R_8   R_9  R_10  R_11
  <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 1      1994 50135 76631 28763     0 16433  6301     0     0     0     0     0     0
 2 1      1995 46530  4908  2634  1409     0     0     0     0     0     0     0     0
 3 1      1996 38295 30427   374   470   436     0     0     0     0     0     0     0
 4 1      1997 12033  4268     0     0     0     0     0     0     0     0     0     0
 5 1      1998 13332  1994   216   203   202   179   147   135     0     0     0     0
 6 1      1999 35064 48426     0     0  2115     0     0     0     0     0     0     0
 7 1      2000 15695  4585   555     0     0     0     0     0     0     0     0     0
 8 1      2001 41227 15578     0     0     0     0     0     0     0     0     0     0
 9 1      2002 88360  8112  7161     0     0   183     0     0     0     0     0     0
10 1      2003 29500 30945  2192  1556  1271  1052   982   907   902   833   800   684
11 1      2004 30158  8141     0     0     0     0     0  2356     0     0     0     0
12 1      2005 47589 11594   772   675   535     0     0     0     0     0     0     0
这给了我以下输出:

# A tibble: 12 x 14
# Groups:   LoB [1]
   LoB      AY    R_0    R_1   R_2   R_3   R_4   R_5   R_6   R_7   R_8   R_9  R_10  R_11
   <fct> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 1      1994  50135  76631 28763     0 16433  6301     0     0     0     0     0     0
 2 1      1995  96665  81539 31397  1409 16433  6301     0     0     0     0     0     0
 3 1      1996 134960 111966 31771  1879 16869  6301     0     0     0     0     0     0
 4 1      1997 146993 116234 31771  1879 16869  6301     0     0     0     0     0     0
 5 1      1998 160325 118228 31987  2082 17071  6480   147   135     0     0     0     0
 6 1      1999 195389 166654 31987  2082 19186  6480   147   135     0     0     0     0
 7 1      2000 211084 171239 32542  2082 19186  6480   147   135     0     0     0     0
 8 1      2001 252311 186817 32542  2082 19186  6480   147   135     0     0     0     0
 9 1      2002 340671 194929 39703  2082 19186  6663   147   135     0     0     0     0
10 1      2003 370171 225874 41895  3638 20457  7715  1129  1042   902   833   800   684
11 1      2004 400329 234015 41895  3638 20457  7715  1129  3398   902   833   800   684
12 1      2005 447918 245609 42667  4313 20992  7715  1129  3398   902   833   800   684

这里的问题是,累积和是通过变量垂直完成的,而不是水平完成的。如何在dplyr中实现这一点?

我不确定是否有不使用聚集和扩散的方法来实现这一点。我会这样做的。首先,我会将数据重塑为长数据,然后我们需要使用group_by,以便我们只计算原始data.frame中每一行的总和。如果这没有充分分组,我们可以向数据中添加一个行编号,并按该编号分组。在此之后,我们进行变异,然后传播,以使数据恢复到广域。最后,我们按照@Gregor的建议添加selectnamesdf以保留原始列顺序

df %>%
    gather(variable, value, contains('R_')) %>% # reshape wide to long
    group_by(LoB, AY) %>% # group by for each row in original data
    mutate(value = cumsum(value)) %>% # calculate cumsum
    spread(variable, value) %>% # reshape back from long to wide
    select(names(df)) # added to retain original column order

#   LoB      AY   R_0    R_1    R_2    R_3 ...
#   <fct> <dbl> <dbl>  <dbl>  <dbl>  <dbl> ... 
# 1 1      1994 50135 126766 155529 155529 ...
# 2 1      1995 46530  51438  54072  55481 ...
# 3 1      1996 38295  68722  69096  69566 ... 
# 4 1      1997 12033  16301  16301  16301 ...

我不确定有没有不使用“聚集和扩散”的方法来做到这一点。我会这样做的。首先,我会将数据重塑为长数据,然后我们需要使用group_by,以便我们只计算原始data.frame中每一行的总和。如果这没有充分分组,我们可以向数据中添加一个行编号,并按该编号分组。在此之后,我们进行变异,然后传播,以使数据恢复到广域。最后,我们按照@Gregor的建议添加selectnamesdf以保留原始列顺序

df %>%
    gather(variable, value, contains('R_')) %>% # reshape wide to long
    group_by(LoB, AY) %>% # group by for each row in original data
    mutate(value = cumsum(value)) %>% # calculate cumsum
    spread(variable, value) %>% # reshape back from long to wide
    select(names(df)) # added to retain original column order

#   LoB      AY   R_0    R_1    R_2    R_3 ...
#   <fct> <dbl> <dbl>  <dbl>  <dbl>  <dbl> ... 
# 1 1      1994 50135 126766 155529 155529 ...
# 2 1      1995 46530  51438  54072  55481 ...
# 3 1      1996 38295  68722  69096  69566 ... 
# 4 1      1997 12033  16301  16301  16301 ...

与bouncyball的答案类似,但它保持列顺序,并在LoB、AY不是主键的情况下使用通用id:

df %>% 
  mutate(id = 1:n()) %>% 
  gather(old_name, value, starts_with("R_")) %>% 
  arrange(id, nchar(old_name), old_name) %>% 
  group_by(id) %>% 
  mutate(value = cumsum(value)) %>% 
  ungroup() %>% 
  select(-id) %>% 
  spread(old_name, value) %>% 
  select(names(df)) %>% 
  select(AY, everything())

与bouncyball的答案类似,但它保持列顺序,并在LoB、AY不是主键的情况下使用通用id:

df %>% 
  mutate(id = 1:n()) %>% 
  gather(old_name, value, starts_with("R_")) %>% 
  arrange(id, nchar(old_name), old_name) %>% 
  group_by(id) %>% 
  mutate(value = cumsum(value)) %>% 
  ungroup() %>% 
  select(-id) %>% 
  spread(old_name, value) %>% 
  select(names(df)) %>% 
  select(AY, everything())

行操作通常对矩阵更有效。为了避免聚集/扩散的麻烦,我将提取R_uu列,使用apply隐式转换为矩阵,然后将结果分配回原始数据:

这就是说,数据似乎不是很整洁。你最好选择长格式,并保持长格式

result = dd %>% ungroup %>%
  select(starts_with("R_")) %>%
  apply(1, cumsum) %>% 
  t

dd[, grepl("^R_", names(dd))] = result

dd
# # A tibble: 12 x 14
# # Groups:   LoB [1]
#    LoB      AY   R_0    R_1    R_2    R_3    R_4    R_5    R_6    R_7    R_8    R_9   R_10   R_11
#    <fct> <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#  1 1      1994 50135 126766 155529 155529 171962 178263 178263 178263 178263 178263 178263 178263
#  2 1      1995 46530  51438  54072  55481  55481  55481  55481  55481  55481  55481  55481  55481
#  3 1      1996 38295  68722  69096  69566  70002  70002  70002  70002  70002  70002  70002  70002
#  4 1      1997 12033  16301  16301  16301  16301  16301  16301  16301  16301  16301  16301  16301
#  5 1      1998 13332  15326  15542  15745  15947  16126  16273  16408  16408  16408  16408  16408
#  6 1      1999 35064  83490  83490  83490  85605  85605  85605  85605  85605  85605  85605  85605
#  7 1      2000 15695  20280  20835  20835  20835  20835  20835  20835  20835  20835  20835  20835
#  8 1      2001 41227  56805  56805  56805  56805  56805  56805  56805  56805  56805  56805  56805
#  9 1      2002 88360  96472 103633 103633 103633 103816 103816 103816 103816 103816 103816 103816
# 10 1      2003 29500  60445  62637  64193  65464  66516  67498  68405  69307  70140  70940  71624
# 11 1      2004 30158  38299  38299  38299  38299  38299  38299  40655  40655  40655  40655  40655
# 12 1      2005 47589  59183  59955  60630  61165  61165  61165  61165  61165  61165  61165  61165

行操作通常对矩阵更有效。为了避免聚集/扩散的麻烦,我将提取R_uu列,使用apply隐式转换为矩阵,然后将结果分配回原始数据:

这就是说,数据似乎不是很整洁。你最好选择长格式,并保持长格式

result = dd %>% ungroup %>%
  select(starts_with("R_")) %>%
  apply(1, cumsum) %>% 
  t

dd[, grepl("^R_", names(dd))] = result

dd
# # A tibble: 12 x 14
# # Groups:   LoB [1]
#    LoB      AY   R_0    R_1    R_2    R_3    R_4    R_5    R_6    R_7    R_8    R_9   R_10   R_11
#    <fct> <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#  1 1      1994 50135 126766 155529 155529 171962 178263 178263 178263 178263 178263 178263 178263
#  2 1      1995 46530  51438  54072  55481  55481  55481  55481  55481  55481  55481  55481  55481
#  3 1      1996 38295  68722  69096  69566  70002  70002  70002  70002  70002  70002  70002  70002
#  4 1      1997 12033  16301  16301  16301  16301  16301  16301  16301  16301  16301  16301  16301
#  5 1      1998 13332  15326  15542  15745  15947  16126  16273  16408  16408  16408  16408  16408
#  6 1      1999 35064  83490  83490  83490  85605  85605  85605  85605  85605  85605  85605  85605
#  7 1      2000 15695  20280  20835  20835  20835  20835  20835  20835  20835  20835  20835  20835
#  8 1      2001 41227  56805  56805  56805  56805  56805  56805  56805  56805  56805  56805  56805
#  9 1      2002 88360  96472 103633 103633 103633 103816 103816 103816 103816 103816 103816 103816
# 10 1      2003 29500  60445  62637  64193  65464  66516  67498  68405  69307  70140  70940  71624
# 11 1      2004 30158  38299  38299  38299  38299  38299  38299  40655  40655  40655  40655  40655
# 12 1      2005 47589  59183  59955  60630  61165  61165  61165  61165  61165  61165  61165  61165

我认为它没有正确计算总和,因为聚集后的行顺序与列R_10在R_2之前的原始顺序不一致。我不同意。最终结果中列的顺序是不同的,而不是行的顺序。总和计算正确。通过检查采集线之后的数据,可以很容易地验证这一点。True。对不起,我弄错了。我假设gather会按字母顺序对行进行排序。我认为它无法正确计算总和,因为gather之后的行顺序与列R_10在R_2之前的原始顺序不一致。我不同意。最终结果中列的顺序是不同的,而不是行的顺序。总和计算正确。通过检查采集线之后的数据,可以很容易地验证这一点。True。对不起,我弄错了。我假设gather会按字母顺序对行进行排序。事实上,这样更好!我忘记了正确的顺序仍然存在于原始变量中。谢谢有了selectnamesdf,我认为selectAY,一切都是多余的,事实上,这是更好的!我忘记了正确的顺序仍然存在于原始变量中。谢谢有了selectnamesdf,我认为selectAY,一切都是多余的