R 在水平方向与垂直方向上进行变异
我试图使用mutate_at仅在某些列上应用函数 数据如下:R 在水平方向与垂直方向上进行变异,r,dplyr,tidyverse,cumsum,R,Dplyr,Tidyverse,Cumsum,我试图使用mutate_at仅在某些列上应用函数 数据如下: structure(list(LoB = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1", "2", "3", "4"), class = "factor"), AY = c(1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005), R_0
structure(list(LoB = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1", "2", "3", "4"), class = "factor"),
AY = c(1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
2003, 2004, 2005), R_0 = c(50135, 46530, 38295, 12033, 13332,
35064, 15695, 41227, 88360, 29500, 30158, 47589), R_1 = c(76631,
4908, 30427, 4268, 1994, 48426, 4585, 15578, 8112, 30945,
8141, 11594), R_2 = c(28763, 2634, 374, 0, 216, 0, 555, 0,
7161, 2192, 0, 772), R_3 = c(0, 1409, 470, 0, 203, 0, 0,
0, 0, 1556, 0, 675), R_4 = c(16433, 0, 436, 0, 202, 2115,
0, 0, 0, 1271, 0, 535), R_5 = c(6301, 0, 0, 0, 179, 0, 0,
0, 183, 1052, 0, 0), R_6 = c(0, 0, 0, 0, 147, 0, 0, 0, 0,
982, 0, 0), R_7 = c(0, 0, 0, 0, 135, 0, 0, 0, 0, 907, 2356,
0), R_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 902, 0, 0), R_9 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 833, 0, 0), R_10 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 800, 0, 0), R_11 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 684, 0, 0)), row.names = c(NA, -12L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = "LoB", drop = TRUE, indices = list(
0:11), group_sizes = 12L, biggest_group_size = 12L, labels = structure(list(
LoB = structure(1L, .Label = c("1", "2", "3", "4"), class = "factor")), row.names = c(NA,
-1L), class = "data.frame", vars = "LoB", drop = TRUE))
如下所示:
# A tibble: 12 x 14
# Groups: LoB [1]
LoB AY R_0 R_1 R_2 R_3 R_4 R_5 R_6 R_7 R_8 R_9 R_10 R_11
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1994 50135 76631 28763 0 16433 6301 0 0 0 0 0 0
2 1 1995 46530 4908 2634 1409 0 0 0 0 0 0 0 0
3 1 1996 38295 30427 374 470 436 0 0 0 0 0 0 0
4 1 1997 12033 4268 0 0 0 0 0 0 0 0 0 0
5 1 1998 13332 1994 216 203 202 179 147 135 0 0 0 0
6 1 1999 35064 48426 0 0 2115 0 0 0 0 0 0 0
7 1 2000 15695 4585 555 0 0 0 0 0 0 0 0 0
8 1 2001 41227 15578 0 0 0 0 0 0 0 0 0 0
9 1 2002 88360 8112 7161 0 0 183 0 0 0 0 0 0
10 1 2003 29500 30945 2192 1556 1271 1052 982 907 902 833 800 684
11 1 2004 30158 8141 0 0 0 0 0 2356 0 0 0 0
12 1 2005 47589 11594 772 675 535 0 0 0 0 0 0 0
这给了我以下输出:
# A tibble: 12 x 14
# Groups: LoB [1]
LoB AY R_0 R_1 R_2 R_3 R_4 R_5 R_6 R_7 R_8 R_9 R_10 R_11
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1994 50135 76631 28763 0 16433 6301 0 0 0 0 0 0
2 1 1995 96665 81539 31397 1409 16433 6301 0 0 0 0 0 0
3 1 1996 134960 111966 31771 1879 16869 6301 0 0 0 0 0 0
4 1 1997 146993 116234 31771 1879 16869 6301 0 0 0 0 0 0
5 1 1998 160325 118228 31987 2082 17071 6480 147 135 0 0 0 0
6 1 1999 195389 166654 31987 2082 19186 6480 147 135 0 0 0 0
7 1 2000 211084 171239 32542 2082 19186 6480 147 135 0 0 0 0
8 1 2001 252311 186817 32542 2082 19186 6480 147 135 0 0 0 0
9 1 2002 340671 194929 39703 2082 19186 6663 147 135 0 0 0 0
10 1 2003 370171 225874 41895 3638 20457 7715 1129 1042 902 833 800 684
11 1 2004 400329 234015 41895 3638 20457 7715 1129 3398 902 833 800 684
12 1 2005 447918 245609 42667 4313 20992 7715 1129 3398 902 833 800 684
这里的问题是,累积和是通过变量垂直完成的,而不是水平完成的。如何在dplyr中实现这一点?我不确定是否有不使用聚集和扩散的方法来实现这一点。我会这样做的。首先,我会将数据重塑为长数据,然后我们需要使用group_by,以便我们只计算原始data.frame中每一行的总和。如果这没有充分分组,我们可以向数据中添加一个行编号,并按该编号分组。在此之后,我们进行变异,然后传播,以使数据恢复到广域。最后,我们按照@Gregor的建议添加selectnamesdf以保留原始列顺序
df %>%
gather(variable, value, contains('R_')) %>% # reshape wide to long
group_by(LoB, AY) %>% # group by for each row in original data
mutate(value = cumsum(value)) %>% # calculate cumsum
spread(variable, value) %>% # reshape back from long to wide
select(names(df)) # added to retain original column order
# LoB AY R_0 R_1 R_2 R_3 ...
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> ...
# 1 1 1994 50135 126766 155529 155529 ...
# 2 1 1995 46530 51438 54072 55481 ...
# 3 1 1996 38295 68722 69096 69566 ...
# 4 1 1997 12033 16301 16301 16301 ...
我不确定有没有不使用“聚集和扩散”的方法来做到这一点。我会这样做的。首先,我会将数据重塑为长数据,然后我们需要使用group_by,以便我们只计算原始data.frame中每一行的总和。如果这没有充分分组,我们可以向数据中添加一个行编号,并按该编号分组。在此之后,我们进行变异,然后传播,以使数据恢复到广域。最后,我们按照@Gregor的建议添加selectnamesdf以保留原始列顺序
df %>%
gather(variable, value, contains('R_')) %>% # reshape wide to long
group_by(LoB, AY) %>% # group by for each row in original data
mutate(value = cumsum(value)) %>% # calculate cumsum
spread(variable, value) %>% # reshape back from long to wide
select(names(df)) # added to retain original column order
# LoB AY R_0 R_1 R_2 R_3 ...
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> ...
# 1 1 1994 50135 126766 155529 155529 ...
# 2 1 1995 46530 51438 54072 55481 ...
# 3 1 1996 38295 68722 69096 69566 ...
# 4 1 1997 12033 16301 16301 16301 ...
与bouncyball的答案类似,但它保持列顺序,并在LoB、AY不是主键的情况下使用通用id:
df %>%
mutate(id = 1:n()) %>%
gather(old_name, value, starts_with("R_")) %>%
arrange(id, nchar(old_name), old_name) %>%
group_by(id) %>%
mutate(value = cumsum(value)) %>%
ungroup() %>%
select(-id) %>%
spread(old_name, value) %>%
select(names(df)) %>%
select(AY, everything())
与bouncyball的答案类似,但它保持列顺序,并在LoB、AY不是主键的情况下使用通用id:
df %>%
mutate(id = 1:n()) %>%
gather(old_name, value, starts_with("R_")) %>%
arrange(id, nchar(old_name), old_name) %>%
group_by(id) %>%
mutate(value = cumsum(value)) %>%
ungroup() %>%
select(-id) %>%
spread(old_name, value) %>%
select(names(df)) %>%
select(AY, everything())
行操作通常对矩阵更有效。为了避免聚集/扩散的麻烦,我将提取R_uu列,使用apply隐式转换为矩阵,然后将结果分配回原始数据: 这就是说,数据似乎不是很整洁。你最好选择长格式,并保持长格式
result = dd %>% ungroup %>%
select(starts_with("R_")) %>%
apply(1, cumsum) %>%
t
dd[, grepl("^R_", names(dd))] = result
dd
# # A tibble: 12 x 14
# # Groups: LoB [1]
# LoB AY R_0 R_1 R_2 R_3 R_4 R_5 R_6 R_7 R_8 R_9 R_10 R_11
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1994 50135 126766 155529 155529 171962 178263 178263 178263 178263 178263 178263 178263
# 2 1 1995 46530 51438 54072 55481 55481 55481 55481 55481 55481 55481 55481 55481
# 3 1 1996 38295 68722 69096 69566 70002 70002 70002 70002 70002 70002 70002 70002
# 4 1 1997 12033 16301 16301 16301 16301 16301 16301 16301 16301 16301 16301 16301
# 5 1 1998 13332 15326 15542 15745 15947 16126 16273 16408 16408 16408 16408 16408
# 6 1 1999 35064 83490 83490 83490 85605 85605 85605 85605 85605 85605 85605 85605
# 7 1 2000 15695 20280 20835 20835 20835 20835 20835 20835 20835 20835 20835 20835
# 8 1 2001 41227 56805 56805 56805 56805 56805 56805 56805 56805 56805 56805 56805
# 9 1 2002 88360 96472 103633 103633 103633 103816 103816 103816 103816 103816 103816 103816
# 10 1 2003 29500 60445 62637 64193 65464 66516 67498 68405 69307 70140 70940 71624
# 11 1 2004 30158 38299 38299 38299 38299 38299 38299 40655 40655 40655 40655 40655
# 12 1 2005 47589 59183 59955 60630 61165 61165 61165 61165 61165 61165 61165 61165
行操作通常对矩阵更有效。为了避免聚集/扩散的麻烦,我将提取R_uu列,使用apply隐式转换为矩阵,然后将结果分配回原始数据: 这就是说,数据似乎不是很整洁。你最好选择长格式,并保持长格式
result = dd %>% ungroup %>%
select(starts_with("R_")) %>%
apply(1, cumsum) %>%
t
dd[, grepl("^R_", names(dd))] = result
dd
# # A tibble: 12 x 14
# # Groups: LoB [1]
# LoB AY R_0 R_1 R_2 R_3 R_4 R_5 R_6 R_7 R_8 R_9 R_10 R_11
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1994 50135 126766 155529 155529 171962 178263 178263 178263 178263 178263 178263 178263
# 2 1 1995 46530 51438 54072 55481 55481 55481 55481 55481 55481 55481 55481 55481
# 3 1 1996 38295 68722 69096 69566 70002 70002 70002 70002 70002 70002 70002 70002
# 4 1 1997 12033 16301 16301 16301 16301 16301 16301 16301 16301 16301 16301 16301
# 5 1 1998 13332 15326 15542 15745 15947 16126 16273 16408 16408 16408 16408 16408
# 6 1 1999 35064 83490 83490 83490 85605 85605 85605 85605 85605 85605 85605 85605
# 7 1 2000 15695 20280 20835 20835 20835 20835 20835 20835 20835 20835 20835 20835
# 8 1 2001 41227 56805 56805 56805 56805 56805 56805 56805 56805 56805 56805 56805
# 9 1 2002 88360 96472 103633 103633 103633 103816 103816 103816 103816 103816 103816 103816
# 10 1 2003 29500 60445 62637 64193 65464 66516 67498 68405 69307 70140 70940 71624
# 11 1 2004 30158 38299 38299 38299 38299 38299 38299 40655 40655 40655 40655 40655
# 12 1 2005 47589 59183 59955 60630 61165 61165 61165 61165 61165 61165 61165 61165
我认为它没有正确计算总和,因为聚集后的行顺序与列R_10在R_2之前的原始顺序不一致。我不同意。最终结果中列的顺序是不同的,而不是行的顺序。总和计算正确。通过检查采集线之后的数据,可以很容易地验证这一点。True。对不起,我弄错了。我假设gather会按字母顺序对行进行排序。我认为它无法正确计算总和,因为gather之后的行顺序与列R_10在R_2之前的原始顺序不一致。我不同意。最终结果中列的顺序是不同的,而不是行的顺序。总和计算正确。通过检查采集线之后的数据,可以很容易地验证这一点。True。对不起,我弄错了。我假设gather会按字母顺序对行进行排序。事实上,这样更好!我忘记了正确的顺序仍然存在于原始变量中。谢谢有了selectnamesdf,我认为selectAY,一切都是多余的,事实上,这是更好的!我忘记了正确的顺序仍然存在于原始变量中。谢谢有了selectnamesdf,我认为selectAY,一切都是多余的