使用dplyr按组按最低和最高年份过滤
我觉得这里的答案很明显,但我无法确定。我有这个数据框:使用dplyr按组按最低和最高年份过滤,r,filter,dplyr,R,Filter,Dplyr,我觉得这里的答案很明显,但我无法确定。我有这个数据框: df <- structure(list(SIC = c(3L, 12L, 11L, 7L, 18L, 5L, 19L, 17L, 1L, 10L, 8L, 16L, 14L, 2L, 15L, 6L, 9L, 4L, 13L, 3L, 12L, 11L, 7L, 18L, 5L, 19L, 17L, 1L, 10L, 8L, 16L, 14L, 2L, 15L, 6L, 9L, 4L, 13L, 3L, 12L, 11L, 7
# Example data for the question: a tibble (class "tbl_df") with 152 rows
# and three columns:
#   SIC   - integer group identifier (the values 1L..19L, repeated per year)
#   year  - numeric year, 2011 through 2018
#   value - numeric measurement; contains NA entries
# NOTE(review): values from 2016 onward look like they are on a different
# scale (e.g. 21 versus 0.218 in earlier years) - confirm units before
# comparing values across years.
df <- structure(list(SIC = c(3L, 12L, 11L, 7L, 18L, 5L, 19L, 17L, 1L,
10L, 8L, 16L, 14L, 2L, 15L, 6L, 9L, 4L, 13L, 3L, 12L, 11L, 7L,
18L, 5L, 19L, 17L, 1L, 10L, 8L, 16L, 14L, 2L, 15L, 6L, 9L, 4L,
13L, 3L, 12L, 11L, 7L, 18L, 5L, 19L, 17L, 1L, 10L, 8L, 16L, 14L,
2L, 15L, 6L, 9L, 4L, 13L, 3L, 12L, 11L, 7L, 18L, 5L, 19L, 17L,
1L, 10L, 8L, 16L, 14L, 2L, 15L, 6L, 9L, 4L, 13L, 3L, 12L, 11L,
7L, 18L, 5L, 19L, 17L, 1L, 10L, 8L, 16L, 14L, 2L, 15L, 6L, 9L,
4L, 13L, 3L, 12L, 11L, 7L, 18L, 5L, 19L, 17L, 1L, 10L, 8L, 16L,
14L, 2L, 15L, 6L, 9L, 4L, 13L, 3L, 12L, 11L, 7L, 18L, 5L, 19L,
17L, 1L, 10L, 8L, 16L, 14L, 2L, 15L, 6L, 9L, 4L, 13L, 3L, 12L,
11L, 7L, 18L, 5L, 19L, 17L, 1L, 10L, 8L, 16L, 14L, 2L, 15L, 6L,
9L, 4L, 13L), year = c(2011, 2011, 2011, 2011, 2011, 2011, 2011,
2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2011, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013,
2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,
2013, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014, 2014, 2014,
2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014,
2014, 2014, 2014, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,
2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016,
2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2018, 2018, 2018, 2018, 2018, 2018,
2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018,
2018, 2018), value = c(NA, 0.081, 0.218, 0.212, NA, 0.092, 0.142,
0.001, 0.045, 0.143, 0.361, 0.175, 0.295, 0.003, 0.146, 0.01,
0.163, NA, 0.225, NA, 0.108, 0.274, 0.219, NA, 0.097, 0.148,
-0.049, 0.098, 0.26, 0.251, 0.153, 0.262, 0.119, 0.096, 0, 0.149,
NA, NA, NA, 0.064, 0.27, 0.16, NA, 0.103, 0.148, -0.029, 0.084,
0.219, 0.314, 0.142, 0.255, 0.026, 0.031, -0.003, 0.164, NA,
NA, NA, NA, 0.257394804, 0.124025397, NA, 0.071727544, 0.13439,
-0.017419321, 0.091993981, 0.171021874, 0.308369685, 0.208573024,
0.310316421, 0.06216349, 0.074913633, -0.034273066, 0.181129287,
0.07876301, 0.121, NA, -0.063226494, 0.233968039, 0.179367136,
NA, 0.105362761, 0.15319907, -0.033967241, -0.035027867, 0.144316565,
0.304955404, 0.069662044, 0.304262651, 0.075256422, 0.051273353,
-0.067541918, 0.157814304, 0.050231459, 0.06308377, NA, -8.4,
21, 17.9, NA, 7.3, 12.6, -1.2, 4.1, 10.3, 30.2, 8.7, 28.1, 4.7,
1.3, -7.7, 12.5, 15.9, 19.4, 16.9, 4, 18.2, 13.5, NA, 10.9, 12.8,
-0.7, 4.2, 7.5, 26.8, 5, 30.3, 0.9, 2.5, -2.4, 13.5, 12.8, 17,
NA, NA, 17.2, 17.7, NA, 0.6, 11.6, -2.9, 3, 18.7, 31, 6.2, 30.1,
-1.1, 5.7, -0.5, 13.6, 6.1, -7)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -152L))
有什么想法吗?谢谢
编辑:
数据的一个简单版本是这样过滤:
# Minimal version of the problem: two SIC groups observed over the same
# three years, with some values missing. Only rows that have a value are
# kept, so SIC 2 ends up with a single row.
tibble(
  SIC = rep(c(1, 2), each = 3),
  year = rep(c(2011, 2012, 2013), times = 2),
  value = c(3, 4, NA, NA, 4, NA)
) %>%
  filter(!is.na(value))
# A tibble: 3 x 3
SIC year value
<dbl> <dbl> <dbl>
1 1 2011 3
2 1 2012 4
3 2 2012 4
所有只出现一次的行应同时计为最小值和最大值。有没有办法为应用过滤器后只出现一次的行创建一条重复行?我的理解是:当某个SIC的最小年份和最大年份相同时,您仍希望它有两行。我想您可以分别取出最小年份和最大年份的行,再把它们按行合并,这样该SIC仍然有两行
library(dplyr)

# For each SIC, the row(s) at the earliest year that has a non-missing value.
# Rows with a missing value are dropped first so they cannot define the
# minimum year.
min_year <- df %>%
  filter(!is.na(value)) %>%
  group_by(SIC) %>%
  filter(year %in% min(year))

# For each SIC, the row(s) at the latest year that has a non-missing value.
max_year <- df %>%
  filter(!is.na(value)) %>%
  group_by(SIC) %>%
  filter(year %in% max(year))

# Stack the two results. A SIC whose earliest and latest years coincide
# appears in both inputs and therefore twice in `total`.
total <- rbind(min_year, max_year)
我的理解是:当某个SIC的最小年份和最大年份相同时,您仍希望它有两行。我想您可以分别取出最小年份和最大年份的行,再把它们按行合并,这样该SIC仍然有两行
library(dplyr)

# For each SIC, the row(s) at the earliest year that has a non-missing value.
# Rows with a missing value are dropped first so they cannot define the
# minimum year.
min_year <- df %>%
  filter(!is.na(value)) %>%
  group_by(SIC) %>%
  filter(year %in% min(year))

# For each SIC, the row(s) at the latest year that has a non-missing value.
max_year <- df %>%
  filter(!is.na(value)) %>%
  group_by(SIC) %>%
  filter(year %in% max(year))

# Stack the two results. A SIC whose earliest and latest years coincide
# appears in both inputs and therefore twice in `total`.
total <- rbind(min_year, max_year)
尝试使用slice:
输出:
# A tibble: 4 x 3
# Groups: SIC [2]
SIC year value
<dbl> <dbl> <dbl>
1 1 2011 3
2 1 2012 4
3 2 2012 4
4 2 2012 4
也就是说,slice 会重复该年份的行,而 filter 只保留与参数对应的内容:
# A tibble: 35 x 3
# Groups: SIC [18]
SIC year value
<int> <dbl> <dbl>
1 1 2011 0.045
2 1 2018 3
3 2 2011 0.003
4 2 2018 -1.1
5 3 2017 16.9
6 4 2014 0.0788
我正在使用dplyr 0.8。尝试使用slice:
输出:
# A tibble: 4 x 3
# Groups: SIC [2]
SIC year value
<dbl> <dbl> <dbl>
1 1 2011 3
2 1 2012 4
3 2 2012 4
4 2 2012 4
也就是说,slice 会重复该年份的行,而 filter 只保留与参数对应的内容:
# A tibble: 35 x 3
# Groups: SIC [18]
SIC year value
<int> <dbl> <dbl>
1 1 2011 0.045
2 1 2018 3
3 2 2011 0.003
4 2 2018 -1.1
5 3 2017 16.9
6 4 2014 0.0788
我正在使用dplyr 0.8。我认为这就是您希望实现的目标。希望有帮助:
# Row holding the largest value within each year.
max.vals <- slice(group_by(df, year), which.max(value))

# Row holding the smallest value within each year.
min.vals <- slice(group_by(df, year), which.min(value))

# Combine both results into one data frame with a row per year.
# The columns are paired by position, so this assumes max.vals and
# min.vals contain the same years in the same order. SIC is taken from
# max.vals, i.e. it identifies the SIC of max.value only.
clean.df <- data.frame(
  SIC = max.vals[["SIC"]],
  year = max.vals[["year"]],
  max.value = max.vals[["value"]],
  min.value = min.vals[["value"]],
  stringsAsFactors = FALSE
)
> head(clean.df)
SIC year max.value min.value
1 8 2011 0.3610000 0.00100000
2 11 2012 0.2740000 -0.04900000
3 8 2013 0.3140000 -0.02900000
4 14 2014 0.3103164 -0.03427307
5 8 2015 0.3049554 -0.06754192
6 8 2016 30.2000000 -8.40000000
编辑
抱歉,我刚刚才注意到你想要按SIC分组。别担心,希望这就是你需要的:
# Row holding the smallest value within each SIC.
SIC.low <- slice(group_by(df, SIC), which.min(value))

# Row holding the largest value within each SIC.
SIC.high <- slice(group_by(df, SIC), which.max(value))

# One row per SIC: the year and value of its maximum next to the year and
# value of its minimum. The columns are paired by position, so this assumes
# SIC.high and SIC.low contain the same SICs in the same order.
clean.df2 <- data.frame(
  SIC = SIC.high[["SIC"]],
  year.high = SIC.high[["year"]],
  max.value = SIC.high[["value"]],
  year.low = SIC.low[["year"]],
  min.value = SIC.low[["value"]],
  stringsAsFactors = FALSE
)
> head(clean.df2)
SIC year.high max.value year.low min.value
1 1 2017 4.20 2015 -0.03502787
2 2 2016 4.70 2018 -1.10000000
3 3 2017 16.90 2017 16.90000000
4 4 2016 15.90 2015 0.05023146
5 5 2017 10.90 2014 0.07172754
6 6 2011 0.01 2016 -7.70000000
我认为这就是你希望实现的目标。希望有帮助:
# Row holding the largest value within each year.
max.vals <- slice(group_by(df, year), which.max(value))

# Row holding the smallest value within each year.
min.vals <- slice(group_by(df, year), which.min(value))

# Combine both results into one data frame with a row per year.
# The columns are paired by position, so this assumes max.vals and
# min.vals contain the same years in the same order. SIC is taken from
# max.vals, i.e. it identifies the SIC of max.value only.
clean.df <- data.frame(
  SIC = max.vals[["SIC"]],
  year = max.vals[["year"]],
  max.value = max.vals[["value"]],
  min.value = min.vals[["value"]],
  stringsAsFactors = FALSE
)
> head(clean.df)
SIC year max.value min.value
1 8 2011 0.3610000 0.00100000
2 11 2012 0.2740000 -0.04900000
3 8 2013 0.3140000 -0.02900000
4 14 2014 0.3103164 -0.03427307
5 8 2015 0.3049554 -0.06754192
6 8 2016 30.2000000 -8.40000000
编辑
抱歉,我刚刚才注意到你想要按SIC分组。别担心,希望这就是你需要的:
# Row holding the smallest value within each SIC.
SIC.low <- slice(group_by(df, SIC), which.min(value))

# Row holding the largest value within each SIC.
SIC.high <- slice(group_by(df, SIC), which.max(value))

# One row per SIC: the year and value of its maximum next to the year and
# value of its minimum. The columns are paired by position, so this assumes
# SIC.high and SIC.low contain the same SICs in the same order.
clean.df2 <- data.frame(
  SIC = SIC.high[["SIC"]],
  year.high = SIC.high[["year"]],
  max.value = SIC.high[["value"]],
  year.low = SIC.low[["year"]],
  min.value = SIC.low[["value"]],
  stringsAsFactors = FALSE
)
> head(clean.df2)
SIC year.high max.value year.low min.value
1 1 2017 4.20 2015 -0.03502787
2 2 2016 4.70 2018 -1.10000000
3 3 2017 16.90 2017 16.90000000
4 4 2016 15.90 2015 0.05023146
5 5 2017 10.90 2014 0.07172754
6 6 2011 0.01 2016 -7.70000000
你能发布你的预期输出吗?@arg0naut,请看我的编辑。@elliot请看下面,我认为slice应该这样做,因为它会自动重复行。你能发布你的预期输出吗?@arg0naut,请看我的编辑。@elliot请看下面,我认为slice应该这样做,因为它会自动重复行。谢谢这项工作。注意,我确实需要更新我的dplyr,以便在slice中使用两个变量。不客气,谢谢你让我知道,我认为它在0.8之前就已经存在了-我已经更新了我的帖子。谢谢你的帮助。注意,我确实需要更新我的dplyr以在slice中使用两个变量。不客气,谢谢你让我知道,我认为它在0.8之前就已经存在了-我已经更新了我的帖子。