使用 rsample 的嵌套交叉验证(nested CV)——不确定分组超过 2 个时的结果
我运行以下命令:使用 rsample 的嵌套交叉验证(nested CV)——不确定分组超过 2 个时的结果,r,R,我运行以下命令: data(sunspots) data <- sunspots %>% as_tibble() %>% mutate(category = sample(c('male', 'female'), nrow(.), replace=TRUE)) data library(rsample) periods_train <- 100 periods_test <- 1 skip_span <- 0 cv_rolling
# Attach the packages before first use. dplyr supplies `%>%`, `as_tibble()`
# and `mutate()`, which the pipeline below calls; rsample supplies
# `nested_cv()`, `group_vfold_cv()` and `rolling_origin()`.
library(dplyr)
library(rsample)

# Load the built-in `sunspots` dataset and attach a randomly drawn two-level
# `category` label to every row.
# NOTE: `sample()` is not seeded, so the labels differ from run to run.
data(sunspots)
data <- sunspots %>%
  as_tibble() %>%
  mutate(category = sample(c("male", "female"), nrow(.), replace = TRUE))
data

# Settings forwarded to `rolling_origin()` below.
periods_train <- 100 # `initial`: rows in each inner analysis window
periods_test <- 1 # `assess`: rows in each inner assessment window
skip_span <- 0 # `skip`

# Nested resampling: the outer layer is grouped V-fold CV on `category`, the
# inner layer is a rolling-origin scheme with a non-growing window
# (`cumulative = FALSE`). The resampling calls are written inline and without
# a data argument, as in the original call.
cv_rolling <- nested_cv(
  data,
  outside = group_vfold_cv(group = "category"),
  inside = rolling_origin(
    initial = periods_train,
    assess = periods_test,
    cumulative = FALSE,
    skip = skip_span
  )
)
> cv_rolling$inner_resamples
$`1`
# Rolling origin forecast resampling
# A tibble: 1,311 x 2
splits id
<list> <chr>
1 <split [100/1]> Slice0001
2 <split [100/1]> Slice0002
3 <split [100/1]> Slice0003
4 <split [100/1]> Slice0004
5 <split [100/1]> Slice0005
6 <split [100/1]> Slice0006
7 <split [100/1]> Slice0007
8 <split [100/1]> Slice0008
9 <split [100/1]> Slice0009
10 <split [100/1]> Slice0010
# ... with 1,301 more rows
$`2`
# Rolling origin forecast resampling
# A tibble: 1,309 x 2
splits id
<list> <chr>
1 <split [100/1]> Slice0001
2 <split [100/1]> Slice0002
3 <split [100/1]> Slice0003
4 <split [100/1]> Slice0004
5 <split [100/1]> Slice0005
6 <split [100/1]> Slice0006
7 <split [100/1]> Slice0007
8 <split [100/1]> Slice0008
9 <split [100/1]> Slice0009
10 <split [100/1]> Slice0010
# ... with 1,299 more rows
这给了我:
# A tibble: 2,820 x 2
x category
<dbl> <chr>
1 58 male
2 62.6 male
3 70 male
4 55.7 male
5 85 male
6 83.5 female
7 94.8 female
8 66.3 female
9 75.9 male
10 75.5 female
# ... with 2,810 more rows
[1] "nested_cv" "group_vfold_cv" "rset" "tbl_df" "tbl" "data.frame"
# Nested resampling:
# outer: Group -fold cross-validation
# inner: Rolling origin forecast resampling
# A tibble: 2 x 3
splits id inner_resamples
<named list> <chr> <named list>
1 <split [1.4K/1.4K]> Resample1 <tibble [1,311 x 2]>
2 <split [1.4K/1.4K]> Resample2 <tibble [1,309 x 2]>
这似乎也为 `male` 给出了正确的输出(把 `1` 换成 `2` 则得到 `female` 的输出)。作为抽查,我还做了如下检查:
第一段代码返回了结果,第二段没有(这在意料之中,因为我查看的只是 `male` 对应的第 `1` 个 split。)
问题
我回到开头,新增一个类别 `other`:
# Rebuild `data`, this time drawing the random label from three categories.
# NOTE(review): assumes dplyr is already attached in the session, since
# `%>%`, `as_tibble()` and `mutate()` are used without a library() call here.
data(sunspots)
data <- sunspots %>%
  as_tibble() %>%
  mutate(
    category = sample(c("male", "female", "other"), nrow(.), replace = TRUE)
  )
最后:
# Spot check: take the 650th rolling-origin slice from the inner resamples of
# outer fold `1`, extract its analysis rows, and keep one category at a time.
# NOTE(review): grouped V-fold CV appears to hold one category out per outer
# fold, so the inner resamples are built from the remaining categories. With
# three categories, two of these filters return rows and the one for the
# held-out category returns none.
cv_rolling$inner_resamples[["1"]]$splits[[650]] %>%
  analysis() %>%
  filter(category == "male")

cv_rolling$inner_resamples[["1"]]$splits[[650]] %>%
  analysis() %>%
  filter(category == "other")

cv_rolling$inner_resamples[["1"]]$splits[[650]] %>%
  analysis() %>%
  filter(category == "female")
它返回了 `other` 和 `female` 的结果,但没有返回 `male` 的结果。
我做错了什么?如果我想增加类别的数量,该如何对每个类别分别执行 `rolling_origin`?
# `%>%`, `as_tibble()` and `mutate()` come from dplyr; attach it so this
# script also runs in a fresh session.
library(dplyr)

# Load the built-in `sunspots` dataset and label every row with one of three
# randomly drawn categories.
# NOTE: `sample()` is not seeded, so the labels differ from run to run.
data(sunspots)
data <- sunspots %>%
  as_tibble() %>%
  mutate(
    category = sample(c("male", "female", "other"), nrow(.), replace = TRUE)
  )
> data
# A tibble: 2,820 x 2
x category
<dbl> <chr>
1 58 other
2 62.6 male
3 70 female
4 55.7 male
5 85 male
6 83.5 male
7 94.8 other
8 66.3 other
9 75.9 female
10 75.5 male
# ... with 2,810 more rows
library(rsample)

# Settings forwarded to `rolling_origin()` below.
periods_train <- 100 # `initial`: rows in each inner analysis window
periods_test <- 1 # `assess`: rows in each inner assessment window
skip_span <- 0 # `skip`

# Outer layer: grouped V-fold CV on `category`. Inner layer: rolling origin
# with a non-growing window (`cumulative = FALSE`).
# The printed outer splits (about 1.9K analysis rows against about 940
# assessment rows out of 2,820) are consistent with each outer fold holding
# out ONE category and keeping the other two for analysis. The inner
# rolling-origin slices are cut from that two-category analysis set, so they
# mix rows from both remaining categories.
# NOTE(review): to run rolling_origin() separately within each category,
# split the data by `category` first and apply rolling_origin() to each piece
# instead of relying on group_vfold_cv() as the outer layer.
cv_rolling <- nested_cv(
  data,
  outside = group_vfold_cv(group = "category"),
  inside = rolling_origin(
    initial = periods_train,
    assess = periods_test,
    cumulative = FALSE,
    skip = skip_span
  )
)
cv_rolling
[1] "nested_cv" "group_vfold_cv" "rset" "tbl_df" "tbl" "data.frame"
# Nested resampling:
# outer: Group -fold cross-validation
# inner: Rolling origin forecast resampling
# A tibble: 3 x 3
splits id inner_resamples
<named list> <chr> <named list>
1 <split [1.9K/961]> Resample1 <tibble [1,759 x 2]>
2 <split [1.9K/939]> Resample2 <tibble [1,781 x 2]>
3 <split [1.9K/920]> Resample3 <tibble [1,800 x 2]>
> cv_rolling$inner_resamples
$`1`
# Rolling origin forecast resampling
# A tibble: 1,759 x 2
splits id
<list> <chr>
1 <split [100/1]> Slice0001
2 <split [100/1]> Slice0002
3 <split [100/1]> Slice0003
4 <split [100/1]> Slice0004
5 <split [100/1]> Slice0005
6 <split [100/1]> Slice0006
7 <split [100/1]> Slice0007
8 <split [100/1]> Slice0008
9 <split [100/1]> Slice0009
10 <split [100/1]> Slice0010
# ... with 1,749 more rows
$`2`
# Rolling origin forecast resampling
# A tibble: 1,781 x 2
splits id
<list> <chr>
1 <split [100/1]> Slice0001
2 <split [100/1]> Slice0002
3 <split [100/1]> Slice0003
4 <split [100/1]> Slice0004
5 <split [100/1]> Slice0005
6 <split [100/1]> Slice0006
7 <split [100/1]> Slice0007
8 <split [100/1]> Slice0008
9 <split [100/1]> Slice0009
10 <split [100/1]> Slice0010
# ... with 1,771 more rows
$`3`
# Rolling origin forecast resampling
# A tibble: 1,800 x 2
splits id
<list> <chr>
1 <split [100/1]> Slice0001
2 <split [100/1]> Slice0002
3 <split [100/1]> Slice0003
4 <split [100/1]> Slice0004
5 <split [100/1]> Slice0005
6 <split [100/1]> Slice0006
7 <split [100/1]> Slice0007
8 <split [100/1]> Slice0008
9 <split [100/1]> Slice0009
10 <split [100/1]> Slice0010
# ... with 1,790 more rows
> map(cv_rolling$inner_resamples$`2`$splits, ~ analysis(.x)) %>%
+ head()
[[1]]
# A tibble: 100 x 2
x category
<dbl> <chr>
1 58 other
2 62.6 male
3 55.7 male
4 85 male
5 83.5 male
6 94.8 other
7 66.3 other
8 75.5 male
9 85.2 male
10 73.3 other
# ... with 90 more rows
# Inspect inner slice 650 of outer fold `1`, one category per call. Each
# expression pulls the slice's analysis rows and keeps a single category.
# NOTE(review): the category held out by outer fold `1` is expected to give
# an empty result, because the inner resamples are built only from the
# outer fold's analysis data.
cv_rolling$inner_resamples[["1"]]$splits[[650]] %>%
  analysis() %>%
  filter(category == "male")

cv_rolling$inner_resamples[["1"]]$splits[[650]] %>%
  analysis() %>%
  filter(category == "other")

cv_rolling$inner_resamples[["1"]]$splits[[650]] %>%
  analysis() %>%
  filter(category == "female")