rsample 的嵌套交叉验证(nested CV)——分组超过 2 个时不确定结果是否正确

rsample 的嵌套交叉验证(nested CV)——分组超过 2 个时不确定结果是否正确,r,R,我运行以下命令: data(sunspots) data <- sunspots %>% as_tibble() %>% mutate(category = sample(c('male', 'female'), nrow(.), replace=TRUE)) data library(rsample) periods_train <- 100 periods_test <- 1 skip_span <- 0 cv_rolling

我运行以下命令:

# Load the built-in sunspots series, convert it to a tibble, and tag every
# observation with a randomly drawn two-level `category` label.
data(sunspots)
data <- mutate(
  as_tibble(sunspots),
  # n() is the row count of the tibble being mutated, so one label is drawn
  # per observation.
  category = sample(c("male", "female"), n(), replace = TRUE)
)

# Show the labelled data.
data
library(rsample)
# Settings forwarded to rolling_origin() below: rows per analysis window,
# rows per assessment window, and the `skip` value.
periods_train <- 100
periods_test  <- 1
skip_span     <- 0


# Build the nested resampling object: the outer layer is group_vfold_cv()
# grouped on the `category` column, the inner layer is rolling_origin().
# With cumulative = FALSE every inner slice printed for this object is a
# fixed <split [100/1]> (periods_train analysis rows, periods_test assessment
# row).
# NOTE(review): the inner tibbles hold ~1.3K slices while the data has 2,820
# rows, which suggests the inner resamples are built from each outer analysis
# set (i.e. the data minus the held-out group) -- confirm in the rsample docs.
cv_rolling <- nested_cv(data, 
                        outside = group_vfold_cv(group = "category"),
                        inside = rolling_origin(
                          initial    = periods_train,
                          assess     = periods_test,
                          cumulative = FALSE,
                          skip       = skip_span))
> cv_rolling$inner_resamples
$`1`
# Rolling origin forecast resampling 
# A tibble: 1,311 x 2
   splits          id       
   <list>          <chr>    
 1 <split [100/1]> Slice0001
 2 <split [100/1]> Slice0002
 3 <split [100/1]> Slice0003
 4 <split [100/1]> Slice0004
 5 <split [100/1]> Slice0005
 6 <split [100/1]> Slice0006
 7 <split [100/1]> Slice0007
 8 <split [100/1]> Slice0008
 9 <split [100/1]> Slice0009
10 <split [100/1]> Slice0010
# ... with 1,301 more rows

$`2`
# Rolling origin forecast resampling 
# A tibble: 1,309 x 2
   splits          id       
   <list>          <chr>    
 1 <split [100/1]> Slice0001
 2 <split [100/1]> Slice0002
 3 <split [100/1]> Slice0003
 4 <split [100/1]> Slice0004
 5 <split [100/1]> Slice0005
 6 <split [100/1]> Slice0006
 7 <split [100/1]> Slice0007
 8 <split [100/1]> Slice0008
 9 <split [100/1]> Slice0009
10 <split [100/1]> Slice0010
# ... with 1,299 more rows
这给了我:

# A tibble: 2,820 x 2
       x category
   <dbl> <chr>   
 1  58   male    
 2  62.6 male    
 3  70   male    
 4  55.7 male    
 5  85   male    
 6  83.5 female  
 7  94.8 female  
 8  66.3 female  
 9  75.9 male    
10  75.5 female  
# ... with 2,810 more rows
[1] "nested_cv"      "group_vfold_cv" "rset"           "tbl_df"         "tbl"            "data.frame"    
# Nested resampling:
#  outer: Group -fold cross-validation
#  inner: Rolling origin forecast resampling
# A tibble: 2 x 3
  splits              id        inner_resamples     
  <named list>        <chr>     <named list>        
1 <split [1.4K/1.4K]> Resample1 <tibble [1,311 x 2]>
2 <split [1.4K/1.4K]> Resample2 <tibble [1,309 x 2]>
这似乎也为 `male` 给出了正确的输出(将 `1` 替换为 `2` 则得到 `female` 的输出)。作为随机检查,我还运行了以下代码:

第一段代码返回了结果,第二段没有(这在意料之中,因为我分析的只是 male 所在的 `1` 号划分)。

问题

我回到开头,新增一个类别 `other`:

# Rebuild the labelled sunspots tibble, this time drawing the random
# `category` label from three levels instead of two.
data(sunspots)
data <- mutate(
  as_tibble(sunspots),
  # n() is the row count of the tibble being mutated, so one label is drawn
  # per observation.
  category = sample(c("male", "female", "other"), n(), replace = TRUE)
)
最后:

# Spot check: take the analysis set of inner slice 650 of outer resample `1`
# and list its rows for each category in turn.
filter(
  analysis(cv_rolling$inner_resamples$`1`$splits[[650]]),
  category == "male"
)

filter(
  analysis(cv_rolling$inner_resamples$`1`$splits[[650]]),
  category == "other"
)

filter(
  analysis(cv_rolling$inner_resamples$`1`$splits[[650]]),
  category == "female"
)
它返回了 `other` 和 `female` 的结果,但没有返回 `male` 的结果。

我做错了什么?如果我想增加类别的数量,该如何对每个类别分别执行 `rolling_origin`?

# Labelled sunspots tibble with a random three-level `category` column
# (male / female / other), one label per observation.
data(sunspots)
data <- mutate(
  as_tibble(sunspots),
  # n() is the row count of the tibble being mutated.
  category = sample(c("male", "female", "other"), n(), replace = TRUE)
)
> data
# A tibble: 2,820 x 2
       x category
   <dbl> <chr>   
 1  58   other   
 2  62.6 male    
 3  70   female  
 4  55.7 male    
 5  85   male    
 6  83.5 male    
 7  94.8 other   
 8  66.3 other   
 9  75.9 female  
10  75.5 male    
# ... with 2,810 more rows
library(rsample)
# Settings forwarded to rolling_origin() below: rows per analysis window,
# rows per assessment window, and the `skip` value.
periods_train <- 100
periods_test  <- 1
skip_span     <- 0


# Build the nested resampling object for the three-category data: the outer
# layer is group_vfold_cv() grouped on `category`, the inner layer is
# rolling_origin() producing fixed <split [100/1]> slices.
# NOTE(review): the printed analysis set of inner resample `2` contains both
# `other` and `male` rows, so an inner resample here is NOT restricted to a
# single category -- it appears to cover every category except the one held
# out by the outer split. That would explain why filtering an inner slice for
# one particular category can come back empty; confirm against the rsample
# group_vfold_cv()/nested_cv() documentation.
cv_rolling <- nested_cv(data, 
                        outside = group_vfold_cv(group = "category"),
                        inside = rolling_origin(
                          initial    = periods_train,
                          assess     = periods_test,
                          cumulative = FALSE,
                          skip       = skip_span))

# Print the nested resampling summary.
cv_rolling
[1] "nested_cv"      "group_vfold_cv" "rset"           "tbl_df"         "tbl"            "data.frame"    
# Nested resampling:
#  outer: Group -fold cross-validation
#  inner: Rolling origin forecast resampling
# A tibble: 3 x 3
  splits             id        inner_resamples     
  <named list>       <chr>     <named list>        
1 <split [1.9K/961]> Resample1 <tibble [1,759 x 2]>
2 <split [1.9K/939]> Resample2 <tibble [1,781 x 2]>
3 <split [1.9K/920]> Resample3 <tibble [1,800 x 2]>
> cv_rolling$inner_resamples
$`1`
# Rolling origin forecast resampling 
# A tibble: 1,759 x 2
   splits          id       
   <list>          <chr>    
 1 <split [100/1]> Slice0001
 2 <split [100/1]> Slice0002
 3 <split [100/1]> Slice0003
 4 <split [100/1]> Slice0004
 5 <split [100/1]> Slice0005
 6 <split [100/1]> Slice0006
 7 <split [100/1]> Slice0007
 8 <split [100/1]> Slice0008
 9 <split [100/1]> Slice0009
10 <split [100/1]> Slice0010
# ... with 1,749 more rows

$`2`
# Rolling origin forecast resampling 
# A tibble: 1,781 x 2
   splits          id       
   <list>          <chr>    
 1 <split [100/1]> Slice0001
 2 <split [100/1]> Slice0002
 3 <split [100/1]> Slice0003
 4 <split [100/1]> Slice0004
 5 <split [100/1]> Slice0005
 6 <split [100/1]> Slice0006
 7 <split [100/1]> Slice0007
 8 <split [100/1]> Slice0008
 9 <split [100/1]> Slice0009
10 <split [100/1]> Slice0010
# ... with 1,771 more rows

$`3`
# Rolling origin forecast resampling 
# A tibble: 1,800 x 2
   splits          id       
   <list>          <chr>    
 1 <split [100/1]> Slice0001
 2 <split [100/1]> Slice0002
 3 <split [100/1]> Slice0003
 4 <split [100/1]> Slice0004
 5 <split [100/1]> Slice0005
 6 <split [100/1]> Slice0006
 7 <split [100/1]> Slice0007
 8 <split [100/1]> Slice0008
 9 <split [100/1]> Slice0009
10 <split [100/1]> Slice0010
# ... with 1,790 more rows
> map(cv_rolling$inner_resamples$`2`$splits, ~ analysis(.x)) %>% 
+   head()
[[1]]
# A tibble: 100 x 2
       x category
   <dbl> <chr>   
 1  58   other   
 2  62.6 male    
 3  55.7 male    
 4  85   male    
 5  83.5 male    
 6  94.8 other   
 7  66.3 other   
 8  75.5 male    
 9  85.2 male    
10  73.3 other   
# ... with 90 more rows
# Inspect inner slice 650 of outer resample `1`: extract its analysis rows
# and keep only one category per query.
filter(
  analysis(cv_rolling$inner_resamples$`1`$splits[[650]]),
  category == "male"
)

filter(
  analysis(cv_rolling$inner_resamples$`1`$splits[[650]]),
  category == "other"
)

filter(
  analysis(cv_rolling$inner_resamples$`1`$splits[[650]]),
  category == "female"
)