Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/80.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R H2O elastic net lambda搜索不会选择使验证偏差最小化的lambda_R_Cross Validation_H2o_Glm - Fatal编程技术网

R H2O elastic net lambda搜索不会选择使验证偏差最小化的lambda

R H2O elastic net lambda搜索不会选择使验证偏差最小化的lambda,r,cross-validation,h2o,glm,R,Cross Validation,H2o,Glm,使用lambda_search选项交叉验证弹性网络lambda超参数时,算法可能不会从指定网格中选取lambda的值,以最大限度地减少验证样本上的偏差。当我们设置early_stopping=FALSE时,也会发生这种情况,即当人们期望H2O评估网格中lambda的所有值时 通过首先使用h2o.glm()中的lambda_search=TRUE交叉验证lambda,然后使用h2o.grid()对lambda的相同值运行网格搜索,并比较结果超参数和验证偏差值,可以检查此语句。请参见下面的R代码 这

使用
lambda_search
选项交叉验证弹性网络
lambda
超参数时,算法可能不会从指定网格中选取
lambda
的值,以最大限度地减少验证样本上的偏差。当我们设置
early_stopping=FALSE
时,也会发生这种情况,即当人们期望H2O评估网格中
lambda
的所有值时

通过首先使用
h2o.glm()
中的
lambda_search=TRUE
交叉验证lambda,然后使用
h2o.grid()
对lambda的相同值运行网格搜索,并比较结果超参数和验证偏差值,可以检查此语句。请参见下面的R代码

这个问题与其他帖子中指出和提到的问题密切相关。本问题的补充之处在于用实例证明,
lambda
的交叉验证值不必是使验证偏差最小化的值。也就是说,问题可能比评论中所述的“H2O计算到最佳λ后即退出”更严重。我是在验证样本上对使用log link的Tweedie GLM进行调优时遇到这个问题的,我不确定它在多大程度上是这一设置所特有的

基于这些结果,我倾向于使用网格搜索来确定
lambda
。这是否恰当?或者,在
h2o.glm()
中是否有一些选项可以解决
lambda_search
的问题

# NOTE(review): wipes every object from the global environment before the
# example runs; kept so the script behaves exactly as originally posted.
rm(list = ls())
library(h2o)
library(tweedie)
library(tidyverse)

# Configuration -----------------------------------------------------------
# Data-generating process
n     <- 1000                        # number of simulated rows
k     <- 10                          # number of predictors x1, ..., xk
phi   <- 1                           # dispersion handed to rtweedie()
const <- 0                           # intercept inside exp(const + x %*% bet)
bet   <- seq(-1, 1, length.out = k)  # k evenly spaced coefficients in [-1, 1]
power <- 1.5                         # Tweedie variance power

# Estimation
alpha <- 0.5                         # alpha argument handed to h2o.glm()

# Simulate data -----------------------------------------------------------
set.seed(42)

# n-by-k matrix of standard-normal draws with columns named x1, ..., xk.
x <- matrix(
  rnorm(n * k),
  nrow = n,
  dimnames = list(NULL, paste0("x", seq_len(k)))
)

# Mean of the response: exponential of the linear predictor.
mu <- as.numeric(exp(const + x %*% bet))

# Predictors plus the mean, a Tweedie-distributed response, a row id and a
# sample flag: rows with id <= n / 2 are "train", all others are "valid".
dat <- mutate(
  as_tibble(x),
  mu     = mu,
  y      = rtweedie(n, mu = mu, phi = phi, power = power),
  id     = row_number(),
  sample = if_else(id <= n / 2, "train", "valid")
)

# Initialize H2O ----------------------------------------------------------
# Start (or connect to) an H2O instance with default settings.
h2o.init()

# Convert the rows flagged "train" into an H2O frame used for fitting.
df_h2o_train = dat %>% 
  filter(sample == "train") %>% 
  as.h2o()

# Convert the rows flagged "valid" into an H2O frame used for validation
# scoring and for the predictions compared at the end of the script.
df_h2o_valid = dat %>% 
  filter(sample == "valid") %>% 
  as.h2o()


# Tune lambda -------------------------------------------------------------
# Step 1: fit a Tweedie GLM (tweedie_link_power = 0) with lambda_search
# switched on and early_stopping switched off, passing both the training
# and the validation frame.
glm_warmstart <- h2o.glm(
  x = paste0("x", seq_len(k)),
  y = "y",
  training_frame = df_h2o_train,
  validation_frame = df_h2o_valid,
  family = "tweedie",
  tweedie_variance_power = power,
  tweedie_link_power = 0,
  alpha = alpha,
  lambda_search = TRUE,
  early_stopping = FALSE
)

# Lambda that the fitted model reports as best.
lambda_warmstart <- glm_warmstart@model$lambda_best
print(lambda_warmstart) # 0.1501327

# Step 2: grid search over the lambda values recorded in the scoring
# history of the lambda-search model from step 1.
hyper_params <- list(
  lambda = h2o.asnumeric(glm_warmstart@model$scoring_history$lambda)
)

# One GLM per lambda value, same family/link/alpha and the same frames as in
# step 1, but with lambda_search turned off.
grid_search <- h2o.grid(
  "glm",
  hyper_params = hyper_params,
  x = paste0("x", seq_len(k)),
  y = "y",
  training_frame = df_h2o_train,
  validation_frame = df_h2o_valid,
  family = "tweedie",
  tweedie_variance_power = power,
  tweedie_link_power = 0,
  alpha = alpha,
  lambda_search = FALSE
)

# Take the lambda entry of the first row of the grid summary, drop its first
# and last character and convert the remainder to a number.
# NOTE(review): presumably the summary table is sorted best-first and the
# entry is wrapped in brackets -- confirm against the h2o.grid documentation.
lambda_grid_search <- as.numeric(stringr::str_sub(
  head(as_tibble(grid_search@summary_table)$lambda, 1),
  start = 2,
  end = -2
))
print(lambda_grid_search) # 0.013

# Refit a single GLM on the training frame at the lambda picked from the grid.
glm_grid_search <- h2o.glm(
  x = paste0("x", seq_len(k)),
  y = "y",
  training_frame = df_h2o_train,
  family = "tweedie",
  tweedie_variance_power = power,
  tweedie_link_power = 0,
  alpha = alpha,
  lambda = lambda_grid_search
)

# Compare validation deviance ---------------------------------------------
# Mean Tweedie deviance of each model's predictions on the validation rows;
# the one-row result is printed at top level.
dat %>%
  filter(sample == "valid") %>%
  summarise(
    mean_deviance_warmstart = mean(tweedie.dev(
      y,
      as.vector(h2o.predict(glm_warmstart, newdata = df_h2o_valid)),
      power
    )), # 1.16
    mean_deviance_grid_search = mean(tweedie.dev(
      y,
      as.vector(h2o.predict(glm_grid_search, newdata = df_h2o_valid)),
      power
    )) # 1.08
  )

# Close -------------------------------------------------------------------
# Shut the H2O instance down without asking for confirmation.
h2o.shutdown(prompt = FALSE)

rm(list=ls())
图书馆(h2o)
图书馆(tweedie)
图书馆(tidyverse)
#配置-----------------------------------------------------------
#DGP:
n=1000
k=10
φ=1
常数=0
下注=序号(-1,1,长度=k)
功率=1.5
#算法
α=0.5
#生成一些数据------------------------------------------------------
种子(42)
x=rnorm(n*k)%>%
矩阵(nrow=n,dimnames=list(NULL,paste0(“x”,seq(1,k)))
mu=作为数值(exp(常数+x%*%bet))
dat=x%>%
as_tible()%>%
突变(mu=mu,
y=rtweedie(n,
μ=μ,
φ=φ,
功率=功率),
id=行号(),
样本=案例(
id%
过滤器(样本==“序列”)%>%
as.h2o()
df_h2o_有效=dat%>%
筛选器(示例==“有效”)%>%
as.h2o()
#调谐lambda-------------------------------------------------------------
#1.Lambda搜索
glm_warmstart=h2o.glm(
x=0(“x”,序号(1,k)),
y=“y”,
family=“tweedie”,
tweedie\u方差\u功率=功率,
tweedie\u链路功率=0,
训练帧=df\U h2o\U训练,
验证\u帧=df\u h2o\u有效,
α=α,
lambda_search=TRUE,
提前停止=错误
)
lambda_warmstart=glm_warmstart@model$lambda_最佳
打印(lambda_warmstart)#0.1501327
#2.网格搜索
超参数=列表(lambda=glm_warmstart@model$scoring_history$lambda%>%
h2o.asnumeric())
网格搜索=h2o.grid(“glm”,
超参数=超参数,
x=0(“x”,序号(1,k)),
y=“y”,
family=“tweedie”,
tweedie\u方差\u功率=功率,
tweedie\u链路功率=0,
训练帧=df\U h2o\U训练,
验证\u帧=df\u h2o\u有效,
α=α,
lambda_search=FALSE)
lambda_grid_search=网格_search@summary_table %>% 
as_tible()%>%
总目(1)%>%
拉力(λ)%>%
stringr::str_sub(2,-2)%%>%
as.numeric()
打印(lambda_网格_搜索)#0.013
glm_网格搜索=h2o.glm(
x=0(“x”,序号(1,k)),
y=“y”,
family=“tweedie”,
tweedie\u方差\u功率=功率,
tweedie\u链路功率=0,
训练帧=df\U h2o\U训练,
α=α,
lambda=lambda_网格_搜索)
#比较验证偏差---------------------------------------------
dat%>%
筛选器(示例==“有效”)%>%
突变(pred_warmstart=as.vector(h2o.predict)(glm_warmstart,
newdata=df_h2o_有效),
pred_grid_search=as.vector(h2o.predict)(glm_grid_search,
newdata=df_h2o_有效),
偏差预热启动=tweedie.dev(y,pred预热启动,power),
偏差网格搜索=tweedie.dev(y,pred网格搜索,power))%>%
总结(
平均偏差预热开始=平均值(偏差预热开始),#1.16
平均偏差网格搜索=平均值(偏差网格搜索)#1.08
)
#接近-------------------------------------------------------------------
h2o.关闭(提示=FALSE)

让我在这里补充一下,我非常喜欢使用H2O!