R H2O elastic net lambda搜索不会选择使验证偏差最小化的lambda
使用R H2O elastic net lambda搜索不会选择使验证偏差最小化的lambda,r,cross-validation,h2o,glm,R,Cross Validation,H2o,Glm,使用lambda_search选项交叉验证弹性网络lambda超参数时,算法可能不会从指定网格中选取lambda的值,以最大限度地减少验证样本上的偏差。当我们设置early_stopping=FALSE时,也会发生这种情况,即当人们期望H2O评估网格中lambda的所有值时 通过首先使用h2o.glm()中的lambda_search=TRUE交叉验证lambda,然后使用h2o.grid()对lambda的相同值运行网格搜索,并比较结果超参数和验证偏差值,可以检查此语句。请参见下面的R代码 这
使用 lambda_search 选项对弹性网络的 lambda 超参数进行交叉验证时,算法可能不会从指定网格中选出使验证样本偏差最小的 lambda 值。即使设置 early_stopping=FALSE(此时人们会期望 H2O 评估网格中 lambda 的所有取值),这种情况仍然会发生。
可以这样验证上述说法:先在 h2o.glm() 中设置 lambda_search=TRUE 对 lambda 进行交叉验证,然后用 h2o.grid() 对相同的 lambda 取值运行网格搜索,并比较两种方法得到的超参数和验证偏差。请参见下面的 R 代码。
这个问题与此前已被指出和提到的问题密切相关。本问题补充的是一份证据:交叉验证得到的 lambda 值未必是使验证偏差最小的值。也就是说,问题可能比评论中所说的“H2O 计算到最优 lambda 后就退出”更严重。我是在使用单个验证样本对带 log link 的 Tweedie GLM 进行调优时遇到这个问题的,不确定它在多大程度上是这一设置所特有的。
基于这些结果,我倾向于使用网格搜索来确定 lambda。这样做是否恰当?或者,h2o.glm() 中是否有某些选项可以解决 lambda_search 的问题?
# NOTE: the workspace is deliberately not cleared here. `rm(list = ls())`
# deletes the caller's objects but does not reset loaded packages or options,
# so it does not give a clean state. Run this script in a fresh R session.
library(h2o)
library(tweedie)
library(tidyverse)
# Configuration -----------------------------------------------------------
# Data-generating process (DGP)
n <- 1000                          # number of simulated rows
k <- 10                            # number of predictors x1, ..., xk
phi <- 1                           # value passed as `phi` to rtweedie()
const <- 0                         # intercept of the linear predictor
bet <- seq(-1, 1, length.out = k)  # k coefficients evenly spaced on [-1, 1]
power <- 1.5                       # Tweedie variance power
# Estimation settings
alpha <- 0.5                       # value passed as `alpha` to the H2O GLM fits
# Generate some data ------------------------------------------------------
set.seed(42)
# n x k matrix of rnorm() draws with columns named x1, ..., xk
x <- matrix(
  rnorm(n * k),
  nrow = n,
  dimnames = list(NULL, paste0("x", seq_len(k)))
)
# Mean of the response: exp(const + x %*% bet), flattened to a plain vector
mu <- as.numeric(exp(const + x %*% bet))
# Predictors plus the mean, a simulated Tweedie response, a row id, and a
# sample label: rows with id <= n / 2 are "train", all others are "valid"
dat <- as_tibble(x) %>%
  mutate(
    mu = mu,
    y = rtweedie(n, mu = mu, phi = phi, power = power),
    id = row_number(),
    sample = if_else(id <= n / 2, "train", "valid")
  )
# Initialize H2O ----------------------------------------------------------
h2o.init()
# Upload the "train" and "valid" rows of `dat` as two separate H2O frames
df_h2o_train <- filter(dat, sample == "train") %>%
  as.h2o()
df_h2o_valid <- filter(dat, sample == "valid") %>%
  as.h2o()
# Tune lambda -------------------------------------------------------------
# 1. Lambda search
# Tweedie GLM with link power 0, fitted with lambda_search switched on and
# early stopping switched off; the validation frame is supplied to the fit.
glm_warmstart <- h2o.glm(
  x = paste0("x", seq_len(k)),
  y = "y",
  training_frame = df_h2o_train,
  validation_frame = df_h2o_valid,
  family = "tweedie",
  tweedie_variance_power = power,
  tweedie_link_power = 0,
  alpha = alpha,
  lambda_search = TRUE,
  early_stopping = FALSE
)
# lambda that the fitted model reports as best
lambda_warmstart <- glm_warmstart@model$lambda_best
print(lambda_warmstart) # 0.1501327
# 2. Grid search
# Candidate lambdas: the values recorded in the scoring history of the
# lambda-search fit, converted to numeric.
hyper_params <- list(
  lambda = glm_warmstart@model$scoring_history$lambda %>%
    h2o.asnumeric()
)
# One GLM per candidate lambda, with the same family, link, alpha and frames
# as the lambda-search fit but with lambda_search switched off.
grid_search <- h2o.grid(
  "glm",
  hyper_params = hyper_params,
  x = paste0("x", seq_len(k)),
  y = "y",
  training_frame = df_h2o_train,
  validation_frame = df_h2o_valid,
  family = "tweedie",
  tweedie_variance_power = power,
  tweedie_link_power = 0,
  alpha = alpha,
  lambda_search = FALSE
)
# Read lambda from the first row of the grid summary table.
# NOTE(review): this assumes the summary table lists the preferred model
# first; confirm the table's sort metric and order for this grid.
# Only a leading "[" and a trailing "]" are removed, so a value rendered
# without brackets is left intact instead of losing its first and last
# character (which positional str_sub(2, -2) would do).
lambda_grid_search <- grid_search@summary_table %>%
  as_tibble() %>%
  head(1) %>%
  pull(lambda) %>%
  stringr::str_remove_all("^\\[|\\]$") %>%
  as.numeric()
print(lambda_grid_search) # 0.013
# Refit one Tweedie GLM at the lambda taken from the grid search. Only the
# training frame is passed; lambda is given explicitly.
glm_grid_search <- h2o.glm(
  x = paste0("x", seq_len(k)),
  y = "y",
  training_frame = df_h2o_train,
  family = "tweedie",
  tweedie_variance_power = power,
  tweedie_link_power = 0,
  alpha = alpha,
  lambda = lambda_grid_search
)
# Compare validation deviance ---------------------------------------------
# For each model, predict on the validation frame, compute the Tweedie
# deviance against the observed `y` of the "valid" rows, and report the mean.
dat %>%
  filter(sample == "valid") %>%
  summarise(
    mean_deviance_warmstart = mean(
      tweedie.dev(
        y,
        as.vector(h2o.predict(glm_warmstart, newdata = df_h2o_valid)),
        power
      )
    ), # 1.16
    mean_deviance_grid_search = mean(
      tweedie.dev(
        y,
        as.vector(h2o.predict(glm_grid_search, newdata = df_h2o_valid)),
        power
      )
    ) # 1.08
  )
# Close -------------------------------------------------------------------
# Shut down the H2O instance; prompt = FALSE is passed so that no
# confirmation is requested.
h2o.shutdown(prompt = FALSE)
rm(list=ls())
library(h2o)
library(tweedie)
library(tidyverse)
# Configuration -----------------------------------------------------------
# DGP:
n = 1000
k = 10
phi = 1
const = 0
bet = seq(-1, 1, length.out = k)
power = 1.5
# algorithm
alpha = 0.5
# Generate some data ------------------------------------------------------
set.seed(42)
x = rnorm(n * k) %>%
matrix(nrow = n, dimnames = list(NULL, paste0("x", seq(1, k))))
mu = as.numeric(exp(const + x %*% bet))
dat = x %>%
as_tibble() %>%
mutate(mu = mu,
y = rtweedie(n,
mu = mu,
phi = phi,
power = power),
id = row_number(),
sample = case_when(
id <= n / 2 ~ "train",
TRUE ~ "valid"))
# Initialize H2O ----------------------------------------------------------
h2o.init()
df_h2o_train = dat %>%
filter(sample == "train") %>%
as.h2o()
df_h2o_valid = dat %>%
filter(sample == "valid") %>%
as.h2o()
# Tune lambda -------------------------------------------------------------
# 1. Lambda search
glm_warmstart = h2o.glm(
x = paste0("x", seq(1, k)),
y = "y",
family = "tweedie",
tweedie_variance_power = power,
tweedie_link_power = 0,
training_frame = df_h2o_train,
validation_frame = df_h2o_valid,
alpha = alpha,
lambda_search = TRUE,
early_stopping = FALSE
)
lambda_warmstart = glm_warmstart@model$lambda_best
print(lambda_warmstart) # 0.1501327
# 2. Grid search
hyper_params = list(lambda = glm_warmstart@model$scoring_history$lambda %>%
h2o.asnumeric())
grid_search = h2o.grid("glm",
hyper_params = hyper_params,
x = paste0("x", seq(1, k)),
y = "y",
family = "tweedie",
tweedie_variance_power = power,
tweedie_link_power = 0,
training_frame = df_h2o_train,
validation_frame = df_h2o_valid,
alpha = alpha,
lambda_search = FALSE)
lambda_grid_search = grid_search@summary_table %>%
as_tibble() %>%
head(1) %>%
pull(lambda) %>%
stringr::str_sub(2, -2) %>%
as.numeric()
print(lambda_grid_search) # 0.013
glm_grid_search = h2o.glm(
x = paste0("x", seq(1, k)),
y = "y",
family = "tweedie",
tweedie_variance_power = power,
tweedie_link_power = 0,
training_frame = df_h2o_train,
alpha = alpha,
lambda = lambda_grid_search)
# Compare validation deviance ---------------------------------------------
dat %>%
filter(sample == "valid") %>%
mutate(pred_warmstart = as.vector(h2o.predict(glm_warmstart,
newdata = df_h2o_valid)),
pred_grid_search = as.vector(h2o.predict(glm_grid_search,
newdata = df_h2o_valid)),
deviance_warmstart = tweedie.dev(y, pred_warmstart, power),
deviance_grid_search = tweedie.dev(y, pred_grid_search, power)) %>%
summarise(
mean_deviance_warmstart = mean(deviance_warmstart), # 1.16
mean_deviance_grid_search = mean(deviance_grid_search) # 1.08
)
# Close -------------------------------------------------------------------
h2o.shutdown(prompt = FALSE)
让我在这里补充一下,我非常喜欢使用H2O!