R:caret 的 train() 中的公式接口与非公式接口
[我研究过类似的帖子和问题,但 Max 和其他人回答过的那些问题似乎都与我的情况无关。] 我在这里看到过一些报告,说公式接口失败,而非公式接口运行良好。我的问题恰恰相反。(标签:r、r-caret)下面使用公式接口的
train()
函数工作正常:
# Fit a glmnet model through the formula interface of train(): `class` is the
# response and every other column of trainData is a predictor. The resampling
# scheme (train.control) and tuning grid (tune.grid) are defined elsewhere.
glmTune <- train(
  class ~ .,
  data      = trainData,
  method    = "glmnet",
  trControl = train.control,
  tuneGrid  = tune.grid
)
对于数值y
而言,它只是略有不同(不同的性能指标):
代码如下:
library(caret)
library(dplyr)
library(glmnet)
# see dput(droplevels(head(df, 20))) output of data below:

# 70%/30% split; the fixed seed makes the partition reproducible.
set.seed(42)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
inTrain <- createDataPartition(df$lnprice, p = 0.7, list = FALSE)
trainData <- df[inTrain, ]
testData <- df[-inTrain, ]

# train model: 10-fold cross-validation repeated 5 times, run sequentially
train.control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  allowParallel = FALSE
)

# Every combination of 20 lambda values and 3 alpha values.
# `length.out` is written in full instead of relying on partial matching
# of `length`.
tune.grid <- expand.grid(
  lambda = seq(0.0001, 0.1, length.out = 20),
  alpha = c(0, 0.5, 1)
)

# Predictors and response for the non-formula (x, y) interface.
X <- trainData[, !(names(trainData) %in% "lnprice")]
Y <- trainData$lnprice

# Swap the commented line in to switch between the two interfaces.
fit <- train(
  # x = X, y = Y, # non-formula
  lnprice ~ ., data = trainData, # formula
  method = "glmnet",
  preProcess = c("zv", "center", "scale"),
  tuneGrid = tune.grid,
  trControl = train.control
)

# plot model
print(plot(fit))
> dput(droplevels(head(df,20)))
structure(list(fuel.type = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "gas", class = "factor"),
aspiration = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("std",
"turbo"), class = "factor"), num.of.doors = structure(c(2L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 2L, 2L), .Label = c("four", "two"), class = "factor"),
body.style = structure(c(1L, 1L, 2L, 3L, 3L, 3L, 3L, 4L,
3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L), .Label = c("convertible",
"hatchback", "sedan", "wagon"), class = "factor"), drive.wheels = structure(c(2L,
2L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L), .Label = c("fwd", "rwd", "X4wd"), class = "factor"),
engine.location = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "front", class = "factor"),
wheel.base = c(88.6, 88.6, 94.5, 99.8, 99.4, 99.8, 105.8,
105.8, 105.8, 99.5, 101.2, 101.2, 101.2, 101.2, 103.5, 103.5,
103.5, 110, 88.4, 94.5), length = c(168.8, 168.8, 171.2,
176.6, 176.6, 177.3, 192.7, 192.7, 192.7, 178.2, 176.8, 176.8,
176.8, 176.8, 189, 189, 193.8, 197, 141.1, 155.9), width = c(64.1,
64.1, 65.5, 66.2, 66.4, 66.3, 71.4, 71.4, 71.4, 67.9, 64.8,
64.8, 64.8, 64.8, 66.9, 66.9, 67.9, 70.9, 60.3, 63.6), height = c(48.8,
48.8, 52.4, 54.3, 54.3, 53.1, 55.7, 55.7, 55.9, 52, 54.3,
54.3, 54.3, 54.3, 55.7, 55.7, 53.7, 56.3, 53.2, 52), curb.weight = c(2548L,
2548L, 2823L, 2337L, 2824L, 2507L, 2844L, 2954L, 3086L, 3053L,
2395L, 2395L, 2710L, 2765L, 3055L, 3230L, 3380L, 3505L, 1488L,
1874L), engine.type = structure(c(1L, 1L, 4L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L), .Label = c("dohc",
"l", "ohc", "ohcv"), class = "factor"), num.of.cylinders = structure(c(2L,
2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L), .Label = c("five.six", "four.or.less"), class = "factor"),
engine.size = c(130L, 130L, 152L, 109L, 136L, 136L, 136L,
136L, 131L, 131L, 108L, 108L, 164L, 164L, 164L, 209L, 209L,
209L, 61L, 90L), fuel.system = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L), .Label = c("mpfi", "X2bbl"), class = "factor"), bore = c(3.47,
3.47, 2.68, 3.19, 3.19, 3.19, 3.19, 3.19, 3.13, 3.13, 3.5,
3.5, 3.31, 3.31, 3.31, 3.62, 3.62, 3.62, 2.91, 3.03), stroke = c(2.68,
2.68, 3.47, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 2.8, 2.8,
3.19, 3.19, 3.19, 3.39, 3.39, 3.39, 3.03, 3.11), compression.ratio = c(9,
9, 9, 10, 8, 8.5, 8.5, 8.5, 8.3, 7, 8.8, 8.8, 9, 9, 9, 8,
8, 8, 9.5, 9.6), horsepower = c(111, 111, 154, 102, 115,
110, 110, 110, 140, 160, 101, 101, 121, 121, 121, 182, 182,
182, 48, 70), peak.rpm = c(5000L, 5000L, 5000L, 5500L, 5500L,
5500L, 5500L, 5500L, 5500L, 5500L, 5800L, 5800L, 4250L, 4250L,
4250L, 5400L, 5400L, 5400L, 5100L, 5400L), city.mpg = c(21L,
21L, 19L, 24L, 18L, 19L, 19L, 19L, 17L, 16L, 23L, 23L, 21L,
21L, 20L, 16L, 16L, 15L, 47L, 38L), highway.mpg = c(27L,
27L, 26L, 30L, 22L, 25L, 25L, 25L, 20L, 22L, 29L, 29L, 28L,
28L, 25L, 22L, 22L, 20L, 53L, 43L), make = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L), .Label = c("alfa.romero", "audi", "bmw",
"chevrolet"), class = "factor"), lnprice = c(9.5101, 9.7111,
9.7111, 9.5432, 9.7671, 9.6323, 9.7819, 9.848, 10.0806, 9.69176,
9.7069, 9.7365, 9.9508, 9.9573, 10.1091, 10.334, 10.629,
10.5154, 8.5469, 8.7475)), .Names = c("fuel.type", "aspiration",
"num.of.doors", "body.style", "drive.wheels", "engine.location",
"wheel.base", "length", "width", "height", "curb.weight", "engine.type",
"num.of.cylinders", "engine.size", "fuel.system", "bore", "stroke",
"compression.ratio", "horsepower", "peak.rpm", "city.mpg", "highway.mpg",
"make", "lnprice"), row.names = c(NA, 20L), class = "data.frame")
library(caret)
library(dplyr)
library(glmnet)
# see dput(droplevels(head(df, 20))) output of data below:
# 70%/30% split
set.seed(42)
inTrain <- …(这段重复的代码在此处被截断;完整脚本见上文。)
奇怪。看起来 train.default
方法没有默认的na.action
处理程序
以下摘自 ?caret::train 的输出:
## Default S3 method:
train(x, y, method = "rf", preProcess = NULL, ...,
weights = NULL, metric = ifelse(is.factor(y), "Accuracy", "RMSE"),
maximize = ifelse(metric %in% c("RMSE", "logLoss", "MAE"), FALSE, TRUE),
trControl = trainControl(), tuneGrid = NULL,
tuneLength = ifelse(trControl$method == "none", 1, 3))
而train.formula
方法的作用是:
## S3 method for class 'formula'
train(form, data, ..., weights, subset, na.action = na.fail, contrasts = NULL)
^^^^^^^^^^^^^^^^^^^
如果您将na.action=na.fail
添加到您的train.default
调用,x,y
接口,您是否会得到与train.formula
调用相同的行为?
示例数据集中没有任何 NA,但我仍然可以重现该错误。我认为问题在于 glmnet 方法只接受数值型变量:train
使用公式接口时会为因子创建虚拟变量,而使用 x, y 接口时则不会(请参阅原回答中的链接)。
下面我使用 recipes 包把因子变量转换为独热(one-hot)编码变量。我删除了变量 fuel.type 和 engine.location,因为它们在您的小示例数据集中只有一个水平。
# recipe(), step_dummy(), prep() and bake() come from the recipes package,
# which library(caret) does not attach, so load it explicitly.
library(recipes)

# Drop the two factors that have a single level in this small sample.
trainData <- trainData %>%
  select(-fuel.type, -engine.location)

# One-hot encode the factor predictors only. step_dummy() is meant for factor
# columns; selecting the numeric predictors as well (all_predictors()) gives a
# warning or an error depending on the installed recipes version.
rec <- recipe(lnprice ~ ., data = trainData) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE)
rec_prep <- prep(rec, trainData)
train_new <- bake(rec_prep, trainData)

# bake() output is converted to a plain data.frame of predictors for the
# x/y interface; the response is pulled out separately.
X <- as.data.frame(train_new[, !(names(train_new) %in% "lnprice")])
Y <- train_new$lnprice

fit <- train(
  x = X, y = Y, # non-formula
  # lnprice ~ ., data = trainData, # formula
  method = "glmnet",
  preProcess = c("zv", "center", "scale"),
  tuneGrid = tune.grid,
  trControl = train.control
)
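trainData <- trainData %>% select(-fuel.type, -engine.location)
rec <- recipe(lnprice~., data = trainData) %>% step_dummy(all_predictors(), one_hot = TRUE)
rec_prep <- …(这段重复的代码在此处被截断;完整代码见上文。)
谢谢你调查这件事。我刚试过你的建议,得到的是完全相同的 NA 错误。
如果要忽略 NA,可以使用 na.action = na.pass。但这并没有真正解释为什么这两种接口的处理方式不同。
对不起,回复太晚了——我再次检查了 na.action = na.pass,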
相同的错误。不知道这些错误消息(添加在我的原始帖子中)是否包含一些线索。再次感谢。na.action=na.pass将把NAs传递给模型,而不是忽略它们。你需要na.action=na.omit。你能给我们一个小的,可重复的例子来测试吗?我已经把代码贴在了我原来的帖子下面。这适用于使用自动定价数据的回归模型(log(price),存储在“lnprice”列中,是响应变量)。完整的数据有205行和24列,我不知道如何给您提供数据,因此我尝试按照说明,使用dput(droplevels(head(df,20))的R-console输出粘贴前20行。
希望这有帮助。非常感谢你调查这件事。
## Default S3 method:
train(x, y, method = "rf", preProcess = NULL, ...,
weights = NULL, metric = ifelse(is.factor(y), "Accuracy", "RMSE"),
maximize = ifelse(metric %in% c("RMSE", "logLoss", "MAE"), FALSE, TRUE),
trControl = trainControl(), tuneGrid = NULL,
tuneLength = ifelse(trControl$method == "none", 1, 3))
## S3 method for class 'formula'
train(form, data, ..., weights, subset, na.action = na.fail, contrasts = NULL)
^^^^^^^^^^^^^^^^^^^
# recipe(), step_dummy(), prep() and bake() come from the recipes package,
# which library(caret) does not attach, so load it explicitly.
library(recipes)

# Drop the two factors that have a single level in this small sample.
trainData <- trainData %>%
  select(-fuel.type, -engine.location)

# One-hot encode the factor predictors only. step_dummy() is meant for factor
# columns; selecting the numeric predictors as well (all_predictors()) gives a
# warning or an error depending on the installed recipes version.
rec <- recipe(lnprice ~ ., data = trainData) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE)
rec_prep <- prep(rec, trainData)
train_new <- bake(rec_prep, trainData)

# bake() output is converted to a plain data.frame of predictors for the
# x/y interface; the response is pulled out separately.
X <- as.data.frame(train_new[, !(names(train_new) %in% "lnprice")])
Y <- train_new$lnprice

fit <- train(
  x = X, y = Y, # non-formula
  # lnprice ~ ., data = trainData, # formula
  method = "glmnet",
  preProcess = c("zv", "center", "scale"),
  tuneGrid = tune.grid,
  trControl = train.control
)