R xgboost poisson回归:标签必须为非负

R xgboost poisson回归:标签必须为非负。标签:r、machine-learning、regression、xgboost、poisson。我使用的是 Windows 10 笔记本电脑,xgboost(R 包)版本为 0.6-4。运行以下代码时,我遇到了一个奇怪的错误:xgb_params <- list("objective" = "count:poisson", "eval_metric" = "rmse"); regression <- xgboost(data = training_fold, label = y_training_fold, …

我使用的是Windows10笔记本电脑,R和xgboost版本为0.6-4。运行以下代码时,我遇到了一个奇怪的错误

# Question's failing call: the count:poisson objective requires a
# nonnegative integer-valued label; the "label must be nonnegative"
# error reported below is raised while validating y_training_fold.
xgb_params <- list("objective" = "count:poisson",
                "eval_metric" = "rmse")
 regression <- xgboost(data = training_fold, 
                   label = y_training_fold, 
                   nrounds = 10,
                   params = xgb_params)

Error in xgb.iter.update(bst$handle, dtrain, iteration - 1, obj) :
amalgamation/../src/objective/regression_obj.cc:190: Check failed: 
label_correct PoissonRegression: label must be nonnegative
我怎样才能解决这个问题?我试着移除NA,但没有用

提前谢谢

编辑

这是一个traindata的示例

# Produce a reproducible 20-row sample of the selected train columns
dput(droplevels(head(train[, c(1,2,4,5,6,8,9,10,11)], 20)))

# dput() output: a 20-row data.frame sample of the training data.
# Note the first CandidatesPerWeek value is NA and the rest are
# non-integer rates — both matter for the count:poisson error below.
structure(list(VacancyId = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 
3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L), .Label = c("55288","56838", "57822", "57902", "57925", "58008"), class = "factor"), 
VacancyBankId = c(2L, 1609L, 1611L, 147L, 17L, 1611L, 2L, 
257L, 1611L, 2L, 147L, 17L, 1611L, 239L, 1609L, 2L, 1609L, 
2L, 2L, 1609L), FunctionId = c(36L, 36L, 36L, 36L, 35L, 35L, 
3L, 4L, 4L, 4L, 4L, 9L, 9L, 9L, 3L, 3L, 3L, 3L, 3L, 3L), 
EducationLevel = c(6L, 6L, 6L, 6L, 6L, 6L, 4L, 6L, 6L, 6L, 
6L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 6L), ProvinceId = c(22L, 
22L, 22L, 22L, 24L, 24L, 19L, 16L, 16L, 16L, 16L, 19L, 19L, 
19L, 21L, 21L, 16L, 16L, 22L, 22L), CandidatesCount = c(126L, 
27L, 18L, 12L, 1L, 4L, 2L, 6L, 7L, 7L, 1L, 8L, 15L, 13L, 
7L, 7L, 7L, 7L, 7L, 7L), DurationDays = c(62L, 62L, 62L, 
62L, 18L, 18L, 43L, 61L, 61L, 61L, 61L, 60L, 60L, 60L, 62L, 
62L, 62L, 62L, 62L, 62L), DurationWeeks = c(8.857142857, 
8.857142857, 8.857142857, 8.857142857, 2.571428571, 2.571428571, 
6.142857143, 8.714285714, 8.714285714, 8.714285714, 8.714285714, 
8.571428571, 8.571428571, 8.571428571, 8.857142857, 8.857142857, 
8.857142857, 8.857142857, 8.857142857, 8.857142857), CandidatesPerWeek = c(NA, 
3.048387097, 2.032258065, 1.35483871, 0.388888889, 1.555555556, 
0.325581395, 0.68852459, 0.803278689, 0.803278689, 0.114754098, 
0.933333333, 1.75, 1.516666667, 0.790322581, 0.790322581, 
0.790322581, 0.790322581, 0.790322581, 0.790322581)), .Names = c("VacancyId", "VacancyBankId", "FunctionId", "EducationLevel", "ProvinceId", "CandidatesCount", "DurationDays", "DurationWeeks", "CandidatesPerWeek"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 26L, 27L, 28L, 29L, 30L, 31L), class = "data.frame")
我想用 FunctionId、EducationLevel、ProvinceId 和 VacancyBankId 预测每周候选人数(CandidatesPerWeek)。因此,y_training_fold 是 CandidatesPerWeek,training_fold 包含 FunctionId、EducationLevel、ProvinceId 和 VacancyBankId。


希望有人能帮我

数据集中的问题不在于 y_training_fold 中存在负值,而在于其中存在非整数值。用非整数的 y_training_fold 向量可以复现该错误,请参见以下模拟:

# Minimal simulation reproducing the error: rnorm() labels are
# non-integer (and can be negative), which count:poisson rejects
# with "label must be nonnegative".
library(xgboost)

training_fold <- matrix(rnorm(1000),nrow=100)
y_training_fold <- matrix(rnorm(100),ncol=1)

xgb_params <- list("objective" = "count:poisson",
                "eval_metric" = "rmse")
 regression <- xgboost(data = training_fold, 
                   label = y_training_fold, 
                   nrounds = 10,
                   params = xgb_params)
现在,尝试使用整数的 y_training_fold 向量:

# Same setup but with an integer-valued label (Poisson draws):
# count:poisson now accepts it and training succeeds.
y_training_fold <- matrix(rpois(100,10),ncol=1)

xgb_params <- list("objective" = "count:poisson",
                "eval_metric" = "rmse")
regression <- xgboost(data = training_fold, 
                   label = y_training_fold, 
                   nrounds = 10,
                   params = xgb_params)
编辑。

使用您的数据,问题的解决方案是:

# Fix for "PoissonRegression: label must be nonnegative" on the
# asker's data: drop NA rows, then round the response so that the
# count:poisson label is a nonnegative integer.
dts <- structure(list(VacancyId = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 
3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L), .Label = c("55288","56838", "57822", "57902", "57925", "58008"), class = "factor"), 
VacancyBankId = c(2L, 1609L, 1611L, 147L, 17L, 1611L, 2L, 
257L, 1611L, 2L, 147L, 17L, 1611L, 239L, 1609L, 2L, 1609L, 
2L, 2L, 1609L), FunctionId = c(36L, 36L, 36L, 36L, 35L, 35L, 
3L, 4L, 4L, 4L, 4L, 9L, 9L, 9L, 3L, 3L, 3L, 3L, 3L, 3L), 
EducationLevel = c(6L, 6L, 6L, 6L, 6L, 6L, 4L, 6L, 6L, 6L, 
6L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 6L), ProvinceId = c(22L, 
22L, 22L, 22L, 24L, 24L, 19L, 16L, 16L, 16L, 16L, 19L, 19L, 
19L, 21L, 21L, 16L, 16L, 22L, 22L), CandidatesCount = c(126L, 
27L, 18L, 12L, 1L, 4L, 2L, 6L, 7L, 7L, 1L, 8L, 15L, 13L, 
7L, 7L, 7L, 7L, 7L, 7L), DurationDays = c(62L, 62L, 62L, 
62L, 18L, 18L, 43L, 61L, 61L, 61L, 61L, 60L, 60L, 60L, 62L, 
62L, 62L, 62L, 62L, 62L), DurationWeeks = c(8.857142857, 
8.857142857, 8.857142857, 8.857142857, 2.571428571, 2.571428571, 
6.142857143, 8.714285714, 8.714285714, 8.714285714, 8.714285714, 
8.571428571, 8.571428571, 8.571428571, 8.857142857, 8.857142857, 
8.857142857, 8.857142857, 8.857142857, 8.857142857), CandidatesPerWeek = c(NA, 
3.048387097, 2.032258065, 1.35483871, 0.388888889, 1.555555556, 
0.325581395, 0.68852459, 0.803278689, 0.803278689, 0.114754098, 
0.933333333, 1.75, 1.516666667, 0.790322581, 0.790322581, 
0.790322581, 0.790322581, 0.790322581, 0.790322581)), 
.Names = c("VacancyId", "VacancyBankId", "FunctionId", "EducationLevel", 
"ProvinceId", "CandidatesCount", "DurationDays", "DurationWeeks", 
"CandidatesPerWeek"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 11L, 12L, 
13L, 14L, 15L, 16L, 17L, 18L, 26L, 27L, 28L, 29L, 30L, 31L), 
class = "data.frame")

# Delete missing values (row 1 has CandidatesPerWeek = NA)
dts <- na.omit(dts)

# Build X matrix of potential predictors
# Important: do not use the first column (ID) and the last (response variable)
# NOTE(review): CandidatesCount, DurationDays and DurationWeeks remain in X
# here, yet CandidatesPerWeek is derived from CandidatesCount and
# DurationWeeks — possible target leakage; the asker also said they only
# want VacancyBankId/FunctionId/EducationLevel/ProvinceId. Confirm intent.
training_fold <- as.matrix(dts[,-c(1,9)])
# Round to the nearest integer the response variable
# (count:poisson requires nonnegative integer labels)
y_training_fold <- as.matrix(dts[,9])
y_training_fold <- round(y_training_fold)

xgb_params <- list("objective" = "count:poisson",
                "eval_metric" = "rmse")
( regression <- xgboost(data = training_fold, 
                   label = y_training_fold, 
                   nrounds = 10,
                   params = xgb_params) )
# Output
##### xgb.Booster
# raw: 4.6 Kb 
# call:
#   xgb.train(params = params, data = dtrain, nrounds = nrounds, 
#     watchlist = watchlist, verbose = verbose, print_every_n = print_every_n, 
#     early_stopping_rounds = early_stopping_rounds, maximize = maximize, 
#     save_period = save_period, save_name = save_name, xgb_model = xgb_model, 
#     callbacks = callbacks)
# params (as set within xgb.train):
#   objective = "count:poisson", eval_metric = "rmse", silent = "1"
# xgb.attributes:
#   niter
# callbacks:
#   cb.print.evaluation(period = print_every_n)
#   cb.evaluation.log()
#   cb.save.model(save_period = save_period, save_name = save_name)
# niter: 10
# evaluation_log:
#     iter train_rmse
#        1   0.914084
#        2   0.829741
# ---                
#        9   0.332951
#       10   0.291877

(评论)谢谢你的回答,马可。即使我把标签四舍五入并保存为向量,它仍然给出相同的错误;我已经补充了一些数据。—— @卢卡申科:请看我答案中新增的文本,希望能有所帮助。也请考虑为我的回答投票并关闭这个问题。谢谢。
# Repeat of the integer-label simulation, this time with the console
# log of the successful run pasted below the call.
y_training_fold <- matrix(rpois(100,10),ncol=1)

xgb_params <- list("objective" = "count:poisson",
                "eval_metric" = "rmse")
regression <- xgboost(data = training_fold, 
                   label = y_training_fold, 
                   nrounds = 10,
                   params = xgb_params)
# Console output of the successful run (not R code):
[1]     train-rmse:9.795855 
[2]     train-rmse:9.660112 
[3]     train-rmse:9.492991 
[4]     train-rmse:9.287366 
[5]     train-rmse:9.034582 
[6]     train-rmse:8.724205 
[7]     train-rmse:8.343800 
[8]     train-rmse:7.878869 
[9]     train-rmse:7.312294 
[10]    train-rmse:6.632671
# Fix for "PoissonRegression: label must be nonnegative":
#  1. drop the NA row (na.omit),
#  2. use ONLY the predictors the asker requested -- VacancyBankId,
#     FunctionId, EducationLevel, ProvinceId,
#  3. round the response so count:poisson gets nonnegative integers.
# CandidatesCount, DurationDays and DurationWeeks are deliberately
# excluded from X: CandidatesPerWeek is computed from CandidatesCount
# and DurationWeeks, so including them would leak the target.
dts <- structure(list(VacancyId = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 
3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L), .Label = c("55288","56838", "57822", "57902", "57925", "58008"), class = "factor"), 
VacancyBankId = c(2L, 1609L, 1611L, 147L, 17L, 1611L, 2L, 
257L, 1611L, 2L, 147L, 17L, 1611L, 239L, 1609L, 2L, 1609L, 
2L, 2L, 1609L), FunctionId = c(36L, 36L, 36L, 36L, 35L, 35L, 
3L, 4L, 4L, 4L, 4L, 9L, 9L, 9L, 3L, 3L, 3L, 3L, 3L, 3L), 
EducationLevel = c(6L, 6L, 6L, 6L, 6L, 6L, 4L, 6L, 6L, 6L, 
6L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 6L), ProvinceId = c(22L, 
22L, 22L, 22L, 24L, 24L, 19L, 16L, 16L, 16L, 16L, 19L, 19L, 
19L, 21L, 21L, 16L, 16L, 22L, 22L), CandidatesCount = c(126L, 
27L, 18L, 12L, 1L, 4L, 2L, 6L, 7L, 7L, 1L, 8L, 15L, 13L, 
7L, 7L, 7L, 7L, 7L, 7L), DurationDays = c(62L, 62L, 62L, 
62L, 18L, 18L, 43L, 61L, 61L, 61L, 61L, 60L, 60L, 60L, 62L, 
62L, 62L, 62L, 62L, 62L), DurationWeeks = c(8.857142857, 
8.857142857, 8.857142857, 8.857142857, 2.571428571, 2.571428571, 
6.142857143, 8.714285714, 8.714285714, 8.714285714, 8.714285714, 
8.571428571, 8.571428571, 8.571428571, 8.857142857, 8.857142857, 
8.857142857, 8.857142857, 8.857142857, 8.857142857), CandidatesPerWeek = c(NA, 
3.048387097, 2.032258065, 1.35483871, 0.388888889, 1.555555556, 
0.325581395, 0.68852459, 0.803278689, 0.803278689, 0.114754098, 
0.933333333, 1.75, 1.516666667, 0.790322581, 0.790322581, 
0.790322581, 0.790322581, 0.790322581, 0.790322581)), 
.Names = c("VacancyId", "VacancyBankId", "FunctionId", "EducationLevel", 
"ProvinceId", "CandidatesCount", "DurationDays", "DurationWeeks", 
"CandidatesPerWeek"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 11L, 12L, 
13L, 14L, 15L, 16L, 17L, 18L, 26L, 27L, 28L, 29L, 30L, 31L), 
class = "data.frame")

# Delete rows with missing values (row 1 has CandidatesPerWeek = NA)
dts <- na.omit(dts)

# Build the X matrix from the requested predictors only
predictor_cols <- c("VacancyBankId", "FunctionId", "EducationLevel",
                    "ProvinceId")
training_fold <- as.matrix(dts[, predictor_cols])

# count:poisson requires a nonnegative integer label:
# round the per-week rates to the nearest integer
y_training_fold <- round(as.matrix(dts[, "CandidatesPerWeek"]))

xgb_params <- list("objective" = "count:poisson",
                   "eval_metric" = "rmse")
( regression <- xgboost(data = training_fold,
                        label = y_training_fold,
                        nrounds = 10,
                        params = xgb_params) )
# The model now trains without the label error. The printed
# xgb.Booster summary and the train-rmse log will differ from the
# earlier run in this thread because X no longer contains the
# leaking CandidatesCount/Duration* columns.