Scikit learn 使用sklearn votingClassifier的类权重
我有一个分类问题的不平衡数据集。我的目标变量是二进制的,有两个类别。 我通过指定class_权重作为参数,实现了随机森林和逻辑回归。 当我分别将数据拟合到随机森林和逻辑回归时,效果很好。但是,当我在随机森林上使用投票分类器和sklearn.ensemble的逻辑回归来拟合数据时,它会给出错误Scikit learn 使用sklearn votingClassifier的类权重,scikit-learn,classification,ensemble-learning,imbalanced-data,Scikit Learn,Classification,Ensemble Learning,Imbalanced Data,我有一个分类问题的不平衡数据集。我的目标变量是二进制的,有两个类别。 我通过指定class_权重作为参数,实现了随机森林和逻辑回归。 当我分别将数据拟合到随机森林和逻辑回归时,效果很好。但是,当我在随机森林上使用投票分类器和sklearn.ensemble的逻辑回归来拟合数据时,它会给出错误类标签不存在。我需要使用3个或更多模型的集成。我已经检查过,这个错误不是因为代码中实现了Naiver_bayes 我的代码: rf_param = { 'class_weight': {'no_payment
类标签不存在。
我需要使用3个或更多模型的集成。我已经检查过,这个错误不是因为代码中实现了Naiver_bayes
我的代码:
rf_param = { 'class_weight': {'no_payment': 1, 'payment': 3},'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 30, 'min_samples_split': 15, 'n_estimators': 100}
lr_param = {'C': 0.1, 'class_weight': {'no_payment': 1, 'payment': 3}, 'fit_intercept': False, 'penalty': 'l2'}
rf = ensemble.RandomForestClassifier(**rf_param)
lr = linear_model.LogisticRegression(**lr_param)
nb = naive_bayes.MultinomialNB(alpha=0.0, class_prior=None, fit_prior=False)
rf.fit(train_x, train_y)
lr.fit(train_x, train_y)
nb.fit(train_x, train_y)
model = ensemble.VotingClassifier(estimators=[('rf', rf), ('lr', lr), ('nb',nb)], voting='hard'
,weights = [2,2,1])
model.fit(train_x, train_y)
predictions = model.predict(valid_x)
如果我从参数列表中删除class\u weight
,则此代码运行良好
下面是完整的错误消息
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-35-e05cd516f347> in <module>()
15 )
16
---> 17 model.fit(train_x, train_y)
18
19 predictions = model.predict(valid_x)
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_voting.py in fit(self, X, y, sample_weight)
220 transformed_y = self.le_.transform(y)
221
--> 222 return super().fit(X, transformed_y, sample_weight)
223
224 def predict(self, X):
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_voting.py in fit(self, X, y, sample_weight)
66 delayed(_parallel_fit_estimator)(clone(clf), X, y,
67 sample_weight=sample_weight)
---> 68 for clf in clfs if clf not in (None, 'drop')
69 )
70
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
/home/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
/home/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_base.py in _parallel_fit_estimator(estimator, X, y, sample_weight)
34 raise
35 else:
---> 36 estimator.fit(X, y)
37 return estimator
38
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in fit(self, X, y, sample_weight)
319 self.n_outputs_ = y.shape[1]
320
--> 321 y, expanded_class_weight = self._validate_y_class_weight(y)
322
323 if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in _validate_y_class_weight(self, y)
585 class_weight = self.class_weight
586 expanded_class_weight = compute_sample_weight(class_weight,
--> 587 y_original)
588
589 return y, expanded_class_weight
/home/.local/lib/python3.6/site-packages/sklearn/utils/class_weight.py in compute_sample_weight(class_weight, y, indices)
161 weight_k = compute_class_weight(class_weight_k,
162 classes_full,
--> 163 y_full)
164
165 weight_k = weight_k[np.searchsorted(classes_full, y_full)]
/home/.local/lib/python3.6/site-packages/sklearn/utils/class_weight.py in compute_class_weight(class_weight, classes, y)
63 i = np.searchsorted(classes, c)
64 if i >= len(classes) or classes[i] != c:
---> 65 raise ValueError("Class label {} not present.".format(c))
66 else:
67 weight[i] = class_weight[c]
ValueError: Class label no_payment not present.
---------------------------------------------------------------------------
ValueError回溯(最近一次调用上次)
在()
15 )
16
--->17型号配合(x列、y列)
18
19预测=模型预测(有效)
/home/.local/lib/python3.6/site-packages/sklearn/employee//u voting.py合适(self、X、y、sample\u weight)
220变换的y=自变换(y)
221
-->222 return super().fit(X、转换y、样本重量)
223
224 def预测(自我,X):
/home/.local/lib/python3.6/site-packages/sklearn/employee//u voting.py合适(self、X、y、sample\u weight)
66延迟(_并行_拟合_估计量)(克隆(clf),X,y,
67样品重量=样品重量)
--->如果clf不在clf中,则clf在clf中为68(无,“下降”)
69 )
70
/home/.local/lib/python3.6/site-packages/joblb/parallel.py in.\uuuuuu call.\uuuuu(self,iterable)
1002#剩余工作。
1003自迭代=假
->1004如果自行调度一批(迭代器):
1005 self.\u iterating=self.\u original\u iterator不是None
1006
/home/.local/lib/python3.6/site-packages/joblb/parallel.py在dispatch\u one\u批处理中(self,iterator)
833返回错误
834其他:
-->835自我派遣(任务)
836返回真值
837
/home/.local/lib/python3.6/site-packages/joblb/parallel.py in_dispatch(self,batch)
752带自锁:
753作业idx=len(自作业)
-->754 job=self.\u backend.apply\u async(批处理,回调=cb)
755#一个作业完成的速度比它的回调速度要快
756#在我们到达这里之前打电话给self.#u jobs
/home/.local/lib/python3.6/site-packages/joblib//\u parallel\u backends.py in apply\u async(self、func、callback)
207 def apply_async(self、func、callback=None):
208“计划要运行的func”
-->209结果=立即结果(func)
210如果回调:
211回调(结果)
/home/.local/lib/python3.6/site-packages/joblib//\u parallel\u backends.py in\uuuuu init\uuuu(self,batch)
588#不要延迟应用程序,以免保留输入
589#内存中的参数
-->590 self.results=batch()
591
592 def get(自我):
/home/.local/lib/python3.6/site-packages/joblb/parallel.py in.\uuuuuu call\uuuuu(self)
254具有并行_后端(self._后端,n_作业=self._n_作业):
255返回[func(*args,**kwargs)
-->256用于自身项目中的func、args、kwargs]
257
258定义长度(自):
/home/.local/lib/python3.6/site-packages/joblb/parallel.py in(.0)
254具有并行_后端(self._后端,n_作业=self._n_作业):
255返回[func(*args,**kwargs)
-->256用于自身项目中的func、args、kwargs]
257
258定义长度(自):
/home/.local/lib/python3.6/site-packages/sklearn/employee//u base.py in\u parallel\u fit\u估计器(估计器,X,y,样本权重)
34提高
35.其他:
--->36估计量拟合(X,y)
37收益估计器
38
/home/.local/lib/python3.6/site-packages/sklearn/employee//u forest.py合适(self,X,y,sample\u weight)
319 self.n_输出u=y.shape[1]
320
-->321 y,扩展类重量=自身。验证类重量(y)
322
323如果getattr(y,“数据类型”,无)!=双重或非y.flags.Continental:
/home/.local/lib/python3.6/site-packages/sklearn/employee//u forest.py in\u validate\u y\u class\u weight(self,y)
585等级重量=自身等级重量
586扩展类权重=计算样本权重(类权重,
-->587(原件)
588
589返回y,扩展类重量
/home/.local/lib/python3.6/site-packages/sklearn/utils/class_weight.py in compute_sample_weight(class_weight,y,index)
161权重=计算类权重(类权重),
162班满,
-->163(满)
164
165权重k=权重k[np.搜索排序(类满,y满)]
/home/.local/lib/python3.6/site-packages/sklearn/utils/class_weight.py in compute_class_weight(class_weight,class,y)
63 i=np.分类(c类)
64如果i>=len(类)或类[i]!=c:
--->65 raise VALUERROR(“类标签{}不存在。”。格式(c))
66.其他:
67重量[i]=类别重量[c]
ValueError:类别标签无付款不存在。
这里的错误信息非常清楚:您正在为数据中不存在的“无付款”类分配权重。但该类存在于数据中。Rf.fit、lr.fit和nb.fit完美配合。在集成中使用类加权模型时返回此错误。如果我删除cla