xgboost(python)中的多类分类
这是我第一次做多类分类。我有 Xtrn 和 Ytrn 两组数据,其中 Ytrn 有 5 个取值 [0,1,2,3,4]。但一运行就会报错“multiclass format is not supported”(不支持多类格式)。标签:python, python-2.7, machine-learning, xgboost。数据示例如下:
Xtrn Ytrn
-1.35173485 1.50224188 2.04951167 0.43759658 0.24381777 2
2.81047260 1.31259056 1.39265240 0.16384002 0.65438366 3
2.32878809 -1.92845940 -2.06453246 0.73132270 0.11771229 2
-0.12810555 -2.07268765 -2.40760215 0.97855042 0.11144164 1
1.88682063 0.75792329 -0.09754671 0.46571931 0.62111648 2
-1.09361266 1.74758304 2.49960891 0.36679883 0.88895562 2
0.71760095 -1.30711698 -2.15681966 0.33700593 0.07171119 2
4.60060308 -1.60544855 -1.88996123 0.94500124 0.63776116 4
-0.84223064 2.78233537 3.07299711 0.31470071 0.34424704 1
-0.71236435 0.53140549 0.46677096 0.12320728 0.58829090 2
-0.35333909 1.12463059 1.70104349 0.89084673 0.16585229 2
3.04322100 -1.36878116 -2.31056167 0.81178387 0.04095645 1
-1.04088918 -1.97497570 -1.93285343 0.54101882 0.02528487 1
-0.41624939 0.54592833 0.95458283 0.40004902 0.55062705 2
-1.77706795 0.29061278 0.68186697 0.17430716 0.75095729 0
这是代码:
#import data
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from sklearn import metrics, cross_validation, grid_search, preprocessing
# Load features, labels and the hold-out features. The feature files are
# read as semicolon-delimited with no header row, and '?' is parsed as NaN.
Xtrn = pd.read_csv('x_train_secret.csv', header=None, delimiter=';', na_values='?')
Ytrn = pd.read_csv('y_train_secret.csv', header=None)
Test = pd.read_csv('x_test_secret.csv', header=None, delimiter=';', na_values='?')
# Number of distinct label values found in Ytrn.
n_classes_ = len(np.unique(Ytrn))
# Learning model: hold out 30% of the rows, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(Xtrn, Ytrn, test_size=0.30, random_state=42)
xgb_model = xgb.XGBClassifier(objective='multi:softmax')
# NOTE(review): this assignment is dead code -- xgb_params is rebound on the
# very next statement, so 'num_class' never reaches the grid search.
xgb_params = [{'num_class': n_classes_}]
# Parameter grid actually used: n_estimators = 50, 100, ..., 500 (10 candidates).
xgb_params = [
{
"n_estimators": range(50, 501, 50),
}
]
# Cross-validation: 5 stratified shuffle splits of y_train, 30% held out in each.
cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.3, random_state=42)
# NOTE(review): scoring='roc_auc' is what triggers
# "ValueError: multiclass format is not supported" when fit() scores a fold;
# per the traceback, the scorer only accepts "binary" or
# "multilabel-indicator" targets. A multiclass-capable scorer (e.g. 'accuracy')
# would be needed here instead.
xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3)
xgb_grid.fit(X_train, y_train)
这是报错信息:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=50 .................................................
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-233-77d3e8d4b8c3> in <module>()
10
11 xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3)
---> 12 xgb_grid.fit(X_train, y_train)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
827
828 """
--> 829 return self._fit(X, y, ParameterGrid(self.param_grid))
830
831
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
571 self.fit_params, return_parameters=True,
572 error_score=self.error_score)
--> 573 for parameters in parameter_iterable
574 for train, test in cv)
575
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1682
1683 else:
-> 1684 test_score = _score(estimator, X_test, y_test, scorer)
1685 if return_train_score:
1686 train_score = _score(estimator, X_train, y_train, scorer)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1739 score = scorer(estimator, X_test)
1740 else:
-> 1741 score = scorer(estimator, X_test, y_test)
1742 if hasattr(score, 'item'):
1743 try:
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self, clf, X, y, sample_weight)
169 y_type = type_of_target(y)
170 if y_type not in ("binary", "multilabel-indicator"):
--> 171 raise ValueError("{0} format is not supported".format(y_type))
172
173 if is_regressor(clf):
ValueError: multiclass format is not supported
对 10 组候选参数各进行 5 折拟合,共 50 次拟合
[CV] n_estimators=50 .................................................
---------------------------------------------------------------------------
ValueError回溯(最近一次调用上次)
在()
10
11 xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3)
---> 12 xgb_grid.fit(X_train, y_train)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self,X,y)
827
828 """
-->829返回自拟合(X,y,参数网格(自参数网格))
830
831
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in_fit(self,X,y,parameter_iterable)
571自拟合参数,返回参数=真,
572错误分数=自我错误分数)
-->573用于参数_iterable中的参数
574用于列车,在cv中进行试验)
575
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in uuuu调用(self,iterable)
756#被派遣。特别是这涵盖了边缘
757#与耗尽迭代器一起使用的并行情况。
-->758自行调度一批时(迭代器):
759自迭代=真
760其他:
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc在dispatch\u one\u批处理中(self,迭代器)
606返回错误
607其他:
-->608自我派遣(任务)
609返回真值
610
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in_dispatch(self,batch)
569调度时间戳=time.time()
570 cb=BatchCompletionCallBack(调度时间戳,len(批处理),self)
-->571作业=self.\u后端.apply\u异步(批处理,回调=cb)
572自我作业。附加(作业)
573
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib//u parallel\u backends.pyc in apply\u async(self、func、callback)
107 def apply_async(self、func、callback=None):
108“计划要运行的func”
-->109结果=立即结果(func)
110如果回调:
111回调(结果)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib//u parallel\u backends.pyc in\uuuu init\uuuuuuu(self,batch)
324#不要延迟应用程序,以免保留输入
325#内存中的参数
-->326 self.results=批处理()
327
328 def get(自我):
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in(self)
129
130 def呼叫(自我):
-->131返回[func(*args,**kwargs),用于self.items中的func、args、kwargs]
132
133定义长度(自):
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in_-fit_和_分数(估计器、X、y、计分器、训练、测试、详细、参数、拟合参数、返回_-train_分数、返回_参数、错误_分数)
1682
1683其他:
->1684测试分数=_分数(估计员、X测试、y测试、计分员)
1685如果返回列车评分:
1686训练分数=_分数(估计员、X训练、y训练、计分员)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in_分数(估计员、X_测试、y_测试、计分员)
1739分=记分员(估计员,X_检验)
1740其他:
->1741分=记分员(估计员、X_检验、y_检验)
1742如果hasattr(分数,'项目'):
1743尝试:
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in\uuuuu调用(self、clf、X、y、样本重量)
169 y_类型=_目标的类型(y)
170如果y_类型不在(“二进制”、“多标签指示器”):
--> 171 raise ValueError("{0} format is not supported".format(y_type))
172
173 if is_regressor(clf):
ValueError: multiclass format is not supported(不支持多类格式)
我找到了答案。roc_auc 评分只适用于二元分类,多类分类需要换用其他评分方式(例如 accuracy)。
另外,xgb_params = [{'num_class': n_classes_}] 这一行需要删除。