Python 如何从GridSearchCV绘制网格分数?
我正在寻找一种从sklearn中的GridSearchCV绘制网格分数的方法。在本例中,我尝试为SVR算法网格搜索最佳gamma和C参数。我的代码如下所示:
# Grid-search C and gamma for an RBF SVR over log-spaced ranges.
C_range = 10.0 ** np.arange(-4, 4)
gamma_range = 10.0 ** np.arange(-4, 4)
param_grid = dict(gamma=gamma_range.tolist(), C=C_range.tolist())
grid = GridSearchCV(SVR(kernel='rbf', gamma=0.1), param_grid, cv=5)
grid.fit(X_train, y_train)
# grid_scores_ was removed in scikit-learn 0.20; cv_results_ is its replacement.
print(grid.cv_results_)
运行代码并打印网格分数后,我得到以下结果:
[mean: -3.28593, std: 1.69134, params: {'gamma': 0.0001, 'C': 0.0001}, mean: -3.29370, std: 1.69346, params: {'gamma': 0.001, 'C': 0.0001}, mean: -3.28933, std: 1.69104, params: {'gamma': 0.01, 'C': 0.0001}, mean: -3.28925, std: 1.69106, params: {'gamma': 0.1, 'C': 0.0001}, mean: -3.28925, std: 1.69106, params: {'gamma': 1.0, 'C': 0.0001}, mean: -3.28925, std: 1.69106, params: {'gamma': 10.0, 'C': 0.0001},etc]
我想根据gamma和C参数可视化所有分数(平均值)。我试图获得的图表应如下所示:
# Example call: plot the mean CV 'Accuracy' against 50 C values (x-axis),
# one curve per kernel.  grid_search is a fitted GridSearchCV instance;
# plot_grid_search is the helper function defined in a later answer.
plot_grid_search(grid_search.cv_results_,
'Accuracy',
list(np.linspace(0.001, 10, 50)),
['linear', 'rbf'],
'C',
'kernel')
其中x轴为伽马,y轴为平均分数(本例中为均方根误差),不同的直线代表不同的C值
from sklearn.svm import SVC
# sklearn.grid_search was removed in 0.20; model_selection is the new home.
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

digits = datasets.load_digits()
X = digits.data
y = digits.target

clf_ = SVC(kernel='rbf')
Cs = [1, 10, 100, 1000]
Gammas = [1e-3, 1e-4]
clf = GridSearchCV(clf_,
                   dict(C=Cs,
                        gamma=Gammas),
                   cv=2,
                   pre_dispatch='1*n_jobs',
                   n_jobs=1)
clf.fit(X, y)

# cv_results_ replaces the removed grid_scores_.  Candidates follow the
# deterministic ParameterGrid order (parameter names sorted alphabetically,
# the last one varying fastest), so C is the slow axis and gamma the fast one.
scores = clf.cv_results_['mean_test_score']
scores = np.array(scores).reshape(len(Cs), len(Gammas))

# One curve per C value, gamma on the x-axis.
for ind, i in enumerate(Cs):
    plt.plot(Gammas, scores[ind], label='C: ' + str(i))
plt.legend()
plt.xlabel('Gamma')
plt.ylabel('Mean score')
plt.show()
- 代码基于
- 唯一令人费解的是:sklearn是否会始终尊重C&Gamma的顺序->官方示例使用此“顺序”
参数网格的遍历顺序是确定的,因此可以直接对其进行重塑和绘制。大概是这样的:
# grid_scores_ (and its entries' mean_validation_score) were removed in
# scikit-learn 0.20; read the mean test scores from cv_results_ instead.
# ParameterGrid order is deterministic (parameters sorted alphabetically,
# the last one varying fastest), so with C and gamma the scores reshape
# to a (C, gamma) matrix.
scores = np.array(grid.cv_results_['mean_test_score'])
scores = scores.reshape(len(C_range), len(gamma_range))
# One curve per C value, gamma on the x-axis.
for c_scores in scores:
    plt.plot(gamma_range, c_scores, '-')
@sascha显示的代码是正确的。但是,
grid_scores_
属性很快就会被弃用。最好使用 cv_results_
属性。
它可以以与@sascha方法类似的方式实现:
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):
    """Plot mean CV score against grid_param_1, one curve per grid_param_2 value.

    cv_results   -- the cv_results_ dict of a fitted GridSearchCV
    grid_param_1 -- parameter values plotted on the x-axis
    grid_param_2 -- parameter values drawn as separate curves
    name_param_1 / name_param_2 -- display names for those parameters
    """
    # Get Test Scores Mean and std for each grid search.
    # Candidates follow ParameterGrid order, so the flat arrays reshape to
    # (param_2, param_1) when param_2 is the slower-varying parameter.
    scores_mean = cv_results['mean_test_score']
    scores_mean = np.array(scores_mean).reshape(len(grid_param_2), len(grid_param_1))

    scores_sd = cv_results['std_test_score']
    scores_sd = np.array(scores_sd).reshape(len(grid_param_2), len(grid_param_1))

    # Plot Grid search scores
    _, ax = plt.subplots(1, 1)

    # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
    for idx, val in enumerate(grid_param_2):
        ax.plot(grid_param_1, scores_mean[idx, :], '-o',
                label=name_param_2 + ': ' + str(val))

    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid('on')

# Calling method
plot_grid_search(pipe_grid.cv_results_, n_estimators, max_features, 'N Estimators', 'Max Features')
上述结果如下图所示:
我想做一些类似的事情(但可扩展到大量参数),下面是我的解决方案,用于生成输出的群集图:
# grid_scores_ was removed in scikit-learn 0.20; build the frame from
# cv_results_ instead and sort by the mean CV test score.
score = pd.DataFrame(gs_clf.cv_results_).sort_values(by='mean_test_score', ascending=False)
for i in parameters.keys():
    print(i, len(parameters[i]), parameters[i])
    # Expand each searched parameter into its own column
    # ('params' holds the per-candidate parameter dicts).
    score[i] = score['params'].apply(lambda x: x[i])
l = ['mean_test_score'] + list(parameters.keys())
for i in list(parameters.keys()):
    # One swarm plot per searched parameter: score distribution per value.
    sns.swarmplot(data=score[l], x=i, y='mean_test_score')
    # plt.savefig('170705_sgd_optimisation//'+i+'.jpg', dpi = 100)
    plt.show()
这里有一个利用的解决方案。此方法的优点是,它允许您在搜索2个以上的参数时绘制结果
import seaborn as sns
import pandas as pd

def plot_cv_results(cv_results, param_x, param_z, metric='mean_test_score'):
    """Plot a grid-search metric against one parameter, colored by another.

    cv_results -- cv_results_ attribute of a GridSearchCV instance (or similar)
    param_x    -- name of grid search parameter to plot on x axis
    param_z    -- name of grid search parameter to plot by line color
    metric     -- cv_results_ column to plot on the y axis
    """
    cv_results = pd.DataFrame(cv_results)
    # GridSearchCV stores each searched parameter in a 'param_<name>' column.
    col_x = 'param_' + param_x
    col_z = 'param_' + param_z
    fig, ax = plt.subplots(1, 1, figsize=(11, 8))
    # pointplot aggregates the duplicate x values produced by any *other*
    # grid parameters and draws a 99% bootstrap confidence interval.
    sns.pointplot(x=col_x, y=metric, hue=col_z, data=cv_results,
                  ci=99, n_boot=64, ax=ax)
    ax.set_title("CV Grid Search Results")
    ax.set_xlabel(param_x)
    ax.set_ylabel(metric)
    ax.legend(title=param_z)
    return fig
xgboost的使用示例:
from xgboost import XGBRegressor
# GridSearchCV lives in sklearn.model_selection, not the sklearn top level.
from sklearn.model_selection import GridSearchCV

parameters = {
    'max_depth': [3, 6, 9, 12],
    'gamma': [0, 1, 10, 20, 100],
    'min_child_weight': [1, 4, 16, 64, 256],
}
model = XGBRegressor()
grid = GridSearchCV(model, parameters, scoring='neg_mean_squared_error')
grid.fit(...)  # replace ... with your training data, e.g. grid.fit(X, y)
fig = plot_cv_results(grid.cv_results_, 'gamma', 'min_child_weight')
这将生成一个图形,显示x轴上的gamma
正则化参数、线颜色的min_child_weight
正则化参数,以及任何其他网格搜索参数(在这种情况下max_depth
)将通过seaborn点图99%置信区间的扩展来描述。
*注意,在下面的示例中,我稍微改变了上面代码的美学。
当我试图在随机森林中绘制平均分数与树木数量的对比图时,这对我很有效。函数的作用是:找出平均值
# Collapse the grid-search results by averaging groups of 5 entries
# (the author had 5 values of another searched parameter per n_estimators
# value).
# NOTE(review): reshape(-1, 5) followed by mean(axis=0) averages down the
# *columns* of the reshaped matrix; this only matches the intended grouping
# if n_estimators is the fast-varying parameter in ParameterGrid order —
# confirm against the actual param_grid, otherwise axis=1 may be intended.
param_n_estimators = cv_results['param_n_estimators']
param_n_estimators = np.array(param_n_estimators)
mean_n_estimators = np.mean(param_n_estimators.reshape(-1,5), axis=0)
mean_test_scores = cv_results['mean_test_score']
mean_test_scores = np.array(mean_test_scores)
mean_test_scores = np.mean(mean_test_scores.reshape(-1,5), axis=0)
mean_train_scores = cv_results['mean_train_score']
mean_train_scores = np.array(mean_train_scores)
mean_train_scores = np.mean(mean_train_scores.reshape(-1,5), axis=0)
为了在调优多个超参数时绘制结果,我所做的是将所有参数固定为它们的最佳值,除了一个参数,并绘制另一个参数的每个值的平均分数
def plot_search_results(grid):
    """Plot train/test mean scores (with std error bars) per hyper-parameter.

    For each searched parameter, all *other* parameters are fixed at their
    best values and the scores are plotted against that parameter's values.

    Params:
        grid: A trained GridSearchCV object, fitted with a single
              param_grid dict and return_train_score=True (mean/std train
              scores are read from cv_results_).
    """
    ## Results from grid search
    results = grid.cv_results_
    means_test = results['mean_test_score']
    stds_test = results['std_test_score']
    means_train = results['mean_train_score']
    stds_train = results['std_train_score']

    ## One boolean mask per parameter: candidates where it equals its best value
    masks = []
    masks_names = list(grid.best_params_.keys())
    for p_k, p_v in grid.best_params_.items():
        masks.append(list(results['param_' + p_k].data == p_v))

    params = grid.param_grid

    ## Ploting results
    fig, ax = plt.subplots(1, len(params), sharex='none', sharey='all', figsize=(20, 5))
    # subplots returns a bare Axes (not an array) for a single parameter;
    # wrap it so ax[i] indexing below works in both cases.
    ax = np.atleast_1d(ax)
    fig.suptitle('Score per parameter')
    fig.text(0.04, 0.5, 'MEAN SCORE', va='center', rotation='vertical')
    for i, p in enumerate(masks_names):
        # Candidates where every *other* parameter sits at its best value.
        # (The original also built an unused dict and had a no-op bare
        # expression here; both removed.)
        m = np.stack(masks[:i] + masks[i + 1:])
        best_parms_mask = m.all(axis=0)
        best_index = np.where(best_parms_mask)[0]
        x = np.array(params[p])
        y_1 = np.array(means_test[best_index])
        e_1 = np.array(stds_test[best_index])
        y_2 = np.array(means_train[best_index])
        e_2 = np.array(stds_train[best_index])
        ax[i].errorbar(x, y_1, e_1, linestyle='--', marker='o', label='test')
        ax[i].errorbar(x, y_2, e_2, linestyle='-', marker='^', label='train')
        ax[i].set_xlabel(p.upper())
    plt.legend()
    plt.show()
我在xgboost上使用了不同学习率、最大深度和估计器数量的网格搜索
# Hyper-parameter grid for XGBoost: tree depth, number of boosting rounds,
# and learning rate (3 * 8 * 3 = 72 candidates).
gs_param_grid = {'max_depth': [3,4,5],
'n_estimators' : [x for x in range(3000,5000,250)],
'learning_rate':[0.01,0.03,0.1]
}
gbm = XGBRegressor()
# 4-fold CV scored by negated MSE (sklearn maximizes scores, hence the
# negation; take sqrt(-score) later to recover the RMSE).
grid_gbm = GridSearchCV(estimator=gbm,
param_grid=gs_param_grid,
scoring='neg_mean_squared_error',
cv=4,
verbose=1
)
grid_gbm.fit(X_train,y_train)
为了创建误差与具有不同学习率的估计器数量的关系图,我使用了以下方法:
# Collect RMSE per candidate at the best max_depth, then plot RMSE vs.
# n_estimators with one curve per learning rate.
y=[]
cvres = grid_gbm.cv_results_
best_md=grid_gbm.best_params_['max_depth']
la=gs_param_grid['learning_rate']
n_estimators=gs_param_grid['n_estimators']
# scoring was 'neg_mean_squared_error', so sqrt(-score) recovers the RMSE.
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
if params["max_depth"]==best_md:
y.append(np.sqrt(-mean_score))
# ParameterGrid orders candidates with the alphabetically-last parameter
# varying fastest; after filtering max_depth, the scores reshape to
# (learning_rate, n_estimators).
y=np.array(y).reshape(len(la),len(n_estimators))
# IPython/Jupyter magic — remove this line when running as a plain .py script.
%matplotlib inline
plt.figure(figsize=(8,8))
for y_arr, label in zip(y, la):
plt.plot(n_estimators, y_arr, label=label)
plt.title('Error for different learning rates(keeping max_depth=%d(best_param))'%best_md)
plt.legend()
plt.xlabel('n_estimators')
plt.ylabel('Error')
plt.show()
可以在此处查看该图:
请注意,同样可以为不同最大深度(或根据用户情况的任何其他参数)的误差与估计器数量创建图表。以下是完整的工作代码,将生成绘图,以便您可以使用GridSearchCV完全可视化最多3个参数的变化。这是您在运行代码时将看到的内容:
from sklearn import tree
from sklearn import model_selection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data, digits.target
# Per-algorithm search spec: [name, estimator, param1 name, param1 values,
# param2 name, param2 values, param3 name, param3 values].
Algo = [['DecisionTreeClassifier', tree.DecisionTreeClassifier(), # algorithm
'max_depth', [1, 2, 4, 6, 8, 10, 12, 14, 18, 20, 22, 24, 26, 28, 30], # Parameter1
'max_features', ['sqrt', 'log2', None], # Parameter2
'criterion', ['gini', 'entropy']]] # Parameter3
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2, title):
    """Plot mean CV score (with a +/- one-std band) against grid_param_1,
    one curve per grid_param_2 value.

    cv_results   -- cv_results_ dict of a fitted GridSearchCV
    grid_param_1 -- values plotted on the x-axis
    grid_param_2 -- values drawn as separate curves
    name_param_1 / name_param_2 -- the parameter names used in the grid
    title        -- figure title

    Raises KeyError if some (param_1, param_2) combination is missing from
    cv_results (the original silently re-used the previous candidate's
    score in that case, misaligning the curves).
    """
    # Get Test Scores Mean and std for each grid search.
    # Compare values as strings so numeric / None / str parameters all match.
    grid_param_1 = list(str(e) for e in grid_param_1)
    grid_param_2 = list(str(e) for e in grid_param_2)
    scores_mean = cv_results['mean_test_score']
    scores_std = cv_results['std_test_score']
    params_set = cv_results['params']

    # Index each candidate's (mean, std) by its (param_1, param_2) pair once,
    # instead of rescanning params_set for every grid point.
    by_pair = {}
    for i, candidate in enumerate(params_set):
        key = (str(candidate[name_param_1]), str(candidate[name_param_2]))
        by_pair[key] = (scores_mean[i], scores_std[i])

    scores_organized = {}
    std_upper = {}
    std_lower = {}
    for p2 in grid_param_2:
        scores_organized[p2] = []
        std_upper[p2] = []
        std_lower[p2] = []
        for p1 in grid_param_1:
            mean, std = by_pair[(p1, p2)]
            scores_organized[p2].append(mean)
            std_upper[p2].append(mean + std)
            std_lower[p2].append(mean - std)

    _, ax = plt.subplots(1, 1)
    # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
    for key in scores_organized:
        ax.plot(grid_param_1, scores_organized[key], '-o', label=name_param_2 + ': ' + str(key))
        # Shaded +/- one-std band around each curve.
        ax.fill_between(grid_param_1, std_lower[key], std_upper[key], alpha=0.1)

    ax.set_title(title)
    ax.set_xlabel(name_param_1)
    ax.set_ylabel('CV Average Score')
    ax.legend(loc="best")
    ax.grid('on')
    plt.show()
dataset = 'Titanic'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# shuffle=True is required for random_state to take effect (modern sklearn
# raises ValueError if random_state is set without shuffling).
cv_split = model_selection.KFold(n_splits=10, shuffle=True, random_state=2)
for i in range(len(Algo)):
    # BUG FIX: the original indexed Algo[0] everywhere, so every loop
    # iteration re-used the first algorithm; index with the loop variable.
    name = Algo[i][0]
    alg = Algo[i][1]
    param_1_name = Algo[i][2]
    param_1_range = Algo[i][3]
    param_2_name = Algo[i][4]
    param_2_range = Algo[i][5]
    param_3_name = Algo[i][6]
    param_3_range = Algo[i][7]
    # One plot per value of the third parameter: grid-search the first two
    # parameters with param_3 held fixed.
    for p in param_3_range:
        # grid search
        param = {
            param_1_name: param_1_range,
            param_2_name: param_2_range,
            param_3_name: [p]
        }
        grid_test = GridSearchCV(alg, param_grid=param, scoring='accuracy', cv=cv_split)
        grid_test.fit(X_train, y_train)
        plot_grid_search(grid_test.cv_results_, param[param_1_name], param[param_2_name], param_1_name, param_2_name, dataset + ' GridSearch Scores: ' + name + ', ' + param_3_name + '=' + str(p))
    # Final search over the full 3-parameter grid; configure the estimator
    # with the overall best combination.
    param = {
        param_1_name: param_1_range,
        param_2_name: param_2_range,
        param_3_name: param_3_range
    }
    grid_final = GridSearchCV(alg, param_grid=param, scoring='accuracy', cv=cv_split)
    grid_final.fit(X_train, y_train)
    best_params = grid_final.best_params_
    alg.set_params(**best_params)
@nathandrake尝试根据@david alvarez的代码改编以下内容:
def plot_grid_search(cv_results, metric, grid_param_1, grid_param_2, name_param_1, name_param_2):
    """Plot the mean CV value of *metric* against grid_param_1.

    When grid_param_2 is given, one curve is drawn per value of that
    parameter; when it is None, a single curve is plotted.
    """
    # GridSearchCV suffixes each scorer's columns with the metric name.
    scores_mean = cv_results[('mean_test_' + metric)]
    scores_sd = cv_results[('std_test_' + metric)]

    two_params = grid_param_2 is not None
    if two_params:
        # Rows follow grid_param_2, columns follow grid_param_1.
        shape = (len(grid_param_2), len(grid_param_1))
        scores_mean = np.array(scores_mean).reshape(shape)
        scores_sd = np.array(scores_sd).reshape(shape)

    # Set plot style
    plt.style.use('seaborn')

    # Plot Grid search scores
    _, ax = plt.subplots(1, 1)
    if two_params:
        # grid_param_1 on the x-axis; one colored curve per grid_param_2 value.
        for row, p2_value in enumerate(grid_param_2):
            ax.plot(grid_param_1, scores_mean[row, :], '-o',
                    label=name_param_2 + ': ' + str(p2_value))
    else:
        # Single-parameter search: just one curve.
        ax.plot(grid_param_1, scores_mean, '-o')

    ax.set_title("Grid Search", fontsize=20, fontweight='normal')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average ' + str.capitalize(metric), fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid('on')
如您所见,我添加了支持包含多个指标的网格搜索的功能。您只需在对p的调用中指定要绘制的度量