如何使用hyperopt为python中的内核PCA选择hyperparameters?
我正在研究应用核主成分分析(KPCA)来降低特征矩阵集的维数,以获得数据点的聚类。我在scikit学习包中浏览了KPCA中使用的参数,了解到如果选择了其中一个参数(例如,如果选择了gamma,则不使用度和系数),则有些参数应该有效。此外,我还浏览了以下链接,了解用于分类模型的超参数方法:如何使用hyperopt为python中的内核PCA选择hyperparameters?,python,scikit-learn,pca,hyperparameters,hyperopt,Python,Scikit Learn,Pca,Hyperparameters,Hyperopt,我正在研究应用核主成分分析(KPCA)来降低特征矩阵集的维数,以获得数据点的聚类。我在scikit学习包中浏览了KPCA中使用的参数,了解到如果选择了其中一个参数(例如,如果选择了gamma,则不使用度和系数),则有些参数应该有效。此外,我还浏览了以下链接,了解用于分类模型的超参数方法: 我试图编码hyperopt代码并将其与KPCA相结合,但是,在处理PCA模型评分方面,我不断出现错误。我知道KPCA没有分数来确定PCA模型的准确性,因此,我如何克服这个错误?我尝试了几种计分方法,
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, IncrementalPCA
from hyperopt import hp, tpe, atpe, fmin, Trials, rand, STATUS_OK
# Implementing Hyperparamater method:
models = {'pca' : PCA,
'kpca' : KernelPCA,
'spca' : SparsePCA,
# 'ipca' : IncrementalPCA
}
def search_space(model):
# Initialising variables:
model = model.lower()
space = {}
# Calling the models:
if model == 'pca':
space = {'svd_solver' : hp.choice('svd_solver', ["auto", "full", "arpack", "randomized"]),
}
elif model == 'kpca':
space = {'kernel' : hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed']),
'gamma' : hp.choice('gamma', np.arange(0.03, 0.05, 0.002)),
'degree' : hp.choice('degree', range(1, 10, 1)),
'coef0' : hp.choice('coef0', np.arange(1, 10, 0.2))
}
elif model == 'spca':
space = {'alpha' : hp.choice('alpha', np.arange(1.0, 15.0, 0.2)),
'ridge_alpha' : hp.choice('ridge_alpha', np.linspace(0.01, 0.3, 30)),
'method' : hp.choice('method', ['lars', 'cd']),
'max_iter' : hp.choice('max_iter', [1000, 1500, 2000, 2500, 3000])
}
# elif model == 'ipca':
# space = {'batch_size' : hp.choice('batch_size', ['gini', 'entropy']),
# }
space['model'] = model
return space
def obj_fnc(params):
model = params.get('model').lower()
# X_ = scale_normalize(params, X[:])
del params['model']
clf = models[model](**params)
return (get_acc_status(clf, X))
def get_acc_status(clf, X):
X_reduced = clf.fit_transform(X)
# X_prereduced = clf.fit_inverse_transform(X_reduced)
# acc = -1 * mean_squared_error(X, X_prereduced)
X_prereduced = clf.inverse_transform(X_reduced)
# acc = -1 * mean_absolute_error(X, X_prereduced)
acc = -1 * r2_score(X, X_prereduced)
# acc = cross_val_score(clf, X).mean()
return {'loss': -acc, 'status': STATUS_OK}
##### Hyperparameter optimisation:
# Running Bayesian Optimisation to get the best parameters:
start = time.time()
# Create the algorithms
tpe_algo = tpe.suggest
# rand_algo = rand.suggest
# atpe_algo = atpe.suggest
# Assigning model:
model = 'kpca'
# Creating the trial objects:
hypopt_trials = Trials()
# Getting the best parameters:
best_params = fmin(obj_fnc, search_space(model), algo=tpe_algo, max_evals=500, trials=hypopt_trials)
print("Best params: ", best_params)
print('Best accuracy: ', hypopt_trials.best_trial['result']['loss'])
print("[INFO] Baye. Opt. search took {:.2f} seconds".format(time.time() - start))
# Calling parameters:
## PCA:
svd_solver = ["auto", "full", "arpack", "randomized"]
## KPCA:
kernel = ["linear", "poly", "rbf", "sigmoid", "cosine", "precomputed"]
gamma = np.arange(0.1, 0.9, 0.01)
degree = range(1, 10, 1)
coef0 = np.arange(1, 10, 0.2)
kernel_gamma = ["poly", "rbf", "sigmoid"]
kernel_degree = "poly"
kernel_coef0 = "sigmoid"
## SPCA:
alpha = np.arange(1.0, 15.0, 0.2)
ridge_alpha = np.linspace(0.01, 0.3, 30)
method = ['lars', 'cd']
max_iter = [1000, 1500, 2000, 2500, 3000]
# Creating the PCA models:
# pca = PCA(n_components=2, svd_solver=svd_solver[best_params['svd_solver'])
if any(x in best_params for x in kernel_gamma):
pca = KernelPCA(n_components=2, kernel=kernel[best_params['kernel']], gamma='{0}'.format(gamma[best_params['gamma']]))
if any(x in best_params for x in kernel_degree):
pca = KernelPCA(n_components=2, kernel=kernel[best_params['kernel']], gamma='{0}'.format(gamma[best_params['gamma']]), degree='{0}'.format(degree[best_params['degree']]), coef0='{0}'.format(coef0[best_params['coef0']]))
if any(x in best_params for x in kernel_coef0):
pca = KernelPCA(n_components=2, kernel=kernel[best_params['kernel']], gamma='{0}'.format(gamma[best_params['gamma']]), coef0='{0}'.format(coef0[best_params['coef0']]))
# pca = SparsePCA(n_components=2, alpha='{0}'.format(alpha[best_params['alpha']]), ridge_alpha='{0}'.format(ridge_alpha[best_params['ridge_alpha']]), method=method[best_params['method']], max_iter='{0}'.format(max_iter[best_params['max_iter']]))
# pca = IncrementalPCA(n_components=2)
print('Model: ', pca)
PrincipalComponents = pca.fit_transform(X_std)
principalDf = pd.DataFrame(data = PrincipalComponents, columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, dataframe[['Label']]], axis = 1)
print('Principal Component Analysis: ')
print(principalDf)
ValueError: There are significant negative eigenvalues (1.11715 of the maximum positive). Either the matrix is not PSD, or there was an issue while computing the eigendecomposition of the matrix.
ValueError: Precomputed metric requires shape (n_queries, n_indexed). Got (50, 14) for 50 indexed.