Python LightGBM vs Sklearn LightGBM - bug in implementation - exact same parameters giving different results

Tags: python, python-3.x, machine-learning, scikit-learn, lightgbm

I am getting different results when passing the exact same parameters to LightGBM's native API and to sklearn's implementation of LightGBM. At first I was getting exactly the same results from both, but after making some changes to the code I can no longer work out why they differ: the performance metrics and the feature importances now come out differently. Please help me figure out what mistake I am making. It could be an error in the way I am using the native LightGBM library, or a bug in sklearn's implementation. Link explaining why we should get identical results -


How different are the results? @Benoitdementhire - thanks for getting back. The order of the feature importances is different (the scores differ), and all the performance metrics (accuracy, roc_auc, f1 score, precision, recall) differ by 5-10%. Also, all of these were exactly the same initially. A bit surprised by your use of to_clipboard - why use it? To copy the results into Excel. That is not the problem, though, since the results still differ even if I just print the metrics or the importances.
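For context on why identical parameters should give identical outputs: the sklearn estimator is a thin wrapper around the same booster, so when every parameter is pinned explicitly (including the seed) the two APIs normally agree to floating-point precision. Below is a minimal sketch of that check on synthetic data - make_classification and the parameter values are placeholders, not the data or tuned settings from the question:

import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
params = {'objective': 'binary', 'learning_rate': 0.05, 'num_leaves': 31,
          'min_child_samples': 20, 'seed': 42, 'verbose': -1}

# Same parameters, same data, same number of rounds for both APIs
booster = lgb.train(params, lgb.Dataset(X, y), num_boost_round=100)
clf = lgb.LGBMClassifier(n_estimators=100, **params).fit(X, y)

# If the effective configurations really match, the raw scores should too
print(np.allclose(booster.predict(X), clf.predict_proba(X)[:, 1]))

If a check like this passes but the tuned models still diverge, the likely culprit is how the parameter dictionaries are assembled for each API rather than LightGBM itself. The full code from the question follows.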

import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import train_test_split

# df_dummy, df_merge and labels come from preprocessing not shown in the question
x_train, x_test, y_train, y_test = train_test_split(
    df_dummy[df_merge.columns], labels, test_size=0.25, random_state=42)

n_folds = 5

lgb_train = lgb.Dataset(x_train, y_train)

def objective(params, n_folds = n_folds):
    """Objective function for Gradient Boosting Machine Hyperparameter Tuning"""

    print(params)

    # hyperopt's quniform draws are floats; LightGBM expects ints for these
    params['max_depth'] = int(params['max_depth'])
    params['num_leaves'] = int(params['num_leaves'])
    params['min_child_samples'] = int(params['min_child_samples'])
    params['subsample_freq'] = int(params['subsample_freq'])

    # Perform n-fold cross-validation with these hyperparameters,
    # using early stopping and evaluating on ROC AUC
    cv_results = lgb.cv(params, lgb_train, nfold=n_folds, num_boost_round=10000, 
                        early_stopping_rounds=100, metrics='auc')

    # Extract the best score
    best_score = max(cv_results['auc-mean'])

    # Loss must be minimized
    loss = 1 - best_score
    # Number of boosting rounds at the best CV score (argmax is 0-indexed)
    num_iteration = int(np.argmax(cv_results['auc-mean']) + 1)

    # Append this trial's result to the results file
    with open(out_file, 'a') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, params, num_iteration])

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK, 'estimators': num_iteration}

space = {
    'min_child_samples': hp.quniform('min_child_samples', 5, 100, 5), 
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'max_depth' : hp.quniform('max_depth', 3, 10, 1),
    'subsample' : hp.quniform('subsample', 0.6, 1, 0.05),
    'num_leaves': hp.quniform('num_leaves', 20, 150, 1),  
    'subsample_freq': hp.quniform('subsample_freq',0,10,1),
    'min_gain_to_split': hp.quniform('min_gain_to_split', 0.01, 0.1, 0.01),

    # Fixed settings; constants are not sampled by hyperopt, so they do not
    # appear in the dict returned by fmin
    'learning_rate': 0.05,
    'objective': 'binary',
}
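
# For reference, hyperopt's quniform draws come back as floats (e.g. num_leaves=87.0),
# which is why the objective above casts the integer-valued parameters with int().
# A single raw draw from the space can be inspected like this (sample lives in
# hyperopt.pyll.stochastic in current hyperopt releases):
from hyperopt.pyll.stochastic import sample
print(sample(space))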

out_file = 'results/gbm_trials.csv'
# Create the results file and write the header row
with open(out_file, 'w') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(['loss', 'params', 'estimators'])

trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=10)
bayes_trials_results = sorted(trials.results, key = lambda x: x['loss'])

results = pd.read_csv('results/gbm_trials.csv')

# Sort with best scores on top and reset index for slicing
results.sort_values('loss', ascending = True, inplace = True)
results.reset_index(inplace = True, drop = True)
results.head()
best_bayes_estimators = int(results.loc[0, 'estimators'])

# fmin returns quniform values as floats, so cast the integer parameters back
best['max_depth'] = int(best['max_depth'])
best['num_leaves'] = int(best['num_leaves'])
best['min_child_samples'] = int(best['min_child_samples'])

# Train the final model for ~10% more rounds than the best CV iteration
num_boost_round = int(best_bayes_estimators * 1.1)
best['objective'] = 'binary'
best['boosting_type'] = 'gbdt'

best['subsample_freq'] = int(best['subsample_freq'])

# Native (original) LightGBM API

best_gbm = lgb.train(params=best, train_set=lgb_train, num_boost_round=num_boost_round)
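
# A hedged aside on reproducibility: if repeated lgb.cv / lgb.train runs drift,
# pinning LightGBM's seeds removes one source of variation. 'seed', 'num_threads'
# and (in recent LightGBM releases) 'deterministic' are the relevant parameters;
# this is a sketch of a seeded variant, not the tuned model above.
seeded_params = dict(best, seed=42, deterministic=True, num_threads=1)
reproducible_gbm = lgb.train(params=seeded_params, train_set=lgb_train,
                             num_boost_round=num_boost_round)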

print('Plotting feature importances...')
ax = lgb.plot_importance(best_gbm, max_num_features=15)
plt.show()

feature_imp = pd.DataFrame()
feature_imp["feature"] = list(x_train.columns)
feature_imp["importance_gain"] = best_gbm.feature_importance(importance_type='gain')
feature_imp["importance_split"] = best_gbm.feature_importance(importance_type='split')
feature_imp.to_clipboard()

y_pred_score = best_gbm.predict(x_test)

roc_auc_score_list = []
f1_score_list = []
accuracy_score_list = []
precision_score_list = []
recall_score_list = []

thresholds = [0.4,0.5,0.6,0.7]
for threshold in thresholds:
    print("threshold is {}".format(threshold))
    y_pred = np.where(y_pred_score>=threshold, 1, 0)
    print(roc_auc_score(y_test,y_pred_score))
    print(f1_score(y_test,y_pred))
    print(accuracy_score(y_test,y_pred))
    print(precision_score(y_test,y_pred))
    print(recall_score(y_test,y_pred))

    roc_auc_score_list.append(roc_auc_score(y_test,y_pred_score))
    f1_score_list.append(f1_score(y_test,y_pred))
    accuracy_score_list.append(accuracy_score(y_test,y_pred))
    precision_score_list.append(precision_score(y_test,y_pred))
    recall_score_list.append(recall_score(y_test,y_pred))

performance_metrics = pd.DataFrame(
        {'thresholds':thresholds,
         'roc_auc_score':roc_auc_score_list,
         'f1_score':f1_score_list,
         'accuracy_score':accuracy_score_list,
         'precision_score':precision_score_list,
         'recall_score':recall_score_list })

performance_metrics.transpose().to_clipboard()

#Sklearn's Implementation of LightGBM

best_sk = dict(best)
# The sklearn wrapper exposes this parameter as min_split_gain
# (min_gain_to_split is the native alias), hence the rename below
del best_sk['min_gain_to_split']
sk_best_gbm = lgb.LGBMClassifier(**best_sk,
                                 n_estimators=num_boost_round,
                                 learning_rate=0.05,
                                 min_split_gain=best['min_gain_to_split'])
sk_best_gbm.fit(x_train, y_train)

sk_best_gbm.get_params()
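
# One quick way to spot parameters that are set on one side but not the other
# (a common cause of divergence between the two APIs): diff the wrapper's
# resolved parameters against the dict handed to lgb.train. Sketch only; the
# exact set of keys returned by get_params() depends on the LightGBM version.
sk_params = sk_best_gbm.get_params()
for key in sorted(set(best) | set(sk_params)):
    if best.get(key) != sk_params.get(key):
        print(key, '-> lgb.train:', best.get(key), '| sklearn:', sk_params.get(key))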

print('Plotting feature importances...')
ax = lgb.plot_importance(sk_best_gbm, max_num_features=15)
plt.show()

feature_imp = pd.DataFrame()
feature_imp["feature"] = list(x_train.columns)
feature_imp["Importance"] = sk_best_gbm.feature_importances_
feature_imp.to_clipboard()

y_pred_score = sk_best_gbm.predict_proba(x_test)[:,1]

roc_auc_score_list = []
f1_score_list = []
accuracy_score_list = []
precision_score_list = []
recall_score_list = []

thresholds = [0.4,0.5,0.6,0.7]
for threshold in thresholds:
    print("threshold is {}".format(threshold))
    y_pred = np.where(y_pred_score>=threshold, 1, 0)
    print(roc_auc_score(y_test,y_pred_score))
    print(f1_score(y_test,y_pred))
    print(accuracy_score(y_test,y_pred))
    print(precision_score(y_test,y_pred))
    print(recall_score(y_test,y_pred))

    roc_auc_score_list.append(roc_auc_score(y_test,y_pred_score))
    f1_score_list.append(f1_score(y_test,y_pred))
    accuracy_score_list.append(accuracy_score(y_test,y_pred))
    precision_score_list.append(precision_score(y_test,y_pred))
    recall_score_list.append(recall_score(y_test,y_pred))

performance_metrics = pd.DataFrame(
        {'thresholds':thresholds,
         'roc_auc_score':roc_auc_score_list,
         'f1_score':f1_score_list,
         'accuracy_score':accuracy_score_list,
         'precision_score':precision_score_list,
         'recall_score':recall_score_list })

performance_metrics.transpose().to_clipboard()
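
# Finally, comparing the raw score vectors directly (rather than thresholded
# metrics) makes the size of the disagreement between the two models explicit.
native_scores = best_gbm.predict(x_test)
sklearn_scores = sk_best_gbm.predict_proba(x_test)[:, 1]
print('max abs difference:', np.abs(native_scores - sklearn_scores).max())
print('identical to float precision:', np.allclose(native_scores, sklearn_scores))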