Python 3.x 在多个数据集上对多个模型进行多进程训练

我有多个数据集,想在每个数据集上训练多个模型。例如对 data_1 训练 RandomForest 和 GradientBoosting,然后对 data_2 做同样的训练。

问题:使用 multiprocessing 的 Pool/Process 并行训练模型,还是一次只循环训练一个模型,哪种方式最好、最高效?由于 scikit-learn 本身带有一定的内置多进程并行能力,机器学习模型是否应当避免再套一层 multiprocessing?

下面是我想用 Pool 实现的一个示例:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import multiprocessing as mp
from multiprocessing import Pool, cpu_count
import os
import time
import threading
import inspect
from sklearn.datasets import load_iris
def apply_parallel_training(*fns, dataset):
    """
    Train every function in ``fns`` on every dataset in ``dataset`` using a
    process pool and return the flattened list of results.

    Args:
        fns: Callables, each taking one dataset and returning one result
            (here: a DataFrame of predictions per model).
        dataset: Iterable of datasets; each one is passed to every function.

    Returns:
        Flat list with one result per (function, dataset) pair, ordered by
        function first, then dataset.
    """
    # Leave one core free, but never ask for a zero-sized pool on a
    # single-core machine (cpu_count()-1 could be 0 there).
    n_workers = max(1, mp.cpu_count() - 1)
    # The context manager guarantees the pool is terminated even if a
    # worker raises; dispatch all jobs first so they overlap, then collect.
    with Pool(n_workers) as pool:
        pending = [pool.map_async(fn, dataset) for fn in fns]
        per_fn_results = [job.get() for job in pending]
    # Each entry holds one result per dataset; flatten one level.
    return [result for results in per_fn_results for result in results]
def random_forest(x):
    """
    Run a randomized hyper-parameter search for a RandomForestClassifier on
    one dataset and return the resulting prediction DataFrame.
    """
    search_space = {
        "n_estimators": [500],
        "max_depth": [5, 7],
        "max_features": ["auto", "log2"],
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"],
    }
    # inspect.stack()[0][3] is this function's own name, recorded per row.
    return train_some_models(
        x=x,
        clf=RandomForestClassifier(),
        param_dist=search_space,
        n_iter=5,
        model_name=inspect.stack()[0][3],
    )
def gradient_boosting_tree(x):
    """
    Run a randomized hyper-parameter search for an XGBClassifier on one
    dataset and return the resulting prediction DataFrame.
    """
    search_space = {
        "n_estimators": [200],
        "learning_rate": [0.005, 0.01, 0.05, 0.1],
        "booster": ["gbtree"],
        "max_depth": [5, 7],
    }
    # inspect.stack()[0][3] is this function's own name, recorded per row.
    return train_some_models(
        x=x,
        clf=XGBClassifier(),
        param_dist=search_space,
        n_iter=5,
        model_name=inspect.stack()[0][3],
    )
def train_some_models(x, clf, param_dist, n_iter, model_name):
    """
    Train models on a specified dataset and predict for known labels.

    Args:
        x: DataFrame holding feature columns plus a "target" column and a
            "train_test_label" column marking each row "train" or "test".
        clf: Classifier (scikit-learn compatible estimator).
        param_dist: Specified model parameters for the randomized search.
        n_iter: Number of iterations to use in RandomizedCV.
        model_name: Name of model used for training.

    Returns:
        DataFrame with the prediction results per model: "y_class",
        "model_name" and the true test targets.
    """
    labels = x[["target", "train_test_label"]].copy()
    # column difference drops "target" (and sorts the remaining columns).
    features = x[x.columns.difference(["target"])]

    # Split on the indicator column, then drop it from every partition.
    X_train = features[features.train_test_label == "train"].drop(columns=["train_test_label"])
    X_test = features[features.train_test_label == "test"].drop(columns=["train_test_label"])
    y_train = labels[labels.train_test_label == "train"].drop(columns=["train_test_label"])
    y_test = labels[labels.train_test_label == "test"].drop(columns=["train_test_label"])

    search = RandomizedSearchCV(
        clf,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring="f1",
    )
    # ravel() flattens the single-column target frame to the 1-D shape fit expects.
    search.fit(np.array(X_train), np.array(y_train).ravel())

    predictions = pd.DataFrame({"y_class": search.predict(np.array(X_test))})
    predictions["model_name"] = model_name
    # Side-by-side: predicted class, model name, then the true targets.
    return pd.concat([predictions, pd.DataFrame(np.array(y_test))], axis=1)
if __name__ == '__main__':
    iris = load_iris()
    iris = pd.DataFrame(
        data=np.c_[iris["data"], iris["target"]],
        columns=iris["feature_names"] + ["target"],
    )
    # Make it a binary classification problem: class 2 vs. the rest.
    iris["target"] = np.where(iris["target"].values == 2, 1, 0)
    # Shuffle with a fixed seed so every split contains both classes
    # (contiguous slices of iris left rows 0-100 almost entirely class 0,
    # which makes an f1-scored search degenerate).
    iris = iris.sample(frac=1, random_state=0).reset_index(drop=True)
    # Create training and testing samples. iloc is end-exclusive, so the
    # boundary rows are not duplicated across train and test (.loc[0:70]
    # and .loc[70:100] both included row 70).
    iris_train_1 = iris.iloc[0:70, :].assign(train_test_label="train")
    iris_test_1 = iris.iloc[70:100, :].assign(train_test_label="test")
    iris_1 = pd.concat([iris_train_1, iris_test_1], axis=0)
    iris_train_2 = iris.iloc[100:130, :].assign(train_test_label="train")
    iris_test_2 = iris.iloc[130:150, :].assign(train_test_label="test")
    iris_2 = pd.concat([iris_train_2, iris_test_2], axis=0)
    # One entry per dataset; every model function is mapped over this list
    # by the multiprocessing pool.
    dataset = [iris_1, iris_2]
    # Start the training process.
    results = apply_parallel_training(random_forest, gradient_boosting_tree,
                                      dataset=dataset)
    print(pd.concat(results))
你能解决这个问题吗?