Python 如何在sklearn GridSearch管道中实现FeatureUnion和自定义估计器?
在将自定义估计器对象实现到Sklearn管道中时,我遇到了严重的困难。该管道由一组sklearn transformer对象组成,自定义估计器只是一个自定义类,应该能够规范化数据(将数据转换为正态分布) 因此,该算法的逻辑是将网格搜索应用到转换函数的管道上,以便最好地规范化给定的数据集 这里,我使用的是一个自定义损失函数(scipy.stats中的normality test),它返回的值介于[0:1]之间。此自定义损失函数用于拟合GridSearch。接近零值表示数据标准化较差(最坏情况),而接近1的值表示标准化良好(最佳情况) 下面是我的最佳尝试的代码片段(尽管仍然不起作用):Python 如何在sklearn GridSearch管道中实现FeatureUnion和自定义估计器?,python,scikit-learn,pipeline,gridsearchcv,Python,Scikit Learn,Pipeline,Gridsearchcv,在将自定义估计器对象实现到Sklearn管道中时,我遇到了严重的困难。该管道由一组sklearn transformer对象组成,自定义估计器只是一个自定义类,应该能够规范化数据(将数据转换为正态分布) 因此,该算法的逻辑是将网格搜索应用到转换函数的管道上,以便最好地规范化给定的数据集 这里,我使用的是一个自定义损失函数(scipy.stats中的normality test),它返回的值介于[0:1]之间。此自定义损失函数用于拟合GridSearch。接近零值表示数据标准化较差(最坏情况),而
将熊猫作为pd导入
将numpy作为np导入
从scipy导入统计信息
从sk学习导入预处理
从sklearn.pipeline导入功能联合,管道
从sklearn导入集_config
从sklearn.compose导入make_column_transformer
导入操作系统
从sklearn.model_选择导入GridSearchCV
从sklearn.metrics导入make_scorer
从scipy.stats导入normaltest
从sklearn.base导入BaseEstimator
def make_transform_管道(dir_to_save=None):
转换_list=[(“RobustScaler”,preprocessing.RobustScaler()),
(“分额变压器”,预处理。分额变压器(输出分布=正态),
n_分位数=10,
随机_状态=0)),
('yeo-johnson',preprocessing.PowerTransformer(method='yeo-johnson',standarding=False)),
('normalizer',preprocessing.normalizer(norm='l2'))
]
列_trans=生成列_transformer(*转换列表)
如果dir_to_save为无:
dir_to_save=os.getcwd()
pipeline\u filename=os.path.join(dir\u to\u save,'my\u estimator.html')
设置配置(display='diagram')
从sklearn.utils导入估计器\u html\u repr
将open(管道_文件名,'w')作为f:
f、 编写(估算器报告(列报告))
返回FeatureUnion(转换列表),转换列表
def检查数据分布是否为高斯分布(y_true=None,y_pred=None,alpha=0.05):
#零假设:x来自正态分布
如果isinstance(y_true,(pd.Series,pd.DataFrame)):
y_真=y_真值
zscore,pvalue=normaltest(y_true.flatte(),nan_policy='omit')
#如果pvalue<0.05,“可以拒绝无效假设”
返回pvalue
类数据分布检查(基本估计):
定义初始(自我、特征、联合、,
alpha=0.05,n_工作=无,
变压器列表=无,
变压器重量=无,
详细=错误):
self.feature\u union=feature\u union
super()。\uuuu init\uuuuu()
def get_参数(self,deep=True):
#此估计器具有特征联合属性
返回{“feature\u union”:self.feature\u union}
def set_参数(自身,**参数):
对于参数,参数.items()中的值:
setattr(自身、参数、值)
回归自我
def变换(自、x、y=无):
打印('Normalizer\u管道转换调用\n\n')
返回自特征联合变换(x)
def分数(自我、x、y=无,**kwargs):
返回自我。预测概率(x,y)
def predict_proba(self,x=None,y=None):
如果y:
如果数据分布为高斯分布,则返回检查(y真=y)
其他:
如果数据分布为高斯分布,则返回检查
def预测(自我,x):
如果isinstance(x,(pd.Series,pd.DataFrame)):
x=x.0个值
return stats.zscore(x.flatte(),nan_policy='omit')
def配合(自身、x、y=无):
回归自我
def fit_变换(self,x,y=None):
返回self.feature\u union.fit\u变换(x)
def fit_预测(自我,x):
y=自、特征、联合、拟合、变换(x)
返回自我预测(y)
如果“\uuuuu main\uuuuuuuuu”==\uuuuuuuuu name\uuuuuuuuuu:
X=pd.DataFrame(np.random.randn(20,5))
特征联合,转换列表=生成转换管道()
DDC=数据分布检查器(特征联合=特征联合)
功能\u联合\u与\u预测器=管道(步骤=[
(“变形金刚”,特征为“联合”),
(“估算器”,DDC)
])
参数网格=dict(变压器分位数变压器分位数=np.arange(10,np.size(X)//5),
变压器分位数变压器输出分布=[‘正常’、‘均匀’],
估计器特征联合=[特征联合])
分数参数=dict(α=0.05)
#在这里,我使用选项“越大的值越大=真”,因为正态性pvalue分数表示数据来自正态(高斯)分布,当它接近1时,否则接近零。
My_gaussian_scorer=make_scorer(检查_数据_是否为_gaussian分布,
越大越好=正确,
需要_proba=False,
**分数(参数)
格里夫海
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn import set_config
from sklearn.compose import make_column_transformer
import os
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import normaltest
from sklearn.base import BaseEstimator
def make_transform_pipeline(dir_to_save=None):
transformation_list = [("RobustScaler", preprocessing.RobustScaler()),
("QuantileTransformer", preprocessing.QuantileTransformer(output_distribution='normal',
n_quantiles=10,
random_state=0)),
('yeo-johnson', preprocessing.PowerTransformer(method='yeo-johnson', standardize=False)),
('normalizer', preprocessing.Normalizer(norm='l2') )
]
column_trans = make_column_transformer( *transformation_list)
if dir_to_save is None:
dir_to_save = os.getcwd()
pipeline_filename = os.path.join(dir_to_save, 'my_estimator.html')
set_config(display='diagram')
from sklearn.utils import estimator_html_repr
with open(pipeline_filename, 'w') as f:
f.write(estimator_html_repr(column_trans))
return FeatureUnion(transformation_list), transformation_list
def check_if_data_is_gaussian_distributed(y_true=None, y_pred=None, alpha=0.05):
# null hypothesis: x comes from a normal distribution
if isinstance(y_true, (pd.Series, pd.DataFrame)):
y_true = y_true.values
zscore , pvalue = normaltest(y_true.flatten(), nan_policy='omit')
# if pvalue < 0.05, "The null hypothesis can be rejected"
return pvalue
class Data_Distribution_checker(BaseEstimator):
def __init__(self, feature_union,
alpha=0.05, n_jobs=None,
transformer_list=None,
transformer_weights=None,
verbose=False):
self.feature_union = feature_union
super().__init__()
def get_params(self, deep=True):
# this estimator has the feature_union attribute
return {"feature_union": self.feature_union}
def set_params(self, **parameters):
for parameter, value in parameters.items():
setattr(self, parameter, value)
return self
def transform(self, x, y=None):
print( 'Normalizer_Pipeline transform called \n\n')
return self.feature_union.transform(x)
def score(self, x, y=None, **kwargs):
return self.predict_proba(x, y)
def predict_proba(self, x=None, y=None):
if y:
return check_if_data_is_gaussian_distributed(y_true=y)
else:
return check_if_data_is_gaussian_distributed(y_true=x)
def predict(self, x):
if isinstance(x, (pd.Series, pd.DataFrame)):
x = x.values
return stats.zscore(x.flatten(), nan_policy='omit')
def fit(self, x, y=None):
return self
def fit_transform(self, x, y=None):
return self.feature_union.fit_transform(x)
def fit_predict(self, x):
y = self.feature_union.fit_transform(x)
return self.predict(y)
if '__main__' == __name__:
X = pd.DataFrame(np.random.randn(20, 5))
Feature_Union, transformation_list = make_transform_pipeline()
DDC = Data_Distribution_checker(feature_union=Feature_Union)
Feature_Union_with_predictor = Pipeline(steps=[
('transformers', Feature_Union) ,
('estimator', DDC)
])
param_grid = dict(transformers__QuantileTransformer__n_quantiles=np.arange(10, np.size(X)//5),
transformers__QuantileTransformer__output_distribution=['normal', 'uniform'],
estimator__feature_union = [Feature_Union])
score_params = dict(alpha=0.05)
# here I use the option "greater_is_better=True", since the normality pvalue scores indicate that the data comes from a normal (gaussian) distribution when it is closer to 1, and closer to zero otherwise.
My_gaussian_scorer = make_scorer(check_if_data_is_gaussian_distributed,
greater_is_better=True,
needs_proba=False,
**score_params)
grid_search = GridSearchCV(Feature_Union_with_predictor,
param_grid=param_grid,
n_jobs=1,
scoring={'gaussian': My_gaussian_scorer},
refit='gaussian'
)
# here the code breaks (check erro message below)
search = grid_search.fit(X)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search .best_params_)
search = grid_search.fit(X)
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
return f(**kwargs)
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\model_selection\_search.py", line 736, in fit
self._run_search(evaluate_candidates)
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\model_selection\_search.py", line 1188, in _run_search
evaluate_candidates(ParameterGrid(self.param_grid))
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\model_selection\_search.py", line 708, in evaluate_candidates
out = parallel(delayed(_fit_and_score)(clone(base_estimator),
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\parallel.py", line 1029, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\parallel.py", line 847, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\parallel.py", line 765, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\_parallel_backends.py", line 572, in __init__
self.results = batch()
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\parallel.py", line 252, in __call__
return [func(*args, **kwargs)
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\parallel.py", line 252, in <listcomp>
return [func(*args, **kwargs)
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\model_selection\_validation.py", line 560, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer)
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\model_selection\_validation.py", line 605, in _score
scores = scorer(estimator, X_test)
File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\metrics\_scorer.py", line 87, in __call__
score = scorer._score(cached_call, estimator,
TypeError: _score() missing 1 required positional argument: 'y_true'