Python: How to implement FeatureUnion and a custom estimator in a sklearn GridSearch pipeline?

I'm having serious difficulty getting a custom estimator object to work inside a Sklearn pipeline. The pipeline consists of a set of sklearn transformer objects, and the custom estimator is simply a custom class that should be able to normalize the data (i.e. transform it towards a normal distribution).

The logic of the algorithm is therefore to run a grid search over the pipeline of transformation functions, so that a given dataset is normalized as well as possible.

Here I use a custom loss function (the normality test from scipy.stats), which returns a value in [0, 1]. This custom loss function is used when fitting the GridSearch: a value close to zero means the data is poorly normalized (worst case), while a value close to 1 means it is well normalized (best case).
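
To illustrate the scoring idea in isolation, here is a minimal sketch (the gaussian_pvalue name is only illustrative and is not the helper used in the full code below):

import numpy as np
from scipy.stats import normaltest
from sklearn.metrics import make_scorer

def gaussian_pvalue(y_true, y_pred=None):
    # D'Agostino-Pearson normality test: a p-value close to 1 means the data
    # is compatible with a normal distribution, close to 0 means it is not
    _, pvalue = normaltest(np.asarray(y_true).flatten(), nan_policy='omit')
    return pvalue

# wrap the p-value as a scorer, so a higher value counts as a better score
gaussian_scorer = make_scorer(gaussian_pvalue, greater_is_better=True)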

Below is a code snippet of my best attempt (which still does not work):

import pandas as pd
import numpy as np
from scipy import stats 
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn import set_config


from sklearn.compose import make_column_transformer
import os


from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer 
from scipy.stats import normaltest

from sklearn.base import BaseEstimator
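# builds the list of preprocessing transformers, writes an HTML diagram of them to disk,
# and returns them wrapped in a FeatureUnion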
def make_transform_pipeline(dir_to_save=None):
    
    
    transformation_list = [("RobustScaler", preprocessing.RobustScaler()),
                          ("QuantileTransformer",  preprocessing.QuantileTransformer(output_distribution='normal',
                                                                                     n_quantiles=10,
                                                                                     random_state=0)),
                           ('yeo-johnson', preprocessing.PowerTransformer(method='yeo-johnson', standardize=False)),
                           ('normalizer', preprocessing.Normalizer(norm='l2')  )
                           ]
    column_trans = make_column_transformer( *transformation_list)
        
    
    if dir_to_save is None:
        dir_to_save = os.getcwd()
    
    pipeline_filename = os.path.join(dir_to_save, 'my_estimator.html')
    
    set_config(display='diagram')   
    
    from sklearn.utils import estimator_html_repr
    with open(pipeline_filename, 'w') as f:  
        f.write(estimator_html_repr(column_trans))
        
    return FeatureUnion(transformation_list), transformation_list
def check_if_data_is_gaussian_distributed(y_true=None, y_pred=None, alpha=0.05):
    
    # null hypothesis: x comes from a normal distribution
    
    if isinstance(y_true, (pd.Series, pd.DataFrame)):
        y_true = y_true.values
        
        
    zscore , pvalue = normaltest(y_true.flatten(), nan_policy='omit')
    
    # if pvalue < 0.05, "The null hypothesis can be rejected"
    
    return pvalue
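# custom estimator wrapping the FeatureUnion: transform/fit_transform delegate to it,
# while score/predict_proba return the normality p-value of the data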
class Data_Distribution_checker(BaseEstimator):
    
    def __init__(self,  feature_union, 
                 alpha=0.05, n_jobs=None,
                 transformer_list=None,
                 transformer_weights=None,
                 verbose=False):
        
        self.feature_union = feature_union
        super().__init__()
        
    def get_params(self, deep=True):
        # this estimator has the feature_union attribute
        return {"feature_union": self.feature_union}
        
        
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
        
    def transform(self, x, y=None):
        
        print( 'Normalizer_Pipeline transform called \n\n')
        
        return self.feature_union.transform(x)
    
    def score(self, x, y=None, **kwargs):
        return self.predict_proba(x, y)
    
    def predict_proba(self, x=None, y=None):
        if y:
            return check_if_data_is_gaussian_distributed(y_true=y)
        else:
            return check_if_data_is_gaussian_distributed(y_true=x)
        
    
    def predict(self, x):
        
        if isinstance(x, (pd.Series, pd.DataFrame)):
            x = x.values
        
        return stats.zscore(x.flatten(), nan_policy='omit')
    
    def fit(self, x, y=None):
        
        return self
    
    def fit_transform(self, x, y=None):
        return self.feature_union.fit_transform(x)
    
    
    def fit_predict(self, x):
        y = self.feature_union.fit_transform(x)
        return self.predict(y)
        

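# demo: random data, pipeline of FeatureUnion + custom estimator,
# grid-searched with the normality p-value as the score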
if '__main__' == __name__:
    
      
    X = pd.DataFrame(np.random.randn(20, 5))
    
    
    Feature_Union, transformation_list = make_transform_pipeline()
    
    DDC = Data_Distribution_checker(feature_union=Feature_Union)
    
    Feature_Union_with_predictor = Pipeline(steps=[
        ('transformers', Feature_Union),
        ('estimator', DDC)
    ])
    
    
    param_grid = dict(transformers__QuantileTransformer__n_quantiles=np.arange(10, np.size(X)//5),
                      transformers__QuantileTransformer__output_distribution=['normal', 'uniform'],
                      estimator__feature_union = [Feature_Union])
    
    
    score_params = dict(alpha=0.05)
    

    # here I use the option "greater_is_better=True", since the normality pvalue scores indicate that the data comes from a normal (gaussian) distribution when it is closer to 1, and closer to zero otherwise.

    My_gaussian_scorer = make_scorer(check_if_data_is_gaussian_distributed,
                                        greater_is_better=True,
                                        needs_proba=False,
                                        **score_params)
    
    
    
    grid_search = GridSearchCV(Feature_Union_with_predictor, 
                               param_grid=param_grid, 
                               n_jobs=1,
                               scoring={'gaussian': My_gaussian_scorer},
                               refit='gaussian'
                               )
    
    # here the code breaks (check error message below)
    search  = grid_search.fit(X)            
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)

   search  = grid_search.fit(X)

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\model_selection\_search.py", line 736, in fit
    self._run_search(evaluate_candidates)

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\model_selection\_search.py", line 1188, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\model_selection\_search.py", line 708, in evaluate_candidates
    out = parallel(delayed(_fit_and_score)(clone(base_estimator),

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\parallel.py", line 1029, in __call__
    if self.dispatch_one_batch(iterator):

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\parallel.py", line 847, in dispatch_one_batch
    self._dispatch(tasks)

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\parallel.py", line 765, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\_parallel_backends.py", line 572, in __init__
    self.results = batch()

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\parallel.py", line 252, in __call__
    return [func(*args, **kwargs)

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\joblib\parallel.py", line 252, in <listcomp>
    return [func(*args, **kwargs)

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\model_selection\_validation.py", line 560, in _fit_and_score
    test_scores = _score(estimator, X_test, y_test, scorer)

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\model_selection\_validation.py", line 605, in _score
    scores = scorer(estimator, X_test)

  File "C:\Anaconda3\envs\Python_3.8\lib\site-packages\sklearn\metrics\_scorer.py", line 87, in __call__
    score = scorer._score(cached_call, estimator,

TypeError: _score() missing 1 required positional argument: 'y_true'