Python 自定义转换器和GridSearch-管道中的ValueError

Python 自定义转换器和GridSearch-管道中的ValueError,python,scikit-learn,grid-search,Python,Scikit Learn,Grid Search,我试图在scikit学习管道中优化超参数,使用一些定制的转换器,但我不断得到一个错误: from sklearn.model_selection import TimeSeriesSplit from sklearn.model_selection import GridSearchCV from sklearn.base import BaseEstimator, TransformerMixin from sklearn.pipeline import Pipeline class Rol

我试图在scikit学习管道中优化超参数,使用一些定制的转换器,但我不断得到一个错误:

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class RollingMeanTransform(BaseEstimator, TransformerMixin):

    def __init__(self, col, window=3):
        self._window = window
        self._col = col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df['{}_rolling_mean'.format(self._col)] = df[self._col].shift(1).rolling(self._window).mean().fillna(0.0)
        return df


class TimeEncoding(BaseEstimator, TransformerMixin):

    def __init__(self, col, drop_original=True):
        self._col = col 
        self._drop_original = drop_original

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        unique_vals = float(len(X[self._col].unique()))
        X['sin_{}'.format(self._col)] = np.sin(2 * np.pi * X[self._col] / unique_vals)
        X['cos_{}'.format(self._col)] = np.cos(2 * np.pi * X[self._col] / unique_vals)
        if self._drop_original:
            X.drop([self._col], axis=1, inplace=True, errors='ignore')
        return X


huber = HuberRegressor()
huber_max_iter = [100, 200, 500, 1000]
huber_alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100]
huber_epsilon = [1.15, 1.25, 1.35, 1.5]

huber_grid = {'clf__alpha':huber_alpha,
              'clf__epsilon':huber_epsilon,
              'clf__max_iter':huber_max_iter,
}

regression_pipeline = Pipeline([('encoding', TimeEncoding('my_col')),
                                ('mean', RollingMeanTransform('my_other_col')), 
                                ('select', Treshold()),
                                ('scale', Scale()),
                                ('clf', huber)
])
我尝试将其与以下内容相匹配:

grid = GridSearchCV(regression_pipeline, huber_grid, cv=TimeSeriesSplit(n_splits=5))
grid.fit(X_train, y_train)
但我得到以下解释:

ValueError                                Traceback (most recent call last)
<ipython-input-14-3949096c802a> in <module>()
----> 1 grid.fit(X_train, y_train)

~/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
    640 
    641         # if one choose to see train score, "out" will contain train score info

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627 

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590 

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

~/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    456             estimator.fit(X_train, **fit_params)
    457         else:
--> 458             estimator.fit(X_train, y_train, **fit_params)
    459 
    460     except Exception as e:

~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    246             This estimator
    247         """
--> 248         Xt, fit_params = self._fit(X, y, **fit_params)
    249         if self._final_estimator is not None:
    250             self._final_estimator.fit(Xt, y, **fit_params)

~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    211                 Xt, fitted_transformer = fit_transform_one_cached(
    212                     cloned_transformer, None, Xt, y,
--> 213                     **fit_params_steps[name])
    214                 # Replace the transformer of the step with the fitted
    215                 # transformer. This is necessary when loading the transformer

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/memory.py in __call__(self, *args, **kwargs)
    360 
    361     def __call__(self, *args, **kwargs):
--> 362         return self.func(*args, **kwargs)
    363 
    364     def call_and_shelve(self, *args, **kwargs):

~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
    579                        **fit_params):
    580     if hasattr(transformer, 'fit_transform'):
--> 581         res = transformer.fit_transform(X, y, **fit_params)
    582     else:
    583         res = transformer.fit(X, y, **fit_params).transform(X)

~/anaconda3/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    518         else:
    519             # fit method of arity 2 (supervised transformation)
--> 520             return self.fit(X, y, **fit_params).transform(X)
    521 
    522 

~/my_project/my_model.py in transform(self, X)
    126     def transform(self, X):
    127         X = X.copy()
--> 128         unique_vals = float(len(X[self._col].unique()))
    129         X['sin_{}'.format(self._col)] = np.sin(2 * np.pi * X[self._col] / unique_vals)
    130         X['cos_{}'.format(self._col)] = np.cos(2 * np.pi * X[self._col] / unique_vals)

~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2137             return self._getitem_multilevel(key)
   2138         else:
-> 2139             return self._getitem_column(key)
   2140 
   2141     def _getitem_column(self, key):

~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2144         # get column
   2145         if self.columns.is_unique:
-> 2146             return self._get_item_cache(key)
   2147 
   2148         # duplicate columns & possible reduce dimensionality

~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1840         res = cache.get(item)
   1841         if res is None:
-> 1842             values = self._data.get(item)
   1843             res = self._box_item_values(item, values)
   1844             cache[item] = res

~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3850                         loc = indexer.item()
   3851                     else:
-> 3852                         raise ValueError("cannot label index with a null key")
   3853 
   3854             return self.iget(loc, fastpath=fastpath)

ValueError: cannot label index with a null key
我得到了同样的错误,但这次调用了
mean
transformer

完整的代码示例:

from sklearn.linear_model import HuberRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np

class RollingMeanTransform(BaseEstimator, TransformerMixin):

    def __init__(self, col, window=3):
        self._window = window
        self._col = col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df['{}_rolling_mean'.format(self._col)] = df[self._col].shift(1).rolling(self._window).mean().fillna(0.0)
        return df


class TimeEncoding(BaseEstimator, TransformerMixin):

    def __init__(self, col, drop_original=True):
        self._col = col 
        self._drop_original = drop_original

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        unique_vals = float(len(X[self._col].unique()))
        X['sin_{}'.format(self._col)] = np.sin(2 * np.pi * X[self._col] / unique_vals)
        X['cos_{}'.format(self._col)] = np.cos(2 * np.pi * X[self._col] / unique_vals)
        if self._drop_original:
            X.drop([self._col], axis=1, inplace=True, errors='ignore')
        return X

class Treshold(BaseEstimator, TransformerMixin):
    # note: Threshold which removes features with constant value
    # and preserves the input data as data frame
    def __init__(self):

        self.to_keep = list()

    def fit(self, X, y=None):


        self.to_keep = list()

        self.colname_original = X.columns

        for i, col in enumerate(X):

            if len(np.unique(X.values[:, i])) >= 2:
                self.to_keep.append(col)

        return self

    def transform(self, X, copy=None):
        return X[self.to_keep]


class Scale(BaseEstimator, TransformerMixin):
    # note: scaler which keeps the input data as data frame
    # and does not scale binary features
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)

        self.bin_vars_index = list()
        self.cont_vars_index = list()

        self.colnames_original = list()

    def fit(self, X, y=None):

        self.bin_vars_index = list()
        self.cont_vars_index = list()

        self.colnames_original = list()

        self.colnames_original = X.columns

        for i in range(X.shape[1]):

            if len(np.unique(X.values[:, i])) <= 2:
                self.bin_vars_index.append(i)
            else:
                self.cont_vars_index.append(i)

        self.scaler.fit(X.values[:, self.cont_vars_index])
        return self

    def transform(self, X, copy=None):
        X_tail = self.scaler.transform(X.values[:, self.cont_vars_index], copy)
        res = np.concatenate((X.values[:, self.bin_vars_index], X_tail), axis=1)

        colnames_res = np.array(
            list(self.colnames_original[self.bin_vars_index]) + list(self.colnames_original[self.cont_vars_index]))
        assert len(colnames_res) == len(self.colnames_original)
        res = pd.DataFrame(data=res, columns=colnames_res)
        return res[[str(el) for el in self.colnames_original]].set_index(X.index)



huber = HuberRegressor()
huber_max_iter = [100, 200, 500, 1000]
huber_alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100]
huber_epsilon = [1.15, 1.25, 1.35, 1.5]

huber_grid = {'clf__alpha':huber_alpha,
              'clf__epsilon':huber_epsilon,
              'clf__max_iter':huber_max_iter,
}

regression_pipeline = Pipeline([('encoding', TimeEncoding('my_col')),
                                ('mean', RollingMeanTransform('my_other_col')), 
                                ('select', Treshold()),
                                ('scale', Scale()),
                                ('clf', huber)
])

grid = GridSearchCV(regression_pipeline, huber_grid, cv=TimeSeriesSplit(n_splits=5))

X = pd.DataFrame(np.random.randint(low=0, high=10, size=(20, 2)), columns=['my_col', 'my_other_col'])

y = pd.Series(np.random.randint(low=0, high=10, size=(20,)))

grid.fit(X, y)
来自sklearn.linear_模型导入HuberRegressor
从sklearn.model\u选择导入TimeSeriesSplit
从sklearn.model_选择导入GridSearchCV
来自sklearn.base导入BaseEstimator,TransformerMixin
从sklearn.pipeline导入管道
从sklearn.preprocessing导入StandardScaler
作为pd进口熊猫
将numpy作为np导入
类RollingMeansTransform(BaseEstimator,TransformerMixin):
定义初始化(self,col,window=3):
self.\u window=window
自我。_col=col
def配合(自身、X、y=无):
回归自我
def变换(自,X):
df=X.copy()
df[{}u rolling\u mean'.格式(self.\u col)]=df[self.\u col].shift(1).滚动(self.\u window).mean().fillna(0.0)
返回df
类时间编码(BaseEstimator,TransformerMixin):
定义初始值(self、col、drop\u original=True):
自我。_col=col
self.\u drop\u original=drop\u original
def配合(自身、X、y=无):
回归自我
def变换(自,X):
X=X.copy()
unique\u vals=float(len(X[self.\u col].unique())
X['sin.{}.format(self.\u col)]=np.sin(2*np.pi*X[self.\u col]/unique\u vals)
X['cos{}.格式(self.\u col)]=np.cos(2*np.pi*X[self.\u col]/unique\u vals)
如果是自投原件:
X.drop([self.\u col],axis=1,inplace=True,errors='ignore')
返回X
Treshold类(BaseEstimator,TransformerMixin):
#注意:删除具有常量值的功能的阈值
#并将输入数据保留为数据帧
定义初始化(自):
self.to_keep=list()
def配合(自身、X、y=无):
self.to_keep=list()
self.colname_original=X.columns
对于i,枚举(X)中的列:
如果len(np.unique(X.values[:,i])>=2:
self.to_keep.append(列)
回归自我
def转换(自、X、复制=无):
返回X[自我保留]
等级量表(基本估计器,TransformerMixin):
#注:将输入数据保留为数据帧的定标器
#并且不会缩放二进制特征
def uuu init uuuu(self,copy=True,with_mean=True,with_std=True):
self.scaler=StandardScaler(复制,带_-mean,带_-std)
self.bin\u vars\u index=list()
self.cont_vars_index=list()
self.colnames_original=list()
def配合(自身、X、y=无):
self.bin\u vars\u index=list()
self.cont_vars_index=list()
self.colnames_original=list()
self.colnames_original=X.columns
对于范围内的i(X.shape[1]):
如果len(np.unique(X.values[:,i])您会看到GridSearchCV(以及scikit learn中的大多数交叉验证实用程序)克隆提供的数据以执行网格搜索

在这样做时,他们将使用您继承的BaseEstimator类的。现在,
get_params()
将从您声明的
\uuu init\uuu()方法中获取参数

这是:

现在要获取值,请使用]():

因此,这将给出的参数是:

col = None
drop_original = None
不是您使用的带
前导下划线的。这两个值都为“无”,因为对象没有任何具有这些名称的属性

现在,这些无值参数将用于实例化克隆对象:

然后这些
None
值将设置为您的
\u col
\u drop\u original
。这就是错误的根源

这件事已记录在:

init接受的参数都应该是关键字参数 使用默认值。换句话说,用户应该能够 实例化估计器而不向其传递任何参数。这个 参数都应该对应于描述 模型或估计器试图解决的优化问题

此外,init接受的每个关键字参数 与实例上的属性相对应。Scikit学习依赖于 这是为了找到相关的属性,以设置一个估计时,这样做 型号选择

因此,解决此问题的建议方法是从参数名称中删除前导下划线(以便
\uuuu init\uuu
self
中的名称应相同):

现在对所有自定义估计器执行此操作

现在,如果您在使用属性的前导下划线时有一些限制(可能尝试将它们设置为私有或类似),那么第二个选项是重写
set_params()
方法以显式设置参数

您可以看到GridSearchCV(以及scikit learn中的大多数交叉验证实用程序)克隆提供的数据以执行网格搜索

在这样做时,他们将使用您继承的BaseEstimator类的。现在,
get_params()
将从您声明的
\uuu init\uuu()方法中获取参数

这是:

现在要获取值,请使用]():

因此,这将给出的参数是:

col = None
drop_original = None
不是您使用的带
前导下划线的。这两个值都为“无”,因为对象没有任何具有这些名称的属性

现在,这些无值参数将用于实例化克隆对象:

然后这些
None
值将设置为您的
\u col
\u drop\u original
。这就是错误的根源

这件事已记录在:

init接受的参数都应该是关键字参数 使用默认值。换句话说,用户
    for key in self._get_param_names():
        value = getattr(self, key, None)
col = None
drop_original = None
...
new_object = klass(**new_object_params)
...
...
class TimeEncoding(BaseEstimator, TransformerMixin):
    # Changed the names from _col to col
    def __init__(self, col, drop_original=True):
        self.col = col
        self.drop_original = drop_original

    def transform(self, X):
        X = X.copy()

        # Updated the names to be used
        unique_vals = float(len(X[self.col].unique()))
        X['sin_{}'.format(self.col)] = np.sin(2 * np.pi * X[self.col] / unique_vals)
        X['cos_{}'.format(self.col)] = np.cos(2 * np.pi * X[self.col] / unique_vals)
        if self.drop_original:
            X.drop([self.col], axis=1, inplace=True, errors='ignore')
        return X