Python sklearn转换管道和featureunion

Python sklearn转换管道和featureunion,python,machine-learning,scikit-learn,Python,Machine Learning,Scikit Learn,我在尝试运行以下代码时遇到问题。这是房价的机器学习问题 from sklearn.pipeline import FeatureUnion from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline from sklearn.base import BaseEstimator,TransformerMixin num_attributes=list(housing_num) cat

我在尝试运行以下代码时遇到问题。这是房价的机器学习问题

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin

# Column names routed to the numeric and categorical branches of the pipeline.
num_attributes = list(housing_num)
cat_attributes = ['ocean_proximity']

# Positional indices of the columns read by CombinedAttributesAdder.transform.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a fixed set of columns and pass on their ``.values``.

    Parameters
    ----------
    attribute_names : key(s) used to index the input in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.attribute_names].values``; ``y`` is ignored."""
        selected = X[self.attribute_names]
        return selected.values

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns derived from existing columns of a 2-D array.

    The source columns are located through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``household_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a third ratio column (bedrooms / rooms) is appended too.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with two (or three) ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, household_ix]
        per_household_rooms = rooms / households
        per_household_population = X[:, population_ix] / households

        if not self.add_bedrooms_per_room:
            return np.c_[X, per_household_rooms, per_household_population]

        per_room_bedrooms = X[:, bedrooms_ix] / rooms
        return np.c_[X, per_household_rooms, per_household_population,
                     per_room_bedrooms]


# Numeric branch: pick the numeric columns, impute with strategy "median",
# append the derived ratio columns, then standardise.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attributes)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scalar', StandardScaler()),
])

# Categorical branch: pick the categorical column and binarize it.
# NOTE(review): LabelBinarizer is the last step here; the traceback below
# shows Pipeline calling last_step.fit_transform(Xt, y, **fit_params), which
# is where the reported TypeError is raised.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attributes)),
    ('label_binarizer', LabelBinarizer()),
])

# Run both branches on the same input and join their outputs.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
当我尝试运行时出现错误:

housing_prepared = full_pipeline.fit_transform(housing)
错误如下所示:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-141-acd0fd68117b> in <module>()
----> 1 housing_prepared = full_pipeline.fit_transform(housing)

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit_transform(self, X, y, **fit_params)
    744             delayed(_fit_transform_one)(trans, weight, X, y,
    745                                         **fit_params)
--> 746             for name, trans, weight in self._iter())
    747 
    748         if not result:

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/pipeline.pyc in _fit_transform_one(transformer, weight, X, y, **fit_params)
    587                        **fit_params):
    588     if hasattr(transformer, 'fit_transform'):
--> 589         res = transformer.fit_transform(X, y, **fit_params)
    590     else:
    591         res = transformer.fit(X, y, **fit_params).transform(X)

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit_transform(self, X, y, **fit_params)
    290         Xt, fit_params = self._fit(X, y, **fit_params)
    291         if hasattr(last_step, 'fit_transform'):
--> 292             return last_step.fit_transform(Xt, y, **fit_params)
    293         elif last_step is None:
    294             return Xt

TypeError: fit_transform() takes exactly 2 arguments (3 given)
这些可以正确执行,但我得到的结果是形状为 (16512, 16) 的 numpy.ndarray,而
housing_prepared = full_pipeline.fit_transform(housing)
的预期结果应该是形状为 (16512, 17) 的 numpy.ndarray。这是我的第二个问题:为什么会出现这种差异?

housing 是一个形状为 (16512, 9) 的 DataFrame,只有 1 个分类特征和 8 个数值特征。


提前感谢。

看起来sklearn以另一种超出您预期的方式识别数据类型。确保数字标识为int。最简单的方法:使用“你的”帖子作者提供的数据

看起来sklearn以另一种超出您预期的方式识别数据类型。确保数字标识为int。最简单的方法:使用“你的”帖子作者提供的数据

我读这本书时也遇到了这个问题。在尝试了一系列变通方法(我觉得这是在浪费时间)之后,我放弃了,改为安装 scikit-learn v0.20 开发版:下载 wheel 文件并使用 pip 安装它。这样就可以使用专为处理这类问题而设计的 CategoricalEncoder 类。

我读这本书时也遇到了这个问题。在尝试了一系列变通方法(我觉得这是在浪费时间)之后,我放弃了,改为安装 scikit-learn v0.20 开发版:下载 wheel 文件并使用 pip 安装它。这样就可以使用专为处理这类问题而设计的 CategoricalEncoder 类。

我遇到了同样的问题,它是由缩进问题引起的,缩进问题不会总是抛出错误(请参阅)


如果您直接从书中复制代码,请确保代码缩进正确。

我遇到了同样的问题,它是由缩进问题引起的,缩进问题不会总是引发错误(请参阅)

如果您直接从书中复制代码,请确保代码缩进正确

  • TypeError:fit_transform() 只接受 2 个参数(给定了 3 个)
  • 为什么会出现这种错误

    回答:因为您使用的是LabelBinarizer(),它用于响应变量

    怎么办?:你有:

    • 改为使用OneHotEncoder()
    • 为LabelBinarizer编写自定义转换器
    • 使用支持您的代码的旧版本 sklearn
  • housing_prepared 的形状不同
  • 如果你使用的是同一份 housing 数据,那么你有 9 个预测变量(8 个数值型和 1 个分类型)。CombinedAttributesAdder() 又增加了 3 列,LabelBinarizer() 又增加了 5 列,因此它变成了 17 列
    请记住,sklearn.pipeline.FeatureUnion 会把多个 transformer 对象的结果拼接在一起

    手动执行这些步骤时,不会包含原始的 “ocean_proximity” 变量

    让我们看看它的实际行动:

    # Report the shape of the input before any transformation.
    print("housing_shape: ", housing.shape)

    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    # Step 1: keep only the numeric columns.
    DFS = DataFrameSelector(num_attribs)
    a1 = DFS.fit_transform(housing)

    print('Numerical variables_shape: ', a1.shape)

    # Step 2: impute with strategy 'median'.
    imputer = SimpleImputer(strategy='median')
    a2 = imputer.fit_transform(a1)

    a2.shape
    
    与 a1.shape 相同

    # Numeric branch (continued): append the ratio columns, then standardise.
    CAA = CombinedAttributesAdder()
    a3 = CAA.fit_transform(a2)  # added 3 variables
    SS = StandardScaler()
    a4 = SS.fit_transform(a3)
    print('Numerical variable shape after CAA: ', a4.shape, '\n')

    # Categorical branch: select the categorical column and binarize it.
    DFS2 = DataFrameSelector(cat_attribs)
    b1 = DFS2.fit_transform(housing)

    print("Categorical variables_shape: ", b1.shape)

    LB = LabelBinarizer()
    b2 = LB.fit_transform(b1)  # instead of one column now we have 5 columns
    print('categorical variable shape after LabelBinarization: ', b2.shape)

    new_features = pd.DataFrame(a4)
    new_features.shape

    # Column j of b2 corresponds to LB.classes_[j], which the fitted binarizer
    # keeps in sorted order. A hand-written label list in a different order
    # would attach the wrong category name to some indicator columns, so the
    # names are taken from the fitted binarizer instead.
    ocean_cat = list(LB.classes_)
    ocean_LabelBinarize = pd.DataFrame(b2, columns=ocean_cat)

    ocean_LabelBinarize

    housing_prepared_new = pd.concat([new_features, ocean_LabelBinarize],
                                     axis=1)

    print('Shape of new data prepared by above steps',
          housing_prepared_new.shape)
    
    4列增加

    # Show the binarized categorical columns.
    print(b2)

    # Join the numeric and categorical arrays column-wise.
    result = np.concatenate((a4, b2), axis=1)
    print('final shape: ', result.shape, '\n')  # Final shape
    
    注意:转换列(结果为a4)和二值化列(结果为b2)尚未添加到原始数据帧中。 为此,需要将numpy数组b2转换为数据帧

    # Numeric branch (continued): append the ratio columns, then standardise.
    CAA = CombinedAttributesAdder()
    a3 = CAA.fit_transform(a2)  # added 3 variables
    SS = StandardScaler()
    a4 = SS.fit_transform(a3)
    print('Numerical variable shape after CAA: ', a4.shape, '\n')

    # Categorical branch: select the categorical column and binarize it.
    DFS2 = DataFrameSelector(cat_attribs)
    b1 = DFS2.fit_transform(housing)

    print("Categorical variables_shape: ", b1.shape)

    LB = LabelBinarizer()
    b2 = LB.fit_transform(b1)  # instead of one column now we have 5 columns
    print('categorical variable shape after LabelBinarization: ', b2.shape)

    new_features = pd.DataFrame(a4)
    new_features.shape

    # Column j of b2 corresponds to LB.classes_[j], which the fitted binarizer
    # keeps in sorted order. A hand-written label list in a different order
    # would attach the wrong category name to some indicator columns, so the
    # names are taken from the fitted binarizer instead.
    ocean_cat = list(LB.classes_)
    ocean_LabelBinarize = pd.DataFrame(b2, columns=ocean_cat)

    ocean_LabelBinarize

    housing_prepared_new = pd.concat([new_features, ocean_LabelBinarize],
                                     axis=1)

    print('Shape of new data prepared by above steps',
          housing_prepared_new.shape)

    new_features = pd.DataFrame(a4)
    new_features.shape
    ocean_cat = ['
    
  • TypeError:fit_transform() 只接受 2 个参数(给定了 3 个)
  • 为什么会出现这种错误

    回答:因为您使用的是LabelBinarizer(),它用于响应变量

    怎么办?:你有:

    • 改为使用OneHotEncoder()
    • 为LabelBinarizer编写自定义转换器
    • 使用支持您的代码的旧版本 sklearn
  • housing_prepared 的形状不同
  • 如果你使用的是同一份 housing 数据,那么你有 9 个预测变量(8 个数值型和 1 个分类型)。CombinedAttributesAdder() 又增加了 3 列,LabelBinarizer() 又增加了 5 列,因此它变成了 17 列
    请记住,sklearn.pipeline.FeatureUnion 会把多个 transformer 对象的结果拼接在一起

    手动执行这些步骤时,不会包含原始的 “ocean_proximity” 变量

    让我们看看它的实际行动:

    # Report the shape of the input before any transformation.
    print("housing_shape: ", housing.shape)

    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    # Step 1: keep only the numeric columns.
    DFS = DataFrameSelector(num_attribs)
    a1 = DFS.fit_transform(housing)

    print('Numerical variables_shape: ', a1.shape)

    # Step 2: impute with strategy 'median'.
    imputer = SimpleImputer(strategy='median')
    a2 = imputer.fit_transform(a1)

    a2.shape
    
    与 a1.shape 相同

    # Numeric branch (continued): append the ratio columns, then standardise.
    CAA = CombinedAttributesAdder()
    a3 = CAA.fit_transform(a2)  # added 3 variables
    SS = StandardScaler()
    a4 = SS.fit_transform(a3)
    print('Numerical variable shape after CAA: ', a4.shape, '\n')

    # Categorical branch: select the categorical column and binarize it.
    DFS2 = DataFrameSelector(cat_attribs)
    b1 = DFS2.fit_transform(housing)

    print("Categorical variables_shape: ", b1.shape)

    LB = LabelBinarizer()
    b2 = LB.fit_transform(b1)  # instead of one column now we have 5 columns
    print('categorical variable shape after LabelBinarization: ', b2.shape)

    new_features = pd.DataFrame(a4)
    new_features.shape

    # Column j of b2 corresponds to LB.classes_[j], which the fitted binarizer
    # keeps in sorted order. A hand-written label list in a different order
    # would attach the wrong category name to some indicator columns, so the
    # names are taken from the fitted binarizer instead.
    ocean_cat = list(LB.classes_)
    ocean_LabelBinarize = pd.DataFrame(b2, columns=ocean_cat)

    ocean_LabelBinarize

    housing_prepared_new = pd.concat([new_features, ocean_LabelBinarize],
                                     axis=1)

    print('Shape of new data prepared by above steps',
          housing_prepared_new.shape)
    
    4列增加

    # Show the binarized categorical columns.
    print(b2)

    # Join the numeric and categorical arrays column-wise.
    result = np.concatenate((a4, b2), axis=1)
    print('final shape: ', result.shape, '\n')  # Final shape
    
    注意:转换列(结果为a4)和二值化列(结果为b2)尚未添加到原始数据帧中。 为此,需要将numpy数组b2转换为数据帧

    # Numeric branch (continued): append the ratio columns, then standardise.
    CAA = CombinedAttributesAdder()
    a3 = CAA.fit_transform(a2)  # added 3 variables
    SS = StandardScaler()
    a4 = SS.fit_transform(a3)
    print('Numerical variable shape after CAA: ', a4.shape, '\n')

    # Categorical branch: select the categorical column and binarize it.
    DFS2 = DataFrameSelector(cat_attribs)
    b1 = DFS2.fit_transform(housing)

    print("Categorical variables_shape: ", b1.shape)

    LB = LabelBinarizer()
    b2 = LB.fit_transform(b1)  # instead of one column now we have 5 columns
    print('categorical variable shape after LabelBinarization: ', b2.shape)

    new_features = pd.DataFrame(a4)
    new_features.shape

    # Column j of b2 corresponds to LB.classes_[j], which the fitted binarizer
    # keeps in sorted order. A hand-written label list in a different order
    # would attach the wrong category name to some indicator columns, so the
    # names are taken from the fitted binarizer instead.
    ocean_cat = list(LB.classes_)
    ocean_LabelBinarize = pd.DataFrame(b2, columns=ocean_cat)

    ocean_LabelBinarize

    housing_prepared_new = pd.concat([new_features, ocean_LabelBinarize],
                                     axis=1)

    print('Shape of new data prepared by above steps',
          housing_prepared_new.shape)

    new_features = pd.DataFrame(a4)
    new_features.shape
    
    ocean_cat = ['

    第一个错误是由 LabelBinarizer 引起的:它只接受一个输入 y,但在管道中 X 和 y 都会被传给它。请分享数据,我可以提供帮助。
    @VivekKumar 这是链接,是 housing 数据:
    为什么你认为结果应该有 17 列而不是 16 列?
    @VivekKumar 实际上我也认为应该是 16 列。但这其实是教科书上的一个示例,代码是书里的。他们可以成功运行我无法运行的代码,并得到 17 列的结果,这是我无法理解的。

    第一个错误是由 LabelBinarizer 引起的:它只接受一个输入 y,但在管道中 X 和 y 都会被传给它。请分享数据,我可以提供帮助。
    @VivekKumar 这是链接,是 housing 数据:
    为什么你认为结果应该有 17 列而不是 16 列?
    @VivekKumar 实际上我也认为应该是 16 列。但这其实是教科书上的一个示例,代码是书里的。他们可以成功运行我无法运行的代码,并得到 17 列的结果,这是我无法理解的。