Python sklearn.compose.ColumnTransformer:fit_transform()接受2个位置参数,但给出了3个

Python sklearn.compose.ColumnTransformer:fit_transform()接受2个位置参数,但给出了3个,python,machine-learning,scikit-learn,Python,Machine Learning,Scikit Learn,我正在研究一个使用ColumnTransformer和LabelEncoder对著名的泰坦尼克号数据集X进行预处理的示例: Age Embarked Fare Sex 0 22.0 S 7.2500 male 1 38.0 C 71.2833 female 2 26.0 S 7.9250 female 3 35.0 S 53.1000 female 4 35.0 S

我正在研究一个使用
ColumnTransformer
LabelEncoder
对著名的泰坦尼克号数据集
X
进行预处理的示例:

    Age Embarked    Fare    Sex
0   22.0    S      7.2500   male
1   38.0    C      71.2833  female
2   26.0    S      7.9250   female
3   35.0    S      53.1000  female
4   35.0    S      8.0500   male
这样调用转换器(transformer):

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
ColumnTransformer(
    transformers=[
        ("label-encode categorical", LabelEncoder(), ["Sex", "Embarked"])
    ]
).fit(X).transform(X)
结果:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-54-fd5a05b7e47e> in <module>
      4         ("label-encode categorical", LabelEncoder(), ["Sex", "Embarked"])
      5     ]
----> 6 ).fit(X).transform(X)

~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit(self, X, y)
    418         # we use fit_transform to make sure to set sparse_output_ (for which we
    419         # need the transformed data) to have consistent output type in predict
--> 420         self.fit_transform(X, y=y)
    421         return self
    422 

~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
    447         self._validate_remainder(X)
    448 
--> 449         result = self._fit_transform(X, y, _fit_transform_one)
    450 
    451         if not result:

~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
    391                               _get_column(X, column), y, weight)
    392                 for _, trans, column, weight in self._iter(
--> 393                     fitted=fitted, replace_strings=True))
    394         except ValueError as e:
    395             if "Expected 2D array, got 1D array instead" in str(e):

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    915             # remaining jobs.
    916             self._iterating = False
--> 917             if self.dispatch_one_batch(iterator):
    918                 self._iterating = self._original_iterator is not None
    919 

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
    612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
    613     if hasattr(transformer, 'fit_transform'):
--> 614         res = transformer.fit_transform(X, y, **fit_params)
    615     else:
    616         res = transformer.fit(X, y, **fit_params).transform(X)

TypeError: fit_transform() takes 2 positional arguments but 3 were given
---------------------------------------------------------------------------
TypeError回溯(最近一次调用上次)
在里面
4(“标签编码分类”,LabelEncoder(),[“性别”,“装载”])
5     ]
---->6).拟合(X).变换(X)
~/anaconda3/lib/python3.7/site-packages/sklearn/compose//u column\u transformer.py合适(self,X,y)
418#我们使用fit_变换来确保设置稀疏输出(我们为其
419(需要转换的数据)在预测中具有一致的输出类型
-->420自拟合_变换(X,y=y)
421返回自我
422
~/anaconda3/lib/python3.7/site-packages/sklearn/compose//u column\u transformer.py in fit\u transform(self,X,y)
447自验证余数(X)
448
-->449结果=自拟合变换(X,y,拟合变换)
450
451如果没有结果:
~/anaconda3/lib/python3.7/site-packages/sklearn/compose//u column\u transformer.py in\u fit\u transform(self、X、y、func、fitted)
391 _get_列(X,列),y,重量)
392用于运输、柱、自重(
-->393已安装=已安装,更换(正确)
394除e值错误外:
395如果str(e)中的“预期2D数组,改为1D数组”:
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in\uuuu调用(self,iterable)
915#剩余工作。
916自迭代=错误
-->917如果自行调度一批(迭代器):
918 self.\u iterating=self.\u original\u iterator不是None
919
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py在dispatch\u one\u批处理中(self,迭代器)
757返回错误
758其他:
-->759自我派遣(任务)
760返回真值
761
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in_dispatch(self,batch)
714带自锁:
715作业idx=len(自作业)
-->716作业=self.\u后端.apply\u异步(批处理,回调=cb)
717#一个作业完成的速度比它的回调速度要快
718#在我们到达这里之前打电话,导致self.#你的工作
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib//\u parallel\u backends.py in apply\u async(self、func、callback)
180 def apply_async(self、func、callback=None):
181“计划要运行的func”
-->182结果=立即结果(func)
183如果回调:
184回调(结果)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib//u parallel\u backends.py in\uuuuu init\uuuuuu(self,batch)
547#不要延迟应用程序,以免保留输入
548#内存中的参数
-->549 self.results=batch()
550
551 def get(自我):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in\uuuuu调用(self)
223具有并行_后端(self._后端,n_作业=self._n_作业):
224返回[func(*args,**kwargs)
-->225用于自身项目中的func、ARG、kwargs]
226
227定义长度(自):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in(.0)
223具有并行_后端(self._后端,n_作业=self._n_作业):
224返回[func(*args,**kwargs)
-->225用于自身项目中的func、ARG、kwargs]
226
227定义长度(自):
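~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
    612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
    613     if hasattr(transformer, 'fit_transform'):
--> 614         res = transformer.fit_transform(X, y, **fit_params)
    615     else:
    616         res = transformer.fit(X, y, **fit_params).transform(X)

TypeError: fit_transform() takes 2 positional arguments but 3 were given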

此处的
**fit_params
有什么问题?对我来说,这看起来像是
sklearn
中的一个bug,或者至少是不兼容。

我认为这实际上是
LabelEncoder
的一个问题。
LabelEncoder.fit
方法只接受
self 和
y
作为参数(这很奇怪,因为大多数transformer对象都有
fit(X, y=None, **fit_params)
)。无论如何,在管道中,无论您传递了什么,都会使用
fit_params
调用转换器。在这种特殊情况下,传递给
LabelEncoder.fit
的确切参数是
X
和一个空字典
{}
。从而引发了该错误。


从我的观点来看,这是LabelEncoder中的一个bug,但是你应该向sklearn的人了解这一点,因为他们可能有一些理由以不同的方式实现
fit
方法。

这不符合你的目的,主要有两个原因

  • LabelEncoder()
    设计用于目标变量(y)。当
    ColumnTransformer()
    尝试馈送
    X,y=None,fit_params={}
    时,这就是获取位置参数错误的原因
  • 来自文档:

    使用0到n_classes-1之间的值对标签进行编码

    fit(y)
    拟合标签编码器

    参数:
    y:形状为 (n_samples,) 的类数组(array-like)
    目标值

  • 即使你做了一个解决方案来移除emp