Python sklearn.compose.ColumnTransformer:fit_transform()接受2个位置参数,但给出了3个
标签:python, machine-learning, scikit-learn
我正在研究一个使用 ColumnTransformer 和 LabelEncoder 对著名的泰坦尼克号数据集 X 进行预处理的示例:
Age Embarked Fare Sex
0 22.0 S 7.2500 male
1 38.0 C 71.2833 female
2 26.0 S 7.9250 female
3 35.0 S 53.1000 female
4 35.0 S 8.0500 male
这样调用转换器(transformer):
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
ColumnTransformer(
transformers=[
("label-encode categorical", LabelEncoder(), ["Sex", "Embarked"])
]
).fit(X).transform(X)
结果:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-54-fd5a05b7e47e> in <module>
4 ("label-encode categorical", LabelEncoder(), ["Sex", "Embarked"])
5 ]
----> 6 ).fit(X).transform(X)
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit(self, X, y)
418 # we use fit_transform to make sure to set sparse_output_ (for which we
419 # need the transformed data) to have consistent output type in predict
--> 420 self.fit_transform(X, y=y)
421 return self
422
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
447 self._validate_remainder(X)
448
--> 449 result = self._fit_transform(X, y, _fit_transform_one)
450
451 if not result:
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
391 _get_column(X, column), y, weight)
392 for _, trans, column, weight in self._iter(
--> 393 fitted=fitted, replace_strings=True))
394 except ValueError as e:
395 if "Expected 2D array, got 1D array instead" in str(e):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
TypeError: fit_transform() takes 2 positional arguments but 3 were given
---------------------------------------------------------------------------
TypeError Traceback(最近一次调用在最后)
<ipython-input-54-fd5a05b7e47e> in <module>
4 ("label-encode categorical", LabelEncoder(), ["Sex", "Embarked"])
5 ]
----> 6 ).fit(X).transform(X)
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit(self, X, y)
418 # 我们使用 fit_transform 来确保设置 sparse_output_(为此我们
419 # 需要转换后的数据),以便在 predict 中得到一致的输出类型
--> 420 self.fit_transform(X, y=y)
421 return self
422
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
447 self._validate_remainder(X)
448
--> 449 result = self._fit_transform(X, y, _fit_transform_one)
450
451 if not result:
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
391 _get_column(X, column), y, weight)
392 for _, trans, column, weight in self._iter(
--> 393 fitted=fitted, replace_strings=True))
394 except ValueError as e:
395 if "Expected 2D array, got 1D array instead" in str(e):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
915 # 剩余的作业。
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # 作业可能完成得非常快,以至于它的回调在
718 # 我们到达这里之前就已被调用,导致 self._jobs
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
547 # 不要延迟执行,以避免将输入
548 # 参数保留在内存中
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
TypeError: fit_transform() 接受 2 个位置参数,但给出了 3 个
此处的 **fit_params 有什么问题?对我来说,这看起来像是 sklearn 中的一个 bug,或者至少是一处不兼容。
我认为这实际上是 LabelEncoder 的问题。LabelEncoder.fit 方法只接受 self 和 y 作为参数(这很奇怪,因为大多数 transformer 对象的签名都是 fit(X, y=None, **fit_params))。无论如何,在管道中,无论你传入了什么,转换器都会连同 fit_params 一起被调用。在这种特殊情况下,传递给 LabelEncoder.fit 的确切参数是 X 和一个空字典 {},从而引发了该错误。
在我看来,这是 LabelEncoder 中的一个 bug,不过你应该向 sklearn 的开发者确认这一点,因为他们可能有某些理由以不同的方式实现 fit 方法。
这样做达不到你的目的,主要有两个原因:
LabelEncoder() 是为目标变量(y)设计的。当 ColumnTransformer() 尝试传入 X, y=None, fit_params={} 时,就会出现位置参数错误。LabelEncoder.fit 的文档如下:
拟合标签编码器(Fit label encoder)
参数:
y:形状为 (n_samples,) 的类数组(array-like)
目标值