Python 如何在sklearn类中更新fit方法?
标签:python, python-3.x, object, machine-learning, scikit-learn。我最近开始使用 GridSearchCV,对面向对象编程还不太熟悉。问题:我的一个预测变量存在缺失值,同时我有一组算法,想让每个算法配合相应的参数网格一起使用。我想知道是否可以创建一个自定义类,在拟合数据之前用不同的方式插补缺失值,并把插补方式与所用算法的各种参数组合一起进行搜索。这可行吗?非常感谢你的帮助。下面是我创建的一个类,用于插补房价数据集中 sqft 列的缺失值:
class Impute_sqft():
    """Impute the missing ``sqft`` values of the housing data set.

    The class reads several module-level names that are defined elsewhere
    in the script: ``houses_dummies`` (a DataFrame with ``sqft``,
    ``final_price`` and ``type_<name>`` indicator columns), ``predictors``
    (the columns returned to the caller), ``houses_types`` (the ``<name>``
    suffixes of the indicator columns) and ``RandomForestRegressor``.
    """

    def __init__(self, how='random forest'):
        """Remember the strategy: ``'random forest'``, ``'knn'`` or ``'mean'``."""
        self.how = how

    def impute(self):
        """Return ``houses_dummies[predictors]`` with missing ``sqft`` filled.

        The global frame is copied first, so it is never modified.

        Raises:
            ValueError: if ``self.how`` is not a supported strategy.
        """
        strategies = {
            'random forest': self._impute_random_forest,
            'knn': self._impute_knn,
            'mean': self._impute_mean,
        }
        # Validate before touching any data so a typo fails loudly instead
        # of silently returning None.
        if self.how not in strategies:
            raise ValueError(
                f"unknown imputation method {self.how!r}; "
                f"expected one of {sorted(strategies)}"
            )
        return strategies[self.how](houses_dummies.copy())

    @staticmethod
    def _impute_random_forest(data):
        """Predict missing ``sqft`` with a random forest fit on the known rows."""
        missing = data['sqft'].isna()
        # Nothing to predict: calling predict() on an empty frame would raise.
        if missing.any():
            features = data.drop(columns=['sqft', 'final_price'])
            rf = RandomForestRegressor()
            rf.fit(features[~missing], data.loc[~missing, 'sqft'])
            data.loc[missing, 'sqft'] = rf.predict(features[missing])
        return data[predictors]

    @staticmethod
    def _impute_knn(data):
        """Fill missing values with impyute's ``fast_knn`` (k=30)."""
        import sys
        from impyute.imputation.cs import fast_knn

        # Raises the Python interpreter's recursion limit before fast_knn
        # runs; presumably fast_knn recurses deeply on this data - confirm.
        sys.setrecursionlimit(100000)
        knn_n = 30
        source = data[predictors]
        result = fast_knn(source, k=knn_n)
        result.columns = source.columns
        return result

    @staticmethod
    def _impute_mean(data):
        """Fill missing ``sqft`` with the mean ``sqft`` of the same house type."""
        for house_type in houses_types:
            of_type = data['type_' + house_type] == 1
            statistic = data.loc[of_type, 'sqft'].mean(skipna=True)
            # Re-evaluate the missing mask on every pass so a row already
            # filled for an earlier type is not overwritten.
            data.loc[of_type & data['sqft'].isna(), 'sqft'] = statistic
        return data[predictors]
我目前用于 GridSearchCV 的参数网格:
# Baseline grid: random-forest hyper-parameters only.
param_grid = [dict(
    bootstrap=[True, False],
    n_estimators=[3, 10],
    max_features=[2, 3, 4],
)]
# Same grid with the sqft imputation strategy added as a searchable entry;
# this assignment replaces the baseline grid above.
param_grid = [dict(
    param_grid[0],
    sqft_imputer=['random forest', 'knn', 'mean'],
)]
我希望用于 GridSearchCV 的参数网格:
# Baseline grid: random-forest hyper-parameters only.
param_grid = [dict(
    bootstrap=[True, False],
    n_estimators=[3, 10],
    max_features=[2, 3, 4],
)]
# Same grid with the sqft imputation strategy added as a searchable entry;
# this assignment replaces the baseline grid above.
param_grid = [dict(
    param_grid[0],
    sqft_imputer=['random forest', 'knn', 'mean'],
)]
您需要创建一个自定义的转换器(transformer),并在管道(Pipeline)中把它放在估计器之前使用。
自定义转换器:
请查看模板(原文此处为链接):
几点意见:
- 您需要从 `BaseEstimator` 继承;`self.how` 的选择必须在类的 `__init__` 方法中定义。
- 为了保证正确性,您需要在拟合(fit)阶段估计统计量,并在变换(transform)阶段应用这些统计量。(另见原文链接)
管道
要把自定义插补器和估计器串联起来,可以使用 scikit-learn 的 `Pipeline`:
- 例如,请参见原文链接中的示例:它把 `PCA` 和 `LogisticRegression` 串联成一个管道,然后在 `GridSearchCV` 中使用该管道。
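- 您需要按管道中各步骤的名称修改参数网格的键(格式为 `步骤名__参数名`),例如:
param_grid = [{
    'estimator__bootstrap': [True, False],
    'estimator__n_estimators': [3, 10],
    'estimator__max_features': [2, 3, 4],
    'imputer__how': ['random forest', 'knn', 'mean']
}]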
您想要的是创建一个自定义的转换器(transformer),并在管道(Pipeline)中把它放在估计器之前使用。
自定义转换器:
请查看模板(原文此处为链接):
几点意见:
- 您需要从 `BaseEstimator` 继承;`self.how` 的选择必须在类的 `__init__` 方法中定义。
- 为了保证正确性,您需要在拟合(fit)阶段估计统计量,并在变换(transform)阶段应用这些统计量。(另见原文链接)
管道
要把自定义插补器和估计器串联起来,可以使用 scikit-learn 的 `Pipeline`:
- 例如,请参见原文链接中的示例:它把 `PCA` 和 `LogisticRegression` 串联成一个管道,然后在 `GridSearchCV` 中使用该管道。
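- 您需要按管道中各步骤的名称修改参数网格的键(格式为 `步骤名__参数名`),例如:
param_grid = [{
    'estimator__bootstrap': [True, False],
    'estimator__n_estimators': [3, 10],
    'estimator__max_features': [2, 3, 4],
    'imputer__how': ['random forest', 'knn', 'mean']
}]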
我按照上面的答案进行了修改,现在一切正常:
from sklearn.base import BaseEstimator, TransformerMixin
class Impute_sqft(BaseEstimator, TransformerMixin):
    """Transformer that fills missing ``sqft`` values before an estimator.

    ``how`` selects the strategy: ``'random forest'``, ``'knn'``, ``'mice'``,
    ``'mean'``, ``'median'`` or ``'mode'``.

    The strategies read module-level names defined elsewhere in the script:
    ``houses_dummies``, ``houses_edited``, ``predictors`` and
    ``RandomForestRegressor``.

    NOTE(review): nothing is learned in ``fit``; every strategy computes its
    values from the global frames at transform time, so rows outside the
    current cross-validation fold influence the imputed values.
    """

    def __init__(self, how='random forest'):
        """Store the strategy name unchanged; it is validated in ``transform``."""
        self.how = how

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return the ``predictors`` columns for the rows of ``X``, imputed.

        Raises:
            ValueError: if ``self.how`` is not a supported strategy.
        """
        if self.how == 'random forest':
            result = self._random_forest(X)
        elif self.how == 'knn':
            # Imported lazily so impyute is only needed by the strategies
            # that actually use it.
            from impyute.imputation.cs import fast_knn
            result = self._with_impyute(X, fast_knn, k=30)
        elif self.how == 'mice':
            from impyute.imputation.cs import mice
            result = self._with_impyute(X, mice)
        elif self.how == 'mean':
            result = self._group_fill(
                X, lambda g: g.fillna(g.mean(skipna=True)))
        elif self.how == 'median':
            result = self._group_fill(
                X, lambda g: g.fillna(g.median(skipna=True)))
        elif self.how == 'mode':
            result = self._group_fill(X, self._fill_mode)
        else:
            # Previously an unknown name returned the data un-imputed.
            raise ValueError(f"unknown imputation method {self.how!r}")
        return result[predictors]

    @staticmethod
    def _random_forest(X):
        """Predict missing ``sqft`` in ``X`` with a forest fit on ``houses_dummies``."""
        result = X.copy()
        missing = result['sqft'].isna()
        # Skip when nothing is missing: predict() on an empty frame raises.
        if missing.any():
            known = houses_dummies.dropna(subset=['sqft'])
            rf = RandomForestRegressor()
            rf.fit(known.drop(columns=['sqft', 'final_price']), known['sqft'])
            to_predict = result.loc[missing].drop(columns=['sqft'])
            result.loc[missing, 'sqft'] = rf.predict(to_predict)
        return result

    @staticmethod
    def _with_impyute(X, imputer, **kwargs):
        """Run an impyute imputer on ``houses_dummies`` and keep the rows of ``X``."""
        import sys

        # Raises the Python interpreter's recursion limit (not an OS limit)
        # before the impyute routine runs.
        sys.setrecursionlimit(100000)
        source = houses_dummies[predictors]
        result = imputer(source, **kwargs)
        # Restore the labels of the source frame on the imputed result.
        result.columns = source.columns
        result.index = source.index
        return result.loc[X.index, :]

    @staticmethod
    def _group_fill(X, fill):
        """Fill ``sqft`` per ``type`` group of ``houses_edited`` using ``fill``."""
        result = X.copy()
        # Assignment aligns on the index, so only rows present in X are kept.
        result['sqft'] = houses_edited.groupby('type')['sqft'].transform(fill)
        return result

    @staticmethod
    def _fill_mode(group):
        """Fill NaNs with the group's first mode; leave all-NaN groups as is."""
        modes = group.mode()
        if modes.empty:
            return group
        return group.fillna(modes.iloc[0])
我按照上面的答案进行了修改,现在一切正常:
from sklearn.base import BaseEstimator, TransformerMixin
class Impute_sqft(BaseEstimator, TransformerMixin):
    """Transformer that fills missing ``sqft`` values before an estimator.

    ``how`` selects the strategy: ``'random forest'``, ``'knn'``, ``'mice'``,
    ``'mean'``, ``'median'`` or ``'mode'``.

    The strategies read module-level names defined elsewhere in the script:
    ``houses_dummies``, ``houses_edited``, ``predictors`` and
    ``RandomForestRegressor``.

    NOTE(review): nothing is learned in ``fit``; every strategy computes its
    values from the global frames at transform time, so rows outside the
    current cross-validation fold influence the imputed values.
    """

    def __init__(self, how='random forest'):
        """Store the strategy name unchanged; it is validated in ``transform``."""
        self.how = how

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return the ``predictors`` columns for the rows of ``X``, imputed.

        Raises:
            ValueError: if ``self.how`` is not a supported strategy.
        """
        if self.how == 'random forest':
            result = self._random_forest(X)
        elif self.how == 'knn':
            # Imported lazily so impyute is only needed by the strategies
            # that actually use it.
            from impyute.imputation.cs import fast_knn
            result = self._with_impyute(X, fast_knn, k=30)
        elif self.how == 'mice':
            from impyute.imputation.cs import mice
            result = self._with_impyute(X, mice)
        elif self.how == 'mean':
            result = self._group_fill(
                X, lambda g: g.fillna(g.mean(skipna=True)))
        elif self.how == 'median':
            result = self._group_fill(
                X, lambda g: g.fillna(g.median(skipna=True)))
        elif self.how == 'mode':
            result = self._group_fill(X, self._fill_mode)
        else:
            # Previously an unknown name returned the data un-imputed.
            raise ValueError(f"unknown imputation method {self.how!r}")
        return result[predictors]

    @staticmethod
    def _random_forest(X):
        """Predict missing ``sqft`` in ``X`` with a forest fit on ``houses_dummies``."""
        result = X.copy()
        missing = result['sqft'].isna()
        # Skip when nothing is missing: predict() on an empty frame raises.
        if missing.any():
            known = houses_dummies.dropna(subset=['sqft'])
            rf = RandomForestRegressor()
            rf.fit(known.drop(columns=['sqft', 'final_price']), known['sqft'])
            to_predict = result.loc[missing].drop(columns=['sqft'])
            result.loc[missing, 'sqft'] = rf.predict(to_predict)
        return result

    @staticmethod
    def _with_impyute(X, imputer, **kwargs):
        """Run an impyute imputer on ``houses_dummies`` and keep the rows of ``X``."""
        import sys

        # Raises the Python interpreter's recursion limit (not an OS limit)
        # before the impyute routine runs.
        sys.setrecursionlimit(100000)
        source = houses_dummies[predictors]
        result = imputer(source, **kwargs)
        # Restore the labels of the source frame on the imputed result.
        result.columns = source.columns
        result.index = source.index
        return result.loc[X.index, :]

    @staticmethod
    def _group_fill(X, fill):
        """Fill ``sqft`` per ``type`` group of ``houses_edited`` using ``fill``."""
        result = X.copy()
        # Assignment aligns on the index, so only rows present in X are kept.
        result['sqft'] = houses_edited.groupby('type')['sqft'].transform(fill)
        return result

    @staticmethod
    def _fill_mode(group):
        """Fill NaNs with the group's first mode; leave all-NaN groups as is."""
        modes = group.mode()
        if modes.empty:
            return group
        return group.fillna(modes.iloc[0])