Python 将FeatureUnion输出转换为DictVectorizer的字典

Python 将FeatureUnion输出转换为DictVectorizer的字典,python,pandas,scikit-learn,sklearn-pandas,Python,Pandas,Scikit Learn,Sklearn Pandas,我试图从datacamp教程中重新创建管道,但管道出现了问题。我相信我遇到的问题是将FeatureUnion的输出转换为DictVectorizer的字典。当我运行下面的代码时,代码失败,因为没有定义BaseEstimator、TransformerMixin。任何关于我哪里出错的指导都将不胜感激。


import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn_pandas import CategoricalImputer
from sklearn_pandas import DataFrameMapper

# Column names for the kidney dataset.
# NOTE(review): this list literal is cut off after its first entry in this
# copy of the snippet -- restore the remaining names before running.
kidney_feature_names = ['age',

# Load the dataset into a DataFrame.
# NOTE(review): the read_csv call is also cut off (empty path, remaining
# arguments and closing parenthesis missing) -- confirm against the original.
kidney_data = pd.read_csv("",

# Convert 'pcv', 'wc' and 'rc' to numeric. errors='coerce' turns values that
# cannot be parsed into NaN instead of raising.
kidney_data['pcv'] = pd.to_numeric(kidney_data['pcv'], errors='coerce')
kidney_data['wc'] = pd.to_numeric(kidney_data['wc'], errors='coerce')
kidney_data['rc'] = pd.to_numeric(kidney_data['rc'], errors='coerce')

# Features are every column except the last; the last column is the target.
X = kidney_data.iloc[:, :-1]
y = kidney_data.iloc[:, -1]

# Count of missing values in each feature column.
nulls_per_column = X.isna().sum()

# True for each column whose dtype is object; those are treated as categorical.
categorical_feature_mask = X.dtypes == object

# Names of the categorical columns (mask is True).
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Names of the remaining, non-categorical columns (mask is False).
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Build a DataFrameMapper with one median SimpleImputer per non-categorical
# column. Each column is selected as a one-element list ([name]) --
# presumably so the imputer receives 2-D input; confirm against the
# sklearn-pandas documentation.
# NOTE(review): the call is cut off after the comprehension in this copy
# (any remaining keyword arguments and the closing parenthesis are missing).
numeric_imputation_mapper = DataFrameMapper(
                                        [([numeric_feature], SimpleImputer(strategy="median")) for 
numeric_feature in non_categorical_columns],

# Build a DataFrameMapper with one CategoricalImputer per categorical column.
# Here each column is selected by its bare name rather than a list.
# NOTE(review): this call is cut off in the same way as the one above.
categorical_imputation_mapper = DataFrameMapper(
                                            [(category_feature, CategoricalImputer()) for 
category_feature in categorical_columns],

# Combine the numeric and categorical imputation mappers. FeatureUnion applies
# each named transformer to the same input and concatenates their outputs
# side by side. The transformer list and the call are closed explicitly here;
# they were left open, which is a syntax error.
numeric_categorical_union = FeatureUnion([
    ("num_mapper", numeric_imputation_mapper),
    ("cat_mapper", categorical_imputation_mapper),
])

# Custom transformer to convert Pandas DataFrame into Dict (needed for DictVectorizer)
class Dictifier(BaseEstimator, TransformerMixin):
    """Turn tabular data into a list of per-row dicts for ``DictVectorizer``.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only reshapes its input.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn from ``X``."""
        return self

    def transform(self, X):
        """Return ``X`` as a list with one ``{column: value}`` dict per row.

        A DataFrame is converted directly. Any other input (for example the
        array produced by ``FeatureUnion``) is wrapped in a DataFrame first,
        in which case the dict keys are the integer column positions.
        """
        # FeatureUnion stacks its transformers' outputs into an array, which
        # has no ``to_dict`` method, so normalise to a DataFrame first.
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X.to_dict('records')

# Create the full pipeline: impute -> rows as dicts -> vectorize -> classify.
# Every step is its own (name, estimator) tuple. The comma after the
# 'dictifier' step and the closing brackets were missing, so the list was
# neither well-formed nor terminated.
pipeline = Pipeline([
    ("featureunion", numeric_categorical_union),
    ("dictifier", Dictifier()),
    ("vectorizer", DictVectorizer(sort=False)),
    ("clf", xgb.XGBClassifier(max_depth=3)),
])

# Score the whole pipeline with 3-fold cross-validation using ROC AUC.
cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)


from sklearn.base import BaseEstimator, TransformerMixin


# Import modules
import pandas as pd
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb

# Column names for the kidney data, in file order. The last entry, 'label',
# is the column later used as the target.
kidney_cols = ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
               'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm',
               'cad', 'appet', 'pe', 'ane', 'label']

# Load the dataset into df_kidney, naming the columns from kidney_cols.
# NOTE(review): the read_csv call is cut off in this copy (remaining keyword
# arguments and the closing parenthesis are missing) -- restore before running.
df_kidney = pd.read_csv('chronic_kidney_disease.csv', names=kidney_cols,

# Encode the target as integers: 'ckd' -> 0, 'notckd' -> 1. The result is
# assigned back to the column instead of calling replace(..., inplace=True)
# on the selected Series: with pandas Copy-on-Write that chained in-place
# call modifies a temporary object and leaves df_kidney unchanged.
df_kidney['label'] = df_kidney['label'].replace({'ckd': 0, 'notckd': 1})

# Features: every column except the last. Target: the 'label' column's values.
X = df_kidney.iloc[:, :-1]
y = df_kidney['label'].values

# Desired feature order for X.
col_order = [
    'age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot',
    'hemo', 'pcv', 'wc', 'rc', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm',
    'cad', 'appet', 'pe', 'ane',
]

# Select the columns of X in that order.
X = X.loc[:, col_order]

# Boolean Series indexed by column name: True where the dtype is object,
# i.e. the column is treated as categorical.
categorical_feature_mask = X.dtypes == object

# Column names where the mask is True (categorical).
categorical_columns = (
    categorical_feature_mask[categorical_feature_mask].index.tolist()
)

# Column names where the mask is False (non-categorical).
non_categorical_columns = (
    categorical_feature_mask[~categorical_feature_mask].index.tolist()
)

# Accumulates the (column selector, transformer list) entries that are
# handed to DataFrameMapper below.
transformers = []

# One entry per non-categorical column: median imputation followed by
# standard scaling. The column is selected as a one-element list ([name]).
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22; `sklearn.impute.SimpleImputer` is its replacement --
# confirm which scikit-learn version this snippet targets.
transformers.extend([([num_col], [Imputer(strategy='median'),
                                                 StandardScaler()]) for num_col
                    in non_categorical_columns])

# One entry per categorical column: a CategoricalImputer, with the column
# selected by its bare name.
# NOTE(review): this statement is cut off after "in" in this copy (the
# iterable and closing brackets are missing) -- presumably it iterates over
# categorical_columns; confirm against the original.
transformers.extend([(cat_col, [CategoricalImputer()]) for cat_col in

# Build a single DataFrameMapper from all collected entries.
# NOTE(review): the call is cut off after input_df=True (remaining keyword
# arguments and the closing parenthesis are missing).
numeric_categorical_union = DataFrameMapper(transformers, input_df=True,

# Define Dictifier class to turn df into dictionary as part of pipeline
class Dictifier(BaseEstimator, TransformerMixin):
    """Pipeline step that converts a DataFrame into a list of row dicts.

    It carries no learned state; it only reshapes the data it is given so
    that the following step can consume one dict per row.
    """

    def fit(self, X, y=None):
        """Do nothing and return this transformer."""
        return self

    def transform(self, X):
        """Return ``X`` as one ``{column: value}`` dict per row."""
        row_dicts = X.to_dict(orient='records')
        return row_dicts

# Chain the steps: column mapper -> row dicts -> DictVectorizer -> XGBoost.
pipeline = Pipeline(
    steps=[
        ('featureunion', numeric_categorical_union),
        ('dictifier', Dictifier()),
        ('vectorizer', DictVectorizer(sort=False)),
        ('clf', xgb.XGBClassifier(max_depth=3)),
    ],
)

# Evaluate the pipeline with 3-fold cross-validation, scored by ROC AUC.
cross_val_scores = cross_val_score(
    pipeline,
    X,
    y,
    scoring='roc_auc',
    cv=3,
)