Python 完整的sklearn管道示例
我正在尝试使用 sklearn 管道。我在网上尝试了各种各样的教程,但都没有帮助。
标签:python、pandas、scikit-learn、pipeline
import pandas as pd
import numpy as np
import json
import seaborn as sb
from sklearn.metrics import log_loss
from sklearn import linear_model
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from scipy.stats import zscore
from Transformers import TextTransformer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline
# Load the training data, forcing `description` to be parsed as text.
df = pd.read_json('data/train.json', encoding='utf-8', dtype={'description': str})
len(df)
# Keep only the text feature and the target column.
df = df[['description', 'interest_level']]
from sklearn.pipeline import Pipeline, FeatureUnion
a = TextTransformer('description', max_features=50)
b = TextTransformer('features', max_features=10)
pipeline = Pipeline([
    ('description', a),  # can pass in either a pipeline
    # ('features', b)  # or a transformer
    ('clf', SVC()),  # classifier
])
# NOTE: this call is kept exactly as posted -- it is the statement that raises
# the `TypeError: unhashable type` shown in the traceback that follows.
pipeline.fit(df[:,'interest_level'])
我的文本转换器
from sklearn.base import BaseEstimator, TransformerMixin
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
class TextTransformer(BaseEstimator, TransformerMixin):
    """Turn one text column of a DataFrame into a dense term-frequency matrix.

    Each document has its HTML markup stripped, is tokenized with NLTK,
    has digit-only tokens collapsed into ``_NUM_`` and is then vectorized
    with ``TfidfVectorizer(use_idf=False)``.

    Parameters
    ----------
    column : str
        Name of the DataFrame column that holds the raw text.
    max_features : int, default=5000
        Vocabulary size limit forwarded to ``TfidfVectorizer``.
    """

    def __init__(self, column, max_features=5000):
        # Constructor arguments are stored unmodified under their own names so
        # that BaseEstimator.get_params() / clone() (used by GridSearchCV) work.
        self.column = column
        self.max_features = max_features
        self.tfidfVectorizer = self._build_vectorizer()
        self._vectorizer = None

    def _build_vectorizer(self):
        """Return an unfitted vectorizer configured from the current parameters."""
        return TfidfVectorizer(use_idf=False, stop_words='english',
                               tokenizer=self._custom_tokenizer, analyzer='word',
                               max_features=self.max_features)

    def _custom_tokenizer(self, string):
        """Tokenize *string*, keeping alphabetic tokens and the ``_NUM_`` marker."""
        tokens = nltk.word_tokenize(string)
        # Digit-only tokens are replaced by a single placeholder.
        cleaned = ('_NUM_' if tok.isdigit() else tok for tok in tokens)
        # Tokens are returned as text: str(tok.encode('utf-8')) would produce
        # "b'...'" strings on Python 3.
        return [tok for tok in cleaned if tok.isalpha() or tok == '_NUM_']

    def _clean_html_tags(self, content):
        """Return the text of *content* with HTML markup removed."""
        return BeautifulSoup(content, 'lxml').text

    def _clean_column(self, df):
        """Return the configured column of *df* with HTML stripped from every row."""
        return df[self.column].apply(self._clean_html_tags)

    def fit(self, df, y=None):
        """Learn the vocabulary from ``df[column]``.

        ``y`` is accepted and ignored so the transformer can be used inside a
        ``Pipeline``, which passes the target to every step.
        """
        # Rebuild from the current parameters so set_params() takes effect.
        self.tfidfVectorizer = self._build_vectorizer()
        self._vectorizer = self.tfidfVectorizer.fit(self._clean_column(df))
        return self

    def transform(self, df):
        """Vectorize ``df[column]`` and return a dense 2-D ``numpy.ndarray``.

        The same HTML cleaning used in ``fit`` is applied here. ``toarray()``
        is used instead of ``todense()`` because the latter returns
        ``numpy.matrix``, which recent scikit-learn estimators reject.
        """
        return self._vectorizer.transform(self._clean_column(df)).toarray()
然而,我似乎不能把它弄对。它在ipython笔记本中不断抛出此异常
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-11-b3788282dc5c> in <module>()
8 ('clf', SVC()) # classifier
9 ])
---> 10 pipeline.fit(df[:,'interest_level'])
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
2057 return self._getitem_multilevel(key)
2058 else:
-> 2059 return self._getitem_column(key)
2060
2061 def _getitem_column(self, key):
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
2064 # get column
2065 if self.columns.is_unique:
-> 2066 return self._get_item_cache(key)
2067
2068 # duplicate columns & possible reduce dimensionality
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
1382 """Return the cached item, item represents a label indexer."""
1383 cache = self._item_cache
-> 1384 res = cache.get(item)
1385 if res is None:
1386 values = self._data.get(item)
TypeError: unhashable type
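interest_level 将是我的目标变量。
您只传入了一列 df[:, 'interest_level'],但您的第一步转换器 a(TextTransformer)需要访问 description 列。此外,df[:, 'interest_level'] 会把一个包含切片的元组当作列名传给 DataFrame.__getitem__,这正是回溯中 "unhashable type" 的来源;应把完整的 DataFrame 作为 X、把目标列作为 y 传入,例如 pipeline.fit(df, df['interest_level'])。
使用装饰器(decorator)编写管道要容易得多。您的代码如下所示: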
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): `SKTransform` is not defined in this snippet; it is assumed to
# wrap a plain function into a scikit-learn transformer -- confirm against the
# decorator's own documentation.
@SKTransform
def clean_num(txt):
    """Replace every run of digits in *txt* with the ``_NUM_`` placeholder."""
    return re.sub(r'\d+', '_NUM_', txt)


@SKTransform
def clean_tags(content):
    """Strip HTML markup from *content* and return the plain text."""
    return BeautifulSoup(content, 'lxml').text


# Pipeline takes (name, estimator) pairs, not bare estimators.
# `max_features` must be defined before this point (the question uses 50).
ppl = Pipeline([
    ('clean_tags', clean_tags),
    ('clean_num', clean_num),
    ('tfidf', TfidfVectorizer(use_idf=False, stop_words='english',
                              tokenizer=nltk.word_tokenize, analyzer='word',
                              max_features=max_features)),
])
使用装饰器(decorator)编写管道要容易得多。您的代码如下所示:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): `SKTransform` is not defined in this snippet; it is assumed to
# wrap a plain function into a scikit-learn transformer -- confirm against the
# decorator's own documentation.
@SKTransform
def clean_num(txt):
    """Replace every run of digits in *txt* with the ``_NUM_`` placeholder."""
    return re.sub(r'\d+', '_NUM_', txt)


@SKTransform
def clean_tags(content):
    """Strip HTML markup from *content* and return the plain text."""
    return BeautifulSoup(content, 'lxml').text


# Pipeline takes (name, estimator) pairs, not bare estimators.
# `max_features` must be defined before this point (the question uses 50).
ppl = Pipeline([
    ('clean_tags', clean_tags),
    ('clean_num', clean_num),
    ('tfidf', TfidfVectorizer(use_idf=False, stop_words='english',
                              tokenizer=nltk.word_tokenize, analyzer='word',
                              max_features=max_features)),
])
适用于各种特征类型(如类别型、名义型、有序型等)的 sklearn 管道简单示例
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Encoder and scaler shared by the pipelines below.
# `sparse_output=False` makes the encoder return a dense array (the parameter
# was named `sparse` before scikit-learn 1.2).
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
std = StandardScaler()

# Fill missing values with each column's most frequent value.
imp_mean = SimpleImputer(strategy='most_frequent')
X_dense_data = imp_mean.fit_transform(X)
# Rebuild the DataFrame with the original column labels and row index so the
# imputed rows stay aligned with anything indexed like the original X.
X = pd.DataFrame(X_dense_data, columns=X.columns, index=X.index)

# Which columns go through which transformer.
ohe_column_catagorical_feature = ['race', 'sex', 'age group']
std_column_numeric_feature = ['height', 'weight', 'temperature', 'blood glucose']

# Numeric feature transformer: standardize the values.
feature_numeric_transformer = Pipeline(steps=[
    ('scaler_data', std),
])

# Categorical feature transformer: one-hot encode the values.
catagorical_numeric_transformer = Pipeline(steps=[
    ('onehot', ohe),
])

# Column transformer applying each pipeline to its own columns; columns that
# are not listed are dropped.
preprocessor_feature = ColumnTransformer(
    transformers=[
        ('num', feature_numeric_transformer, std_column_numeric_feature),
        ('cat', catagorical_numeric_transformer, ohe_column_catagorical_feature),
    ],
    remainder='drop',
)
请确保数据中没有缺失值。如果有,下面是使用 sklearn 的 SimpleImputer 填充缺失值的示例。
插补策略(strategy):
如果为 "mean",则使用每列的平均值替换缺失值。只能用于数值数据。
# Fill missing values with the mean of each column (numeric data only).
imp_mean = SimpleImputer(strategy='mean')
X_dense_data = imp_mean.fit_transform(X)
如果为 "median",则使用每列的中位数替换缺失值。只能用于数值数据。
# Fill missing values with the median of each column (numeric data only).
imp_mean = SimpleImputer(strategy='median')
X_dense_data = imp_mean.fit_transform(X)
如果为 "most_frequent",则使用每列中出现最频繁的值替换缺失值。可用于字符串或数值数据。
# Fill missing values with the most frequent value of each column.
imp_mean = SimpleImputer(strategy='most_frequent')
X_dense_data = imp_mean.fit_transform(X)
如果为 "constant",则用 fill_value 替换缺失值。可用于字符串或数值数据。
# Fill missing values with a constant. With fill_value left at its default,
# numeric data is filled with 0 and string/object data with "missing_value";
# pass fill_value=... to choose another constant.
imp_mean = SimpleImputer(strategy='constant')
X_dense_data = imp_mean.fit_transform(X)
适用于各种特征类型(如类别型、名义型、有序型等)的 sklearn 管道简单示例
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Encoder and scaler shared by the pipelines below.
# `sparse_output=False` makes the encoder return a dense array (the parameter
# was named `sparse` before scikit-learn 1.2).
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
std = StandardScaler()

# Fill missing values with each column's most frequent value.
imp_mean = SimpleImputer(strategy='most_frequent')
X_dense_data = imp_mean.fit_transform(X)
# Rebuild the DataFrame with the original column labels and row index so the
# imputed rows stay aligned with anything indexed like the original X.
X = pd.DataFrame(X_dense_data, columns=X.columns, index=X.index)

# Which columns go through which transformer.
ohe_column_catagorical_feature = ['race', 'sex', 'age group']
std_column_numeric_feature = ['height', 'weight', 'temperature', 'blood glucose']

# Numeric feature transformer: standardize the values.
feature_numeric_transformer = Pipeline(steps=[
    ('scaler_data', std),
])

# Categorical feature transformer: one-hot encode the values.
catagorical_numeric_transformer = Pipeline(steps=[
    ('onehot', ohe),
])

# Column transformer applying each pipeline to its own columns; columns that
# are not listed are dropped.
preprocessor_feature = ColumnTransformer(
    transformers=[
        ('num', feature_numeric_transformer, std_column_numeric_feature),
        ('cat', catagorical_numeric_transformer, ohe_column_catagorical_feature),
    ],
    remainder='drop',
)
请确保数据中没有缺失值。如果有,下面是使用 sklearn 的 SimpleImputer 填充缺失值的示例。
插补策略(strategy):
如果为 "mean",则使用每列的平均值替换缺失值。只能用于数值数据。
# Fill missing values with the mean of each column (numeric data only).
imp_mean = SimpleImputer(strategy='mean')
X_dense_data = imp_mean.fit_transform(X)
如果为 "median",则使用每列的中位数替换缺失值。只能用于数值数据。
# Fill missing values with the median of each column (numeric data only).
imp_mean = SimpleImputer(strategy='median')
X_dense_data = imp_mean.fit_transform(X)
如果为 "most_frequent",则使用每列中出现最频繁的值替换缺失值。可用于字符串或数值数据。
# Fill missing values with the most frequent value of each column.
imp_mean = SimpleImputer(strategy='most_frequent')
X_dense_data = imp_mean.fit_transform(X)
如果为 "constant",则用 fill_value 替换缺失值。可用于字符串或数值数据。
# Fill missing values with a constant. With fill_value left at its default,
# numeric data is filled with 0 and string/object data with "missing_value";
# pass fill_value=... to choose another constant.
imp_mean = SimpleImputer(strategy='constant')
X_dense_data = imp_mean.fit_transform(X)
那我该怎么写呢?你还需要帮助吗?