Python RandomForestRegressor: Input contains NaN, infinity or a value too large for dtype('float32') on Kaggle Learn

Tags: python, pandas, scikit-learn, random-forest, kaggle

While working through step 5 of "Exercise: Categorical Variables" on Kaggle Learn, I get
ValueError: Input contains NaN, infinity or a value too large for dtype('float32')
during the prediction phase on the test set.

The full Jupyter notebook is available; the complete code used is reported at the end of the post.

The goal of the code is to prepare the dataset for the submission.

The problem lies in the preprocessing of the X_test dataset, which contains the test set. First I used a SimpleImputer with the most_frequent strategy, then I applied one-hot encoding to the categorical variables of the dataset.

I found that some features have different data types between the X_train (and X_valid) datasets and X_test. Specifically, the columns ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea'] are of type int64 in the training data (X_train and X_valid), while in the test data they are float64. I thought the problem might be here, but I could not solve it. I tried to cast the values with the following block:

# normalize datatypes columns
#for colName in  ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']:
#    OH_X_train[colName] = OH_X_train[colName].astype('float64')
#    OH_X_valid[colName] = OH_X_train[colName].astype('float64')
but it did not work. Any suggestions?
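For reference, a quick way to see which shared columns differ in dtype between the training and test frames (a small diagnostic, not part of the original notebook; it uses the X_train and X_test frames defined in the full code below):

# List the columns present in both frames whose dtypes differ between train and test.
shared_cols = [col for col in X_train.columns if col in X_test.columns]
mismatched = [col for col in shared_cols if X_train[col].dtype != X_test[col].dtype]

print(mismatched)
print(X_train[mismatched].dtypes)
print(X_test[mismatched].dtypes)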

#### DATASETS LOAD ####
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
X = pd.read_csv('../input/train.csv', index_col='Id') 
X_test = pd.read_csv('../input/test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)

# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()] 
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

#### IMPUTATION OF MISSING VALUES FOR X_TEST ####
from sklearn.impute import SimpleImputer

# All categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# Fill in the lines below: imputation
my_imputer = SimpleImputer(strategy='most_frequent')
imputed_X_test = pd.DataFrame(my_imputer.fit_transform(X_test))

# Fill in the lines below: imputation removed column names; put them back
imputed_X_test.columns = X_test.columns

#### ONEHOT ENCODING FOR DATA #####
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(imputed_X_test[low_cardinality_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
OH_cols_test.index = X_test.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

##### BUILD MODEL AND CREATE SUBMISSION ####
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# normalize datatypes columns
#for colName in  ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']:
#    OH_X_train[colName] = OH_X_train[colName].astype('float64')
#    OH_X_valid[colName] = OH_X_train[colName].astype('float64')

# Build model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(OH_X_train, y_train)
preds_test = model.predict(OH_X_test)

# Save test predictions to file
#output = pd.DataFrame({'Id': OH_X_test.index,
#                       'SalePrice': preds_test})
#output.to_csv('submission.csv', index=False)
Below is the full error log:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-2-2d85be0f6b26> in <module>
     74 model = RandomForestRegressor(n_estimators=100, random_state=0)
     75 model.fit(OH_X_train, y_train)
---> 76 preds_test = model.predict(OH_X_test)
     77 
     78 # Save test predictions to file

/opt/conda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in predict(self, X)
    691         check_is_fitted(self, 'estimators_')
    692         # Check data
--> 693         X = self._validate_X_predict(X)
    694 
    695         # Assign chunk of trees to jobs

/opt/conda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in _validate_X_predict(self, X)
    357                                  "call `fit` before exploiting the model.")
    358 
--> 359         return self.estimators_[0]._validate_X_predict(X, check_input=True)
    360 
    361     @property

/opt/conda/lib/python3.6/site-packages/sklearn/tree/tree.py in _validate_X_predict(self, X, check_input)
    389         """Validate X whenever one tries to predict, apply, predict_proba"""
    390         if check_input:
--> 391             X = check_array(X, dtype=DTYPE, accept_sparse="csr")
    392             if issparse(X) and (X.indices.dtype != np.intc or
    393                                 X.indptr.dtype != np.intc):

/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    540         if force_all_finite:
    541             _assert_all_finite(array,
--> 542                                allow_nan=force_all_finite == 'allow-nan')
    543 
    544     if ensure_min_samples > 0:

/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan)
     54                 not allow_nan and not np.isfinite(X).all()):
     55             type_err = 'infinity' if allow_nan else 'NaN, infinity'
---> 56             raise ValueError(msg_err.format(type_err, X.dtype))
     57     # for object dtype data, we only check for NaNs (GH-13254)
     58     elif X.dtype == np.dtype('object') and not allow_nan:

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
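The dtype difference mentioned in the question is most likely a symptom rather than the cause: those columns are int64 in the training data but float64 in the test data because the test set still contains missing values in them, and pandas upcasts an integer column to float as soon as it holds a NaN. A minimal illustration of this behaviour (added here for clarity, not part of the original post):

# Toy example: an integer column becomes float64 as soon as it contains a missing value.
import pandas as pd
import numpy as np

print(pd.Series([1, 2, 3]).dtype)        # int64
print(pd.Series([1, 2, np.nan]).dtype)   # float64 -- NaN forces the upcast

The actual bug is that the original code imputes X_test into imputed_X_test but then builds the numeric part of the final frame from the raw, unimputed X_test (num_X_test = X_test.drop(object_cols, axis=1)), so the remaining NaNs end up in OH_X_test and predict rejects it. Using imputed_X_test consistently, and restoring its index after imputation so that the final concat aligns on the Id index, resolves the error. The corrected code follows, with the changed lines marked ###FIX: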
#### DATASETS LOAD ####
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
X = pd.read_csv('../input/train.csv', index_col='Id') 
X_test = pd.read_csv('../input/test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)

# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()] 
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

#### IMPUTATION OF MISSING VALUES FOR X_TEST ####
from sklearn.impute import SimpleImputer

# All categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# Fill in the lines below: imputation
my_imputer = SimpleImputer(strategy='most_frequent')
imputed_X_test = pd.DataFrame(my_imputer.fit_transform(X_test))

# Fill in the lines below: imputation removed column names; put them back
imputed_X_test.columns = X_test.columns 
imputed_X_test.index = X_test.index ###FIX

#### ONEHOT ENCODING FOR DATA #####
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(imputed_X_test[low_cardinality_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
OH_cols_test.index = imputed_X_test.index ####FIX

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
num_X_test = imputed_X_test.drop(object_cols, axis=1) ####FIX

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

##### BUILD MODEL AND CREATE SUBMISSION ####
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# normalize datatypes columns
#for colName in  ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']:
#    OH_X_train[colName] = OH_X_train[colName].astype('float64')
#    OH_X_valid[colName] = OH_X_train[colName].astype('float64')

# Build model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(OH_X_train, y_train)
preds_test = model.predict(OH_X_test)

# Save test predictions to file
output = pd.DataFrame({'Id': OH_X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)
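As a final sanity check before calling predict, it can help to verify that the assembled frames are actually free of NaN and infinite values, which is exactly what the error message complains about (a small addition, not part of the original solution):

# Verify that no NaN or infinite values survived the preprocessing.
import numpy as np

for name, frame in [('OH_X_train', OH_X_train), ('OH_X_valid', OH_X_valid), ('OH_X_test', OH_X_test)]:
    n_missing = int(frame.isnull().sum().sum())
    n_inf = int(np.isinf(frame.select_dtypes(include=[np.number])).sum().sum())
    print(f"{name}: {n_missing} missing values, {n_inf} infinite values")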