python sklearn：IndexError：数组的索引太多（too many indices for array）_Python_Pandas_Scikit Learn

python sklearn：IndexError：数组的索引太多（too many indices for array）

python pandas scikit-learn

python sklearn:indexer:'；数组'的索引太多；,python,pandas,scikit-learn,Python,Pandas,Scikit Learn,我是机器学习的新手。我最近遇到了一个问题，并且已经搜索了关于同一主题的StackOverFlow，但仍然无法找到它。谁能看一下吗？非常感谢 #-*- coding:utf-8 -*- import pandas as pd import numpy as np import matplotlib.pyplot as plt data_train = pd.read_excel('py_train.xlsx',index_col=0) test_data = pd.read_excel('py_

我是机器学习的新手。我最近遇到了一个问题，并且已经搜索了关于同一主题的StackOverFlow，但仍然无法找到它。谁能看一下吗？非常感谢

#-*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the training and test spreadsheets; index_col=0 makes the first
# spreadsheet column the row index instead of a data column.
data_train = pd.read_excel('py_train.xlsx',index_col=0)
test_data = pd.read_excel('py_test.xlsx',index_col=0)


from sklearn import preprocessing

# Features: every remaining column after the first one.
# NOTE(review): DataFrame.as_matrix() was deprecated and later removed from
# pandas; .values / .to_numpy() is the replacement -- confirm against the
# installed pandas version.
x = data_train.iloc[:,1:].as_matrix()
# Labels: the slice 0:1 keeps the column axis, so y is a 2-D (n, 1) array
# rather than a flat (n,) array (iloc[:,0] would give the flat form).
y = data_train.iloc[:,0:1].as_matrix()

# Scaled version of the features; this is what the classifier is fitted on.
sx = preprocessing.scale(x)

from sklearn import linear_model
clf = linear_model.LogisticRegression()
clf.fit(sx,y)

# Bare expression: presumably echoes the fitted estimator as notebook output.
clf

上面的代码运行正常，数据也都已经清洗过。我用来拟合的数据类似于：

id  rep a   b   c   d
1   0   1   2   3   4
2   0   2   3   4   5
3   0   3   4   5   6
4   1   4   5   6   7
5   1   5   6   7   8
6   1   6   7   8   9
7   1   7   8   9   10
8   1   8   9   10  11
9   1   9   10  11  12
10  1   10  11  12  13

下面的代码抛出了 IndexError。为什么？我该如何修复它？

谢谢

import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve


def plot_learning_curve(estimator, title, x, y, ylim=None, cv=None, n_jobs=1, 
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """Compute a learning curve for ``estimator`` and optionally plot it.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to ``learning_curve``.
    title : str
        Title of the figure (used only when ``plot`` is true).
    x, y : array-like
        Data and targets forwarded to ``learning_curve``.
    ylim : sequence or None
        When not None it is unpacked into ``plt.ylim``.
    cv, n_jobs, verbose
        Forwarded unchanged to ``learning_curve``.
    train_sizes : array-like
        Forwarded to ``learning_curve``; the default is 20 evenly spaced
        values from 0.05 to 1.0.
    plot : bool
        When true, a figure is created, drawn and shown.

    Returns
    -------
    (midpoint, diff)
        Both are computed from the last training size only, using
        ``upper = train mean + train std`` and
        ``lower = test mean - test std``:
        ``midpoint`` is ``(upper + lower) / 2`` and ``diff`` is
        ``upper - lower``.
    """

    # train_sizes is rebound here to the value returned by learning_curve.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)

    # Reduce over axis 1 so there is one mean/std per training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)   # ylim = limits of the y axis
        plt.xlabel(u"train set size")
        plt.ylabel(u"score")
        # NOTE(review): the y axis is inverted here and inverted again just
        # before plt.show(); the two calls appear to cancel -- confirm intent.
        plt.gca().invert_yaxis()
        plt.grid()    # grid lines

        # Shaded bands: mean +/- one standard deviation for each curve.
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, 
                         alpha=0.1, color="b")       # generates a shaded region 
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, 
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"train set score")    
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"CV score")

        plt.legend(loc="best")

        plt.draw()
        plt.gca().invert_yaxis()
        plt.show()

    # Summary of the gap between the two curves at the last training size.
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff

# NOTE(review): this passes the unscaled x (clf was fitted on sx) together
# with the module-level y built from iloc[:,0:1] -- confirm both are intended.
plot_learning_curve(clf, u"learning_curve", x, y)

详情如下：

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-18-0dc3d0934602> in <module>()
     42     return midpoint, diff
     43 
---> 44 plot_learning_curve(clf, u"learning_curve", x, y)

<ipython-input-18-0dc3d0934602> in plot_learning_curve(estimator, title, x, y, ylim, cv, n_jobs, train_sizes, verbose, plot)
      8 
      9     train_sizes, train_scores, test_scores = learning_curve(
---> 10         estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
     11 
     12     train_scores_mean = np.mean(train_scores, axis=1)

D:\Anaconda3\lib\site-packages\sklearn\learning_curve.py in learning_curve(estimator, X, y, train_sizes, cv, scoring, exploit_incremental_learning, n_jobs, pre_dispatch, verbose, error_score)
    138     X, y = indexable(X, y)
    139     # Make a list since we will be iterating multiple times over the folds
--> 140     cv = list(check_cv(cv, X, y, classifier=is_classifier(estimator)))
    141     scorer = check_scoring(estimator, scoring=scoring)
    142 

D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in check_cv(cv, X, y, classifier)
   1821         if classifier:
   1822             if type_of_target(y) in ['binary', 'multiclass']:
-> 1823                 cv = StratifiedKFold(y, cv)
   1824             else:
   1825                 cv = KFold(_num_samples(y), cv)

D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in __init__(self, y, n_folds, shuffle, random_state)
    567         for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
    568             for label, (_, test_split) in zip(unique_labels, per_label_splits):
--> 569                 label_test_folds = test_folds[y == label]
    570                 # the test split can be too big because we used
    571                 # KFold(max(c, self.n_folds), self.n_folds) instead of

IndexError: too many indices for array

---------------------------------------------------------------------------
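IndexError                                Traceback (most recent call last)
<ipython-input-18-0dc3d0934602> in <module>()
     42     return midpoint, diff
     43 
---> 44 plot_learning_curve(clf, u"learning_curve", x, y)

<ipython-input-18-0dc3d0934602> in plot_learning_curve(estimator, title, x, y, ylim, cv, n_jobs, train_sizes, verbose, plot)
      8 
      9     train_sizes, train_scores, test_scores = learning_curve(
---> 10         estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
     11 
     12     train_scores_mean = np.mean(train_scores, axis=1)

D:\Anaconda3\lib\site-packages\sklearn\learning_curve.py in learning_curve(estimator, X, y, train_sizes, cv, scoring, exploit_incremental_learning, n_jobs, pre_dispatch, verbose, error_score)
    138     X, y = indexable(X, y)
    139     # Make a list since we will be iterating multiple times over the folds
--> 140     cv = list(check_cv(cv, X, y, classifier=is_classifier(estimator)))
    141     scorer = check_scoring(estimator, scoring=scoring)
    142 

D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in check_cv(cv, X, y, classifier)
   1821         if classifier:
   1822             if type_of_target(y) in ['binary', 'multiclass']:
-> 1823                 cv = StratifiedKFold(y, cv)
   1824             else:
   1825                 cv = KFold(_num_samples(y), cv)

D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in __init__(self, y, n_folds, shuffle, random_state)
    567         for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
    568             for label, (_, test_split) in zip(unique_labels, per_label_splits):
--> 569                 label_test_folds = test_folds[y == label]
    570                 # the test split can be too big because we used
    571                 # KFold(max(c, self.n_folds), self.n_folds) instead of

IndexError: too many indices for array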

逻辑回归对两种形式的 y 都能接受，而交叉验证程序似乎只接受一维数组形式的 y 值。你传入的似乎是一个矩阵（二维数组）。

检查差异：

您当前传入的是：

df.iloc[:,0:1].as_matrix()
array([[0],
       [1],
       [2]], dtype=int64)

但改用下面的写法可能更好：

df.iloc[:,0].as_matrix()
array([0, 1, 2], dtype=int64)

您可以试试吗？

逻辑回归对两种形式的 y 都能接受，而交叉验证程序似乎只接受一维数组形式的 y 值。你传入的似乎是一个矩阵（二维数组）。

检查差异：

您当前传入的是：

df.iloc[:,0:1].as_matrix()
array([[0],
       [1],
       [2]], dtype=int64)

但改用下面的写法可能更好：

df.iloc[:,0].as_matrix()
array([0, 1, 2], dtype=int64)

你能试试吗？

你的数据看起来和问题中的表格完全一样吗？@MaximilianPeters 是的，我表中的数据都是 int 类型，只是列更多，与示例没有太大的不同。错误是相同的。你的数据看起来和问题中的表格完全一样吗？@MaximilianPeters 是的，我表中的数据都是 int 类型，只是列更多，与示例没有太大的不同。错误是相同的。非常感谢，您指出了原因和解决方法。我刚才尝试了您的方法，但结果出现了另一个错误：ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0（此求解器需要数据中至少包含两个类别的样本，但数据只包含一个类别：0）。我把下面这些代码放在 def 之前，问题仍然类似；另外，我制作了一个 100 行的示例，错误仍然出现：data_train = shuffle(data_train)  x = data_train.iloc[:,1:].as_matrix()  y = data_train.iloc[:,0].as_matrix()。非常感谢，您指出了原因和解决方法。我刚才尝试了您的方法，但结果出现了另一个错误：ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0（此求解器需要数据中至少包含两个类别的样本，但数据只包含一个类别：0）。我把下面这些代码放在 def 之前，问题仍然类似；另外，我制作了一个 100 行的示例，错误仍然出现：data_train = shuffle(data_train)  x = data_train.iloc[:,1:].as_matrix()  y = data_train.iloc[:,0].as_matrix()