Python sklearn:IndexError:数组的索引太多(too many indices for array)

Python sklearn:IndexError:数组的索引太多(too many indices for array),python,pandas,scikit-learn,Python,Pandas,Scikit Learn,我是机器学习的新手。我最近遇到了一个问题,并且已经搜索了关于同一主题的StackOverFlow,但仍然无法找到它。谁能看一下吗?非常感谢 #-*- coding:utf-8 -*- import pandas as pd import numpy as np import matplotlib.pyplot as plt data_train = pd.read_excel('py_train.xlsx',index_col=0) test_data = pd.read_excel('py_

我是机器学习的新手。我最近遇到了一个问题,已经在 Stack Overflow 上搜索过同一主题,但仍然没有找到解决办法。有人能帮忙看一下吗?非常感谢!

#-*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the training and test spreadsheets; index_col=0 makes the first
# spreadsheet column the row index of each DataFrame.
data_train = pd.read_excel('py_train.xlsx',index_col=0)
test_data = pd.read_excel('py_test.xlsx',index_col=0)


from sklearn import preprocessing

# Features: every remaining column except the first one.
x = data_train.iloc[:,1:].as_matrix()
# Target: the first remaining column.
# NOTE: the 0:1 slice keeps the column axis, so y is a 2-D (n_rows, 1)
# array rather than a flat 1-D vector.
y = data_train.iloc[:,0:1].as_matrix()

# Rescale the feature matrix with sklearn's preprocessing.scale.
sx = preprocessing.scale(x)

from sklearn import linear_model
# Fit a logistic-regression classifier on the scaled features.
clf = linear_model.LogisticRegression()
clf.fit(sx,y)

# Bare expression; presumably echoes the fitted estimator in a notebook cell.
clf
上面的代码运行正常,数据也都已经清洗过了。我用来拟合的数据形式如下,例如:

id  rep a   b   c   d
1   0   1   2   3   4
2   0   2   3   4   5
3   0   3   4   5   6
4   1   4   5   6   7
5   1   5   6   7   8
6   1   6   7   8   9
7   1   7   8   9   10
8   1   8   9   10  11
9   1   9   10  11  12
10  1   10  11  12  13
下面的代码抛出了一个 IndexError。为什么?我该如何修复它?

谢谢

import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve


def plot_learning_curve(estimator, title, x, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """Compute a learning curve for ``estimator`` and optionally plot it.

    Parameters
    ----------
    estimator
        Model handed to ``learning_curve``.
    title
        Title of the figure (only used when ``plot`` is true).
    x, y
        Training data and targets handed to ``learning_curve``.
    ylim
        Optional sequence unpacked into the y-axis limits.
    cv, n_jobs, train_sizes, verbose
        Forwarded unchanged to ``learning_curve``.
    plot
        When true, draw the curves with matplotlib and show the figure.

    Returns
    -------
    tuple
        ``(midpoint, diff)`` evaluated at the last training size, where the
        two quantities compared are "train mean + train std" and
        "CV mean - CV std": ``midpoint`` is their average and ``diff`` is
        the first minus the second.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)

    # Per-training-size statistics across the CV folds (axis 1).
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Edges of the +/- one-standard-deviation band around each mean curve.
    train_lower = train_scores_mean - train_scores_std
    train_upper = train_scores_mean + train_scores_std
    test_lower = test_scores_mean - test_scores_std
    test_upper = test_scores_mean + test_scores_std

    if plot:
        plt.figure()
        axes = plt.gca()
        axes.set_title(title)
        if ylim is not None:
            axes.set_ylim(*ylim)  # ylim = limits of the y axis
        axes.set_xlabel(u"train set size")
        axes.set_ylabel(u"score")
        axes.invert_yaxis()
        axes.grid()  # grid lines

        # Shaded regions showing the spread of the scores.
        axes.fill_between(train_sizes, train_lower, train_upper,
                          alpha=0.1, color="b")
        axes.fill_between(train_sizes, test_lower, test_upper,
                          alpha=0.1, color="r")
        axes.plot(train_sizes, train_scores_mean, 'o-', color="b",
                  label=u"train set score")
        axes.plot(train_sizes, test_scores_mean, 'o-', color="r",
                  label=u"CV score")

        axes.legend(loc="best")

        plt.draw()
        # Second inversion flips the y axis back before the figure is shown.
        axes.invert_yaxis()
        plt.show()

    midpoint = (train_upper[-1] + test_lower[-1]) / 2
    diff = train_upper[-1] - test_lower[-1]
    return midpoint, diff

# Draw the learning curve for clf. The arrays passed are x (the unscaled
# feature matrix, not sx) and y exactly as they were built earlier.
plot_learning_curve(clf, u"learning_curve", x, y)
详情如下:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-18-0dc3d0934602> in <module>()
     42     return midpoint, diff
     43 
---> 44 plot_learning_curve(clf, u"learning_curve", x, y)

<ipython-input-18-0dc3d0934602> in plot_learning_curve(estimator, title, x, y, ylim, cv, n_jobs, train_sizes, verbose, plot)
      8 
      9     train_sizes, train_scores, test_scores = learning_curve(
---> 10         estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
     11 
     12     train_scores_mean = np.mean(train_scores, axis=1)

D:\Anaconda3\lib\site-packages\sklearn\learning_curve.py in learning_curve(estimator, X, y, train_sizes, cv, scoring, exploit_incremental_learning, n_jobs, pre_dispatch, verbose, error_score)
    138     X, y = indexable(X, y)
    139     # Make a list since we will be iterating multiple times over the folds
--> 140     cv = list(check_cv(cv, X, y, classifier=is_classifier(estimator)))
    141     scorer = check_scoring(estimator, scoring=scoring)
    142 

D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in check_cv(cv, X, y, classifier)
   1821         if classifier:
   1822             if type_of_target(y) in ['binary', 'multiclass']:
-> 1823                 cv = StratifiedKFold(y, cv)
   1824             else:
   1825                 cv = KFold(_num_samples(y), cv)

D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in __init__(self, y, n_folds, shuffle, random_state)
    567         for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
    568             for label, (_, test_split) in zip(unique_labels, per_label_splits):
--> 569                 label_test_folds = test_folds[y == label]
    570                 # the test split can be too big because we used
    571                 # KFold(max(c, self.n_folds), self.n_folds) instead of

IndexError: too many indices for array
---------------------------------------------------------------------------
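IndexError                                Traceback (most recent call last)
<ipython-input-18-0dc3d0934602> in <module>()
     42     return midpoint, diff
     43 
---> 44 plot_learning_curve(clf, u"learning_curve", x, y)
<ipython-input-18-0dc3d0934602> in plot_learning_curve(estimator, title, x, y, ylim, cv, n_jobs, train_sizes, verbose, plot)
      8 
      9     train_sizes, train_scores, test_scores = learning_curve(
---> 10         estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
     11 
     12     train_scores_mean = np.mean(train_scores, axis=1)
D:\Anaconda3\lib\site-packages\sklearn\learning_curve.py in learning_curve(estimator, X, y, train_sizes, cv, scoring, exploit_incremental_learning, n_jobs, pre_dispatch, verbose, error_score)
    138     X, y = indexable(X, y)
    139     # Make a list since we will be iterating multiple times over the folds
--> 140     cv = list(check_cv(cv, X, y, classifier=is_classifier(estimator)))
    141     scorer = check_scoring(estimator, scoring=scoring)
    142 
D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in check_cv(cv, X, y, classifier)
   1821         if classifier:
   1822             if type_of_target(y) in ['binary', 'multiclass']:
-> 1823                 cv = StratifiedKFold(y, cv)
   1824             else:
   1825                 cv = KFold(_num_samples(y), cv)
D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in __init__(self, y, n_folds, shuffle, random_state)
    567         for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
    568             for label, (_, test_split) in zip(unique_labels, per_label_splits):
--> 569                 label_test_folds = test_folds[y == label]
    570                 # the test split can be too big because we used
    571                 # KFold(max(c, self.n_folds), self.n_folds) instead of
IndexError: too many indices for array(数组的索引太多)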

LogisticRegression 可以接受这样的输入,但交叉验证例程似乎只接受一维数组形式的 y 值。你传入的似乎是一个矩阵(二维数组)。

检查差异:

您传入的是:

df.iloc[:,0:1].as_matrix()
array([[0],
       [1],
       [2]], dtype=int64)
但改用下面这种写法可能更好:

df.iloc[:,0].as_matrix()
array([0, 1, 2], dtype=int64)

您可以试试吗?

LogisticRegression 可以接受这样的输入,但交叉验证例程似乎只接受一维数组形式的 y 值。你传入的似乎是一个矩阵(二维数组)。

检查差异:

您传入的是:

df.iloc[:,0:1].as_matrix()
array([[0],
       [1],
       [2]], dtype=int64)
但改用下面这种写法可能更好:

df.iloc[:,0].as_matrix()
array([0, 1, 2], dtype=int64)

你能试试吗?

你的数据看起来和问题中的表格完全一样吗? @MaximilianPeters 是的,我表格中的数据都是 int 类型,只是有更多的列,与示例没有太大的不同。错误是相同的。 你的数据看起来和问题中的表格完全一样吗? @MaximilianPeters 是的,我的表中的数据都是 int 类型,只是有更多的列,与示例没有太大的不同。错误是相同的。 非常感谢,您指出了原因和方法。我刚才尝试了您的方法,但结果是另一个错误:ValueError:此求解器需要数据中至少两个类的样本,但是数据只包含一个类:0。 我将这些代码放在 def 之前,问题与此类似: 另外,我制作了一个 100 行的示例,错误仍在发生: data_train=shuffle(data_train) x=data_train.iloc[:,1:].as_matrix() y=data_train.iloc[:,0].as_matrix() 非常感谢,您指出了原因和方法。我刚才尝试了您的方法,但结果是另一个错误:ValueError:此求解器需要数据中至少两个类的样本,但数据仅包含一个类:0。 我将这些代码放在 def 之前,问题与此类似: 另外,我制作了一个 100 行示例,错误仍在发生: data_train=shuffle(data_train) x=data_train.iloc[:,1:].as_matrix() y=data_train.iloc[:,0].as_matrix()