python sklearn:indexer:';数组'的索引太多;
我是机器学习的新手。我最近遇到了一个问题,并且已经搜索了关于同一主题的StackOverFlow,但仍然无法找到它。谁能看一下吗?非常感谢python sklearn:indexer:';数组'的索引太多;,python,pandas,scikit-learn,Python,Pandas,Scikit Learn,我是机器学习的新手。我最近遇到了一个问题,并且已经搜索了关于同一主题的StackOverFlow,但仍然无法找到它。谁能看一下吗?非常感谢 #-*- coding:utf-8 -*- import pandas as pd import numpy as np import matplotlib.pyplot as plt data_train = pd.read_excel('py_train.xlsx',index_col=0) test_data = pd.read_excel('py_
#-*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data_train = pd.read_excel('py_train.xlsx',index_col=0)
test_data = pd.read_excel('py_test.xlsx',index_col=0)
from sklearn import preprocessing
x = data_train.iloc[:,1:].as_matrix()
y = data_train.iloc[:,0:1].as_matrix()
sx = preprocessing.scale(x)
from sklearn import linear_model
clf = linear_model.LogisticRegression()
clf.fit(sx,y)
clf
代码以前运行得很好,并且数据都被清除了。我将数据融入其中,比如:
id rep a b c d
1 0 1 2 3 4
2 0 2 3 4 5
3 0 3 4 5 6
4 1 4 5 6 7
5 1 5 6 7 8
6 1 6 7 8 9
7 1 7 8 9 10
8 1 8 9 10 11
9 1 9 10 11 12
10 1 10 11 12 13
下面的代码显示了一个索引器。为什么?我该如何修复它
谢谢
import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve
def plot_learning_curve(estimator, title, x, y, ylim=None, cv=None, n_jobs=1,
train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
train_sizes, train_scores, test_scores = learning_curve(
estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
if plot:
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim) #ylim=y's limit
plt.xlabel(u"train set size")
plt.ylabel(u"score")
plt.gca().invert_yaxis()
plt.grid() #网格
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
alpha=0.1, color="b") # generates a shaded region
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
alpha=0.1, color="r")
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"train set score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"CV score")
plt.legend(loc="best")
plt.draw()
plt.gca().invert_yaxis()
plt.show()
midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
return midpoint, diff
plot_learning_curve(clf, u"learning_curve", x, y)
详情如下:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-18-0dc3d0934602> in <module>()
42 return midpoint, diff
43
---> 44 plot_learning_curve(clf, u"learning_curve", x, y)
<ipython-input-18-0dc3d0934602> in plot_learning_curve(estimator, title, x, y, ylim, cv, n_jobs, train_sizes, verbose, plot)
8
9 train_sizes, train_scores, test_scores = learning_curve(
---> 10 estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
11
12 train_scores_mean = np.mean(train_scores, axis=1)
D:\Anaconda3\lib\site-packages\sklearn\learning_curve.py in learning_curve(estimator, X, y, train_sizes, cv, scoring, exploit_incremental_learning, n_jobs, pre_dispatch, verbose, error_score)
138 X, y = indexable(X, y)
139 # Make a list since we will be iterating multiple times over the folds
--> 140 cv = list(check_cv(cv, X, y, classifier=is_classifier(estimator)))
141 scorer = check_scoring(estimator, scoring=scoring)
142
D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in check_cv(cv, X, y, classifier)
1821 if classifier:
1822 if type_of_target(y) in ['binary', 'multiclass']:
-> 1823 cv = StratifiedKFold(y, cv)
1824 else:
1825 cv = KFold(_num_samples(y), cv)
D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in __init__(self, y, n_folds, shuffle, random_state)
567 for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
568 for label, (_, test_split) in zip(unique_labels, per_label_splits):
--> 569 label_test_folds = test_folds[y == label]
570 # the test split can be too big because we used
571 # KFold(max(c, self.n_folds), self.n_folds) instead of
IndexError: too many indices for array
---------------------------------------------------------------------------
索引器回溯(最后一次最近调用)
在()
42返回中点,差分
43
--->44绘制学习曲线(clf,u“学习曲线”,x,y)
在绘图学习曲线中(估计器、标题、x、y、ylim、cv、n作业、培训规模、详细信息、绘图)
8.
9培训规模、培训分数、考试分数=学习曲线(
--->10估计器,x,y,cv=cv,n_作业=n_作业,序列尺寸=序列尺寸,详细=详细)
11
12系列分数平均值=np.平均值(系列分数,轴=1)
D:\Anaconda3\lib\site packages\sklearn\learning\u curve.py in learning\u curve(估计器、X、y、训练规模、cv、评分、利用增量学习、n作业、预调度、详细信息、错误评分)
138 X,y=可转位(X,y)
139#列出一个列表,因为我们将在褶皱上重复多次
-->140 cv=列表(检查cv(cv,X,y,分类器=is_分类器(估计器)))
141评分员=检查评分(评估员,评分=评分)
142
D:\Anaconda3\lib\site packages\sklearn\cross\u validation.py in check\u cv(cv、X、y、分类器)
1821如果分类器:
1822如果[‘二进制’、‘多类’]中的_目标(y)的_类型:
->1823 cv=层状褶皱(y,cv)
1824其他:
1825 cv=KFold(_num_样本(y),cv)
D:\Anaconda3\lib\site packages\sklearn\cross\u validation.py处于初始化状态(self、y、n\u folds、shuffle、random\u状态)
567对于test_fold_idx,枚举(zip(*per_label_cvs))中的每个标签分割:
568对于zip中的标签(U,测试_U拆分)(唯一的_U标签,每个_U标签拆分):
-->569标签\测试\折叠=测试\折叠[y==标签]
570#由于我们使用了
571#KFold(最大(c,自n倍),自n倍)代替
索引器:数组的索引太多
逻辑回归接受,交叉验证程序似乎只接受y值的数组。你好像通过了一个矩阵
检查差异:
您正在通过以下步骤:
df.iloc[:,0:1].as_matrix()
array([[0],
[1],
[2]], dtype=int64)
但使用它可能更好
df.iloc[:,0].as_matrix()
array([0, 1, 2], dtype=int64)
您可以试试吗?逻辑回归接受,交叉验证程序似乎只接受y值的数组。你好像通过了一个矩阵 检查差异: 您正在通过以下步骤:
df.iloc[:,0:1].as_matrix()
array([[0],
[1],
[2]], dtype=int64)
但使用它可能更好
df.iloc[:,0].as_matrix()
array([0, 1, 2], dtype=int64)
你能试试吗?你的数据看起来和问题中的表格完全一样吗?@MaximilianPeters是的,我表格中的数据都是int类型,只是有更多的列,与示例没有太大的不同。错误是相同的。你的数据看起来和问题中的表格完全一样吗?@MaximilianPeters是的,我的表中的数据都是int类型,只是有更多的列,与示例没有太大的不同。错误是samethank,您非常了解,您指出了原因和方法。我刚才尝试了您的方法,但结果是另一个错误:ValueError:此解算器需要数据中至少两个类的样本,但是数据只包含一个类:0I将这些代码放在def之前,问题与此类似:另外,我制作了一个100行的示例,错误仍在发生data\u train=shuffle(data\u train)x=data\u train.iloc[:,1:。as\u matrix()y=data\u train.iloc[:,0]。as\u matrix()非常感谢,您指出了原因和方法。我刚才尝试了您的方法,但结果是另一个错误:ValueError:此解算器需要数据中至少两个类的样本,但数据仅包含一个类:0我将这些代码放在def之前,问题与此类似:另外,我制作了一个100行示例,错误仍在发生data\u train=shuffle(data\u train)x=data\u train.iloc[:,1:]。as\u matrix()y=data\u train.iloc[:,0]。as\u matrix()