Python 关于Logistic回归的几个问题
标签:python, machine-learning, logistic-regression
我现在使用 OpenClassroom() 中的训练集来尝试逻辑回归(Logistic Regression,LR)。我只使用 LR,不像那个页面那样同时使用 LR 和牛顿法。下面是我的代码:
from numpy import *
import matplotlib.pyplot as plt
def loadDataSet(xPath='../ex4x.dat', yPath='../ex4y.dat'):
    """Load the feature rows and the labels of the training set.

    Args:
        xPath: Text file with two whitespace-separated feature values per
            line.
        yPath: Text file with one label per line.

    Returns:
        A ``(dataMat, labelMat)`` tuple. ``dataMat`` is a list of
        ``[1.0, x1, x2]`` rows (the leading 1.0 is the intercept term) and
        ``labelMat`` is a flat list of float labels.
    """
    dataMat = []
    labelMat = []
    # Context managers close both files even if parsing raises.
    with open(xPath) as frX:
        for line in frX:
            fields = line.split()
            if not fields:  # tolerate blank or trailing empty lines
                continue
            dataMat.append([1.0, float(fields[0]), float(fields[1])])
    with open(yPath) as frY:
        for line in frY:
            fields = line.split()
            if fields:
                labelMat.append(float(fields[0]))
    return dataMat, labelMat
def sigmoid(inX):
    """Return the logistic function ``1 / (1 + e**-inX)``, element-wise.

    Args:
        inX: A scalar or a NumPy array/matrix.

    Returns:
        A value of the same shape as ``inX``.
    """
    return 1.0 / (1 + exp(-inX))
# def autoNorm(dataSet):
# # newValue = (oldValue-min)/(max-min)
# minVals = min(dataSet)
# maxVals = max(dataSet)
# ranges = list(map(lambda x: x[0]-x[1], zip(maxVals, minVals)))
# normDataSet = zeros(shape(dataSet))
# m,n = shape(dataSet)
# normDataSet = list(map(lambda x: x[0]-x[1], zip(dataSet,tile(minVals, (m,1)))))
# normDataSet = normDataSet/tile(ranges, (m,1))
# return normDataSet, ranges, minVals
def gradDescent(dataMatIn, classLabels, alpha=0.001, maxCycles=100000,
                printEvery=1):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        dataMatIn: Rows of ``[1.0, x1, x2, ...]`` feature values.
        classLabels: Flat sequence of 0/1 labels, one per row.
        alpha: Learning rate.
        maxCycles: Number of gradient steps to take.
        printEvery: Print the cost every this many iterations; 0 or
            ``None`` disables the report.

    Returns:
        The fitted weights as an ``(n, 1)`` matrix.
    """
    # asmatrix rather than the `mat` alias, which NumPy 2.0 removed.
    x = asmatrix(dataMatIn)
    y = asmatrix(classLabels).transpose()
    m, n = shape(x)
    # Start from zero: with all-ones weights and unscaled features x*theta is
    # so large that h saturates at exactly 1.0 and log(1 - h) becomes -inf.
    theta = asmatrix(zeros((n, 1)))
    tiny = 1e-12  # keeps the reported cost finite if h still saturates
    for k in range(maxCycles):
        h = sigmoid(x * theta)
        if printEvery and k % printEvery == 0:
            hSafe = clip(h, tiny, 1 - tiny)
            # Mean cross-entropy; [0, 0] turns the 1x1 matrix into a scalar
            # so that "%f" can format it.
            cost = -(y.T * log(hSafe) + (1 - y).T * log(1 - hSafe))[0, 0] / m
            print("Iteration %d | Cost: %f" % (k, cost))
        theta = theta - alpha * (x.transpose() * (h - y) / m)
    return theta
def plotBestFit(weights):
    """Scatter the training set and draw the fitted decision boundary.

    Args:
        weights: The three fitted weights ``[w0, w1, w2]`` in any array-like
            shape (flat, column vector or matrix).
    """
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    # Boolean mask replaces the per-row loop that split the two classes.
    isPositive = array(labelMat).ravel().astype(int) == 1
    w0, w1, w2 = asarray(weights, dtype=float).ravel()[:3]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(dataArr[isPositive, 1], dataArr[isPositive, 2],
               s=30, c='red', marker='s')
    ax.scatter(dataArr[~isPositive, 1], dataArr[~isPositive, 2],
               s=30, c='green')

    # The boundary w0 + w1*x1 + w2*x2 = 0 is a straight line, so its two end
    # points are enough. Taking min/max on the float column (rather than the
    # builtin min/max over a matrix) yields plain scalars and keeps the
    # right-hand end that arange(min, max, 1) used to drop.
    x = array([dataArr[:, 1].min(), dataArr[:, 1].max()])
    y = (-w0 - w1 * x) / w2
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
# Train on the raw data, report the fitted weights and plot the boundary.
dataMat, classLabel = loadDataSet()
weights = gradDescent(dataMat, classLabel)
# print() as a function call, matching gradDescent; the bare
# `print weights` statement only parses under Python 2.
print(weights)
# getA() hands the weight matrix to the plot as a plain ndarray.
plotBestFit(weights.getA())
以下是我的问题:
1.我对它进行了100000次训练,每次迭代都会打印误差(cost),但我并没有看到它收敛,事实上我也不确定它是否收敛。
2.我不知道如何使用matplotlib正确绘制分类器的决策边界。当maxCycles为200000时,我可以得到一个勉强合理的分类器;而当maxCycles为100000时,绘制出的结果似乎完全不合理。
更新代码:
# Count the training examples whose predicted class differs from the label.
# Iterating over the data itself replaces the hard-coded size of 80.
count = 0
for row, label in zip(dataMat, classLabel):
    # Predict class 1 when the estimated probability exceeds 0.5.
    predicted = 1.0 if sigmoid(row * weights) > 0.5 else 0.0
    if predicted != label[0]:
        count += 1
total = len(classLabel)
errorRate = float(count) / total if total else 0.0
# print() as a function call so the snippet also runs on Python 3.
print("error count is: %f, error rate is: %f" % (count, errorRate))
你的代码实际上很好!以下是一些评论:
1. 你用 1 初始化了 theta(theta = ones((n,1))),这会使 h 得到非常接近 1 的值,因为 theta 和 x 的乘积给出非常大的数字。log(1-h) 的计算可能会导致错误,因为 log 在 0 处没有定义。我更喜欢用 0 初始化 theta。
2. 计算代价(cost)时没有除以 m。这与算法无关,但最好遵循理论。
3. 我在 500,000 次迭代中获得了一个很好的结果:
import numpy as np
import matplotlib.pyplot as plt
def loadDataSet(xPath='../ex4x.dat', yPath='../ex4y.dat'):
    """Load the feature rows and the labels of the training set.

    Args:
        xPath: Text file with two whitespace-separated feature values per
            line.
        yPath: Text file with one label per line.

    Returns:
        A ``(dataMat, labelMat)`` tuple. ``dataMat`` is a list of
        ``[1.0, x1, x2]`` rows (the leading 1.0 is the intercept term) and
        ``labelMat`` is a list of one-element ``[label]`` rows.
    """
    dataMat = []
    labelMat = []
    # Context managers close both files even if parsing raises.
    with open(xPath) as frX:
        for line in frX:
            fields = line.split()
            if not fields:  # tolerate blank or trailing empty lines
                continue
            dataMat.append([1.0, float(fields[0]), float(fields[1])])
    with open(yPath) as frY:
        for line in frY:
            fields = line.split()
            if fields:
                labelMat.append([float(fields[0])])
    return dataMat, labelMat
def sigmoid(inX):
    """Return the logistic function ``1 / (1 + e**-inX)``, element-wise.

    Args:
        inX: A scalar or a NumPy array/matrix.

    Returns:
        A value of the same shape as ``inX``.
    """
    return 1.0 / (1 + np.exp(-inX))
def gradDescent(dataMatIn, classLabels, alpha, maxCycles, plot_cost=True):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        dataMatIn: Rows of ``[1.0, x1, x2, ...]`` feature values.
        classLabels: 0/1 labels, either as one-element rows or as a flat
            sequence.
        alpha: Learning rate.
        maxCycles: Number of gradient steps to take.
        plot_cost: When true, plot the sampled cost values after training.

    Returns:
        The fitted weights as an ``(n, 1)`` matrix.
    """
    # np.asmatrix rather than the np.mat alias, which NumPy 2.0 removed.
    x = np.asmatrix(dataMatIn)
    # Force a column vector so flat labels cannot broadcast to (m, m).
    y = np.asmatrix(classLabels, dtype=float).reshape(-1, 1)
    m, n = np.shape(x)
    theta = np.asmatrix(np.zeros((n, 1)))
    cost_history = []  # cost sampled on every 1000th iteration
    tiny = 1e-12  # keeps log() finite if h saturates at 0 or 1
    for k in range(maxCycles):
        h = sigmoid(x * theta)
        # The cost is only needed for the plot, so compute it only when it
        # is actually recorded.
        if k % 1000 == 0:
            h_safe = np.clip(h, tiny, 1 - tiny)
            cost = -(np.multiply(y, np.log(h_safe))
                     + np.multiply(1 - y, np.log(1 - h_safe))).sum() / m
            cost_history.append(cost)
        grad = (x.transpose() * (h - y)) / m
        theta = theta - alpha * grad
    if plot_cost:
        plt.plot(cost_history)
        plt.title("Cost")
        plt.show()
    return theta
def plotBestFit(dataMat, classLabel, weights):
    """Scatter the two classes and draw the fitted decision boundary.

    Args:
        dataMat: Rows of ``[1.0, x1, x2]`` feature values.
        classLabel: 0/1 labels, one per row (flat or as one-element rows).
        weights: The three fitted weights ``[w0, w1, w2]`` in any array-like
            shape.
    """
    arrX = np.asarray(dataMat, dtype=float)
    arrY = np.asarray(classLabel).reshape(-1)
    w0, w1, w2 = np.asarray(weights, dtype=float).ravel()[:3]
    ind1 = arrY == 1
    ind0 = arrY == 0
    # The boundary w0 + w1*x1 + w2*x2 = 0 is a straight line, so its two end
    # points are enough. Taking min/max on the float column (rather than the
    # builtin min/max over a matrix) yields plain scalars and keeps the
    # right-hand end that arange(min, max, 1) used to drop.
    x1_val = np.array([arrX[:, 1].min(), arrX[:, 1].max()])
    x2_val = (-w0 - w1 * x1_val) / w2
    plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
    plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
    plt.plot(x1_val, x2_val)
    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=18)
    plt.title("Separation border")
    plt.show()
# Train on the unscaled features, then report and plot the result.
dataMat, classLabel = loadDataSet()
weights = gradDescent(
    dataMat,
    classLabel,
    alpha=0.0014,
    maxCycles=500000,
)
print(weights)
plotBestFit(dataMat, classLabel, weights)
更新
在阅读了本回答第一版下方评论中的问题后,我尝试优化代码,以便用更少的迭代次数使代价函数收敛。
事实上,特征标准化效果惊人 :)
仅经过30次迭代,就获得了更好的结果。
以下是新的图:
由于做了标准化,在对每个新的测试样本进行分类之前,都需要先用同样的均值和标准差对其进行缩放。
这是新代码。我更改了一些数据类型,以避免不必要的类型转换。
import numpy as np
import matplotlib.pyplot as plt
def loadDataSet(xPath='../ex4x.dat', yPath='../ex4y.dat'):
    """Load the feature rows and the labels of the training set.

    Args:
        xPath: Text file with two whitespace-separated feature values per
            line.
        yPath: Text file with one label per line.

    Returns:
        A ``(dataMat, labelMat)`` tuple of NumPy arrays. ``dataMat`` holds
        ``[1.0, x1, x2]`` rows (the leading 1.0 is the intercept term) and
        ``labelMat`` holds one-element ``[label]`` rows.
    """
    dataMat = []
    labelMat = []
    # Context managers close both files even if parsing raises.
    with open(xPath) as frX:
        for line in frX:
            fields = line.split()
            if not fields:  # tolerate blank or trailing empty lines
                continue
            dataMat.append([1.0, float(fields[0]), float(fields[1])])
    with open(yPath) as frY:
        for line in frY:
            fields = line.split()
            if fields:
                labelMat.append([float(fields[0])])
    return np.asarray(dataMat), np.asarray(labelMat)
def sigmoid(inX):
    """Return the logistic function ``1 / (1 + e**-inX)``, element-wise.

    Args:
        inX: A scalar or a NumPy array/matrix.

    Returns:
        A value of the same shape as ``inX``.
    """
    return 1.0 / (1 + np.exp(-inX))
def gradDescent(x, y, alpha, maxCycles, plot_cost=True):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: ``(m, n)`` array of feature rows, including the intercept column.
        y: 0/1 labels, one per row; ``(m, 1)`` or flat ``(m,)``.
        alpha: Learning rate.
        maxCycles: Number of gradient steps to take.
        plot_cost: When true, plot the cost of every iteration after
            training.

    Returns:
        The fitted weights as an ``(n, 1)`` array.
    """
    x = np.asarray(x, dtype=float)
    # Force a column vector: a flat (m,) label array would silently
    # broadcast against the (m, 1) predictions to an (m, m) result.
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    m, n = x.shape
    theta = np.zeros((n, 1))
    cost_history = []  # cost of every iteration, in order
    tiny = 1e-12  # keeps log() finite if h saturates at 0 or 1
    for _ in range(maxCycles):
        h = sigmoid(np.dot(x, theta))
        h_safe = np.clip(h, tiny, 1 - tiny)
        cost = -np.sum(y * np.log(h_safe) + (1 - y) * np.log(1 - h_safe)) / m
        cost_history.append(cost)
        grad = np.dot(x.T, h - y) / m
        theta = theta - alpha * grad
    if plot_cost:
        # With no explicit x values the iteration index is used.
        plt.plot(cost_history)
        plt.title("Cost")
        plt.show()
    return theta
def plotBestFit(arrX, arrY, weights):
    """Scatter the two classes and draw the fitted decision boundary.

    Args:
        arrX: Rows of ``[1.0, x1, x2]`` feature values.
        arrY: 0/1 labels, one per row (flat or as one-element rows).
        weights: The three fitted weights ``[w0, w1, w2]`` in any array-like
            shape.
    """
    arrX = np.asarray(arrX, dtype=float)
    labels = np.asarray(arrY).reshape(-1)
    w0, w1, w2 = np.asarray(weights, dtype=float).ravel()[:3]
    ind1 = labels == 1
    ind0 = labels == 0
    # The boundary w0 + w1*x1 + w2*x2 = 0 is a straight line, so its two end
    # points are enough. Taking min/max on the 1-D column (rather than the
    # builtin min/max over a 2-D slice) yields plain scalars and keeps the
    # right-hand end that arange(min, max, 0.1) used to drop.
    x1_val = np.array([arrX[:, 1].min(), arrX[:, 1].max()])
    x2_val = (-w0 - w1 * x1_val) / w2
    plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
    plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
    plt.plot(x1_val, x2_val)
    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=18)
    plt.title("Separation border")
    plt.show()
dataMat, classLabel = loadDataSet()
m = dataMat.shape[0]

# Standardization: subtract the per-column mean and divide by the per-column
# standard deviation. Only columns 1 and 2 are rescaled; column 0 is the
# constant intercept term.
dataMatMean = dataMat.mean(axis=0)
dataMatStd = dataMat.std(axis=0)
dataMatMean_m = np.tile(dataMatMean, (m, 1))
dataMatStd_m = np.tile(dataMatStd, (m, 1))
dataMatStand = dataMat.copy()
dataMatStand[:, 1:3] = (
    (dataMatStand[:, 1:3] - dataMatMean_m[:, 1:3]) / dataMatStd_m[:, 1:3]
)

# Train on the standardized features, then report and plot the result.
weights = gradDescent(dataMatStand, classLabel, alpha=1.0, maxCycles=30)
print(weights)
plotBestFit(dataMatStand, classLabel, weights)
你的代码实际上很好!以下是一些评论:
1. 你用 1 初始化了 theta(theta = ones((n,1))),这会使 h 得到非常接近 1 的值,因为 theta 和 x 的乘积给出非常大的数字。log(1-h) 的计算可能会导致错误,因为 log 在 0 处没有定义。我更喜欢用 0 初始化 theta。
2. 计算代价(cost)时没有除以 m。这与算法无关,但最好遵循理论。
3. 我在 500,000 次迭代中获得了一个很好的结果:
import numpy as np
import matplotlib.pyplot as plt
def loadDataSet(xPath='../ex4x.dat', yPath='../ex4y.dat'):
    """Load the feature rows and the labels of the training set.

    Args:
        xPath: Text file with two whitespace-separated feature values per
            line.
        yPath: Text file with one label per line.

    Returns:
        A ``(dataMat, labelMat)`` tuple. ``dataMat`` is a list of
        ``[1.0, x1, x2]`` rows (the leading 1.0 is the intercept term) and
        ``labelMat`` is a list of one-element ``[label]`` rows.
    """
    dataMat = []
    labelMat = []
    # Context managers close both files even if parsing raises.
    with open(xPath) as frX:
        for line in frX:
            fields = line.split()
            if not fields:  # tolerate blank or trailing empty lines
                continue
            dataMat.append([1.0, float(fields[0]), float(fields[1])])
    with open(yPath) as frY:
        for line in frY:
            fields = line.split()
            if fields:
                labelMat.append([float(fields[0])])
    return dataMat, labelMat
def sigmoid(inX):
    """Return the logistic function ``1 / (1 + e**-inX)``, element-wise.

    Args:
        inX: A scalar or a NumPy array/matrix.

    Returns:
        A value of the same shape as ``inX``.
    """
    return 1.0 / (1 + np.exp(-inX))
def gradDescent(dataMatIn, classLabels, alpha, maxCycles, plot_cost=True):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        dataMatIn: Rows of ``[1.0, x1, x2, ...]`` feature values.
        classLabels: 0/1 labels, either as one-element rows or as a flat
            sequence.
        alpha: Learning rate.
        maxCycles: Number of gradient steps to take.
        plot_cost: When true, plot the sampled cost values after training.

    Returns:
        The fitted weights as an ``(n, 1)`` matrix.
    """
    # np.asmatrix rather than the np.mat alias, which NumPy 2.0 removed.
    x = np.asmatrix(dataMatIn)
    # Force a column vector so flat labels cannot broadcast to (m, m).
    y = np.asmatrix(classLabels, dtype=float).reshape(-1, 1)
    m, n = np.shape(x)
    theta = np.asmatrix(np.zeros((n, 1)))
    cost_history = []  # cost sampled on every 1000th iteration
    tiny = 1e-12  # keeps log() finite if h saturates at 0 or 1
    for k in range(maxCycles):
        h = sigmoid(x * theta)
        # The cost is only needed for the plot, so compute it only when it
        # is actually recorded.
        if k % 1000 == 0:
            h_safe = np.clip(h, tiny, 1 - tiny)
            cost = -(np.multiply(y, np.log(h_safe))
                     + np.multiply(1 - y, np.log(1 - h_safe))).sum() / m
            cost_history.append(cost)
        grad = (x.transpose() * (h - y)) / m
        theta = theta - alpha * grad
    if plot_cost:
        plt.plot(cost_history)
        plt.title("Cost")
        plt.show()
    return theta
def plotBestFit(dataMat, classLabel, weights):
    """Scatter the two classes and draw the fitted decision boundary.

    Args:
        dataMat: Rows of ``[1.0, x1, x2]`` feature values.
        classLabel: 0/1 labels, one per row (flat or as one-element rows).
        weights: The three fitted weights ``[w0, w1, w2]`` in any array-like
            shape.
    """
    arrX = np.asarray(dataMat, dtype=float)
    arrY = np.asarray(classLabel).reshape(-1)
    w0, w1, w2 = np.asarray(weights, dtype=float).ravel()[:3]
    ind1 = arrY == 1
    ind0 = arrY == 0
    # The boundary w0 + w1*x1 + w2*x2 = 0 is a straight line, so its two end
    # points are enough. Taking min/max on the float column (rather than the
    # builtin min/max over a matrix) yields plain scalars and keeps the
    # right-hand end that arange(min, max, 1) used to drop.
    x1_val = np.array([arrX[:, 1].min(), arrX[:, 1].max()])
    x2_val = (-w0 - w1 * x1_val) / w2
    plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
    plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
    plt.plot(x1_val, x2_val)
    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=18)
    plt.title("Separation border")
    plt.show()
# Train on the unscaled features, then report and plot the result.
dataMat, classLabel = loadDataSet()
weights = gradDescent(
    dataMat,
    classLabel,
    alpha=0.0014,
    maxCycles=500000,
)
print(weights)
plotBestFit(dataMat, classLabel, weights)
更新
在阅读了本回答第一版下方评论中的问题后,我尝试优化代码,以便用更少的迭代次数使代价函数收敛。
事实上,特征标准化效果惊人 :)
仅经过30次迭代,就获得了更好的结果。
以下是新的图:
由于做了标准化,在对每个新的测试样本进行分类之前,都需要先用同样的均值和标准差对其进行缩放。
这是新代码。我更改了一些数据类型,以避免不必要的类型转换。
import numpy as np
import matplotlib.pyplot as plt
def loadDataSet(xPath='../ex4x.dat', yPath='../ex4y.dat'):
    """Load the feature rows and the labels of the training set.

    Args:
        xPath: Text file with two whitespace-separated feature values per
            line.
        yPath: Text file with one label per line.

    Returns:
        A ``(dataMat, labelMat)`` tuple of NumPy arrays. ``dataMat`` holds
        ``[1.0, x1, x2]`` rows (the leading 1.0 is the intercept term) and
        ``labelMat`` holds one-element ``[label]`` rows.
    """
    dataMat = []
    labelMat = []
    # Context managers close both files even if parsing raises.
    with open(xPath) as frX:
        for line in frX:
            fields = line.split()
            if not fields:  # tolerate blank or trailing empty lines
                continue
            dataMat.append([1.0, float(fields[0]), float(fields[1])])
    with open(yPath) as frY:
        for line in frY:
            fields = line.split()
            if fields:
                labelMat.append([float(fields[0])])
    return np.asarray(dataMat), np.asarray(labelMat)
def sigmoid(inX):
    """Return the logistic function ``1 / (1 + e**-inX)``, element-wise.

    Args:
        inX: A scalar or a NumPy array/matrix.

    Returns:
        A value of the same shape as ``inX``.
    """
    return 1.0 / (1 + np.exp(-inX))
def gradDescent(x, y, alpha, maxCycles, plot_cost=True):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: ``(m, n)`` array of feature rows, including the intercept column.
        y: 0/1 labels, one per row; ``(m, 1)`` or flat ``(m,)``.
        alpha: Learning rate.
        maxCycles: Number of gradient steps to take.
        plot_cost: When true, plot the cost of every iteration after
            training.

    Returns:
        The fitted weights as an ``(n, 1)`` array.
    """
    x = np.asarray(x, dtype=float)
    # Force a column vector: a flat (m,) label array would silently
    # broadcast against the (m, 1) predictions to an (m, m) result.
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    m, n = x.shape
    theta = np.zeros((n, 1))
    cost_history = []  # cost of every iteration, in order
    tiny = 1e-12  # keeps log() finite if h saturates at 0 or 1
    for _ in range(maxCycles):
        h = sigmoid(np.dot(x, theta))
        h_safe = np.clip(h, tiny, 1 - tiny)
        cost = -np.sum(y * np.log(h_safe) + (1 - y) * np.log(1 - h_safe)) / m
        cost_history.append(cost)
        grad = np.dot(x.T, h - y) / m
        theta = theta - alpha * grad
    if plot_cost:
        # With no explicit x values the iteration index is used.
        plt.plot(cost_history)
        plt.title("Cost")
        plt.show()
    return theta
def plotBestFit(arrX, arrY, weights):
    """Scatter the two classes and draw the fitted decision boundary.

    Args:
        arrX: Rows of ``[1.0, x1, x2]`` feature values.
        arrY: 0/1 labels, one per row (flat or as one-element rows).
        weights: The three fitted weights ``[w0, w1, w2]`` in any array-like
            shape.
    """
    arrX = np.asarray(arrX, dtype=float)
    labels = np.asarray(arrY).reshape(-1)
    w0, w1, w2 = np.asarray(weights, dtype=float).ravel()[:3]
    ind1 = labels == 1
    ind0 = labels == 0
    # The boundary w0 + w1*x1 + w2*x2 = 0 is a straight line, so its two end
    # points are enough. Taking min/max on the 1-D column (rather than the
    # builtin min/max over a 2-D slice) yields plain scalars and keeps the
    # right-hand end that arange(min, max, 0.1) used to drop.
    x1_val = np.array([arrX[:, 1].min(), arrX[:, 1].max()])
    x2_val = (-w0 - w1 * x1_val) / w2
    plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
    plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
    plt.plot(x1_val, x2_val)
    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=18)
    plt.title("Separation border")
    plt.show()
dataMat, classLabel = loadDataSet()
m = dataMat.shape[0]

# Standardization: subtract the per-column mean and divide by the per-column
# standard deviation. Only columns 1 and 2 are rescaled; column 0 is the
# constant intercept term.
dataMatMean = dataMat.mean(axis=0)
dataMatStd = dataMat.std(axis=0)
dataMatMean_m = np.tile(dataMatMean, (m, 1))
dataMatStd_m = np.tile(dataMatStd, (m, 1))
dataMatStand = dataMat.copy()
dataMatStand[:, 1:3] = (
    (dataMatStand[:, 1:3] - dataMatMean_m[:, 1:3]) / dataMatStd_m[:, 1:3]
)

# Train on the standardized features, then report and plot the result.
weights = gradDescent(dataMatStand, classLabel, alpha=1.0, maxCycles=30)
print(weights)
plotBestFit(dataMatStand, classLabel, weights)
我确实在倍频程中尝试过,在100次迭代中我可以得到一个很好的结果,唯一的区别是我使用了内置的fminunc函数。如果不使用特征缩放,达到500.000次的好结果是合理的还是可以接受的?我将学习率改为0.001,迭代次数改为500.000,然后我得到了一个结果“错误计数为:15.000000,错误率为:0.187500“,这是一个好结果还是我可以做些什么来改善结果?顺便说一句,训练集的数量是80@iamcodylee,我更新了我的帖子。我尝试缩放这些特性,现在只需要30次迭代就可以得到结果。使用500.000次迭代确实是一件坏事,但也许这是一个很好的例子,可以深入了解这些内容及其优化。专业系统使用更高效的算法。我想没有人会使用纯梯度法。我只是一个机器学习的新手,所以我想一步一步地研究它,通过特征缩放(我自己在这里使用标准化,不管我使用什么方法缩放特征),尽管它收敛得比以前快得多,但我不知道你是否注意到错误计数/错误率,错误计数从15到40 w/c意味着我得到一半正确一半错误的结果。我认为这不是一个好结果。我能做些什么来提高成绩?我想我没有得到这个问题。你能用一个例子描述一下所提到的关于错误计数/错误率的问题吗?或者你可以编辑你的问题并添加一些新的注释和示例?我确实在八度音阶中尝试过,在100次迭代中我可以得到一个很好的结果,唯一的区别是我使用了内置的fminunc f