Python: several questions about Logistic regression


I am now using the training set from OpenClassroom () to try out logistic regression. I am only using plain LR, unlike that page, which uses LR together with Newton's method. Here is my code:

from numpy import *
import matplotlib.pyplot as plt

def loadDataSet():
    dataMat = []; labelMat = []
    frX = open('../ex4x.dat')
    frY = open('../ex4y.dat')
    for line1 in frX.readlines():
        lineArr1 = line1.strip().split()
        dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])

    for line2 in frY.readlines():
        lineArr2 = line2.strip().split()
        labelMat.append(float(lineArr2[0]))
    return dataMat,labelMat

def sigmoid(inX):
    return 1.0/(1+exp(-inX))

# def autoNorm(dataSet):
# #   newValue = (oldValue-min)/(max-min)
#     minVals = min(dataSet)
#     maxVals = max(dataSet)
#     ranges = list(map(lambda x: x[0]-x[1], zip(maxVals, minVals)))
#     normDataSet = zeros(shape(dataSet))
#     m,n = shape(dataSet)
#     normDataSet = list(map(lambda x: x[0]-x[1], zip(dataSet,tile(minVals, (m,1)))))
#     normDataSet = normDataSet/tile(ranges, (m,1))
#     return normDataSet, ranges, minVals

def gradDescent(dataMatIn, classLabels):
    x = mat(dataMatIn)
    y = mat(classLabels).transpose()
    m,n = shape(x)
    alpha = 0.001
    maxCycles = 100000
    theta = ones((n,1))
    for k in range(maxCycles):
        h = sigmoid(x*theta)
        error = h - y
        cost = (-dot(log(h).T, y) - dot((1 - y).T, log(1 - h)))[0, 0]  # extract the scalar from the 1x1 matrix
        print("Iteration %d | Cost: %f" % (k, cost))
        theta = theta - alpha * (x.transpose() * error /m)
    return theta

def plotBestFit(weights):
    dataMat,labelMat=loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i])== 1:
            xcord1.append(dataArr[i,1]);ycord1.append(dataArr[i,2])
        else:
            xcord2.append(dataArr[i,1]);ycord2.append(dataArr[i,2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    min_x = dataArr[:, 1].min()
    max_x = dataArr[:, 1].max()
    x = arange(min_x, max_x, 1)
    y = (-weights[0]-weights[1]*x)/weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2');
    plt.show()

dataMat, classLabel = loadDataSet()
weights = gradDescent(dataMat, classLabel)
print(weights)
plotBestFit(weights.getA())
Here are my questions:

1. I trained it for 100,000 iterations, printing the cost on every iteration, but I do not see it converging; in fact I am not sure whether it converges at all.
2. I do not know how to plot the classifier correctly with matplotlib. When maxCycles is 200,000 I can get a somewhat reasonable classifier, while with maxCycles at 100,000 the plotted line does not seem reasonable at all.

Updated code:

count = 0
for i in range(80):
    result = sigmoid(dataMat[i] * weights)
    if result > 0.5:
        a = 1
    else:
        a = 0

    if float(a) != classLabel[i]:
        count += 1
errorRate = (float(count)/80)
print "error count is: %f, error rate is: %f" %(count,errorRate)

Your code is actually fine! Here are some comments:

  • You have initialized the thetas to all ones. I would not do that in this example: the first call of the sigmoid function will return values close to 1, because the product of theta and x gives very large numbers. The computation of log(1-h) can then cause an error, because log is not defined at 0. I prefer to initialize the thetas with 0's (see the short sketch after this list).

  • When computing the cost function you missed the division by m; the full expression is J(theta) = -(1/m) * sum(y*log(h) + (1-y)*log(1-h)). It does not matter for the algorithm, but it is better to follow the theory.

  • It is a good idea to plot the cost function rather than just printing its values; the convergence trend can then be seen very clearly.

  • This particular example needs many more iterations to converge. I got a good result at 500,000 iterations.

  • The post has been updated, see the UPDATE below.
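
    To illustrate the first point, here is a minimal sketch of the overflow; the feature values are made up, but they have the same magnitude as the exam scores in the ex4 data:

    import numpy as np

    x = np.array([[1.0, 55.0, 69.0]])              # one sample: intercept plus two exam scores
    theta_ones = np.ones((3, 1))                   # all-ones initialization
    h = 1.0/(1+np.exp(-np.dot(x, theta_ones)))     # x.theta = 125, so h is exactly 1.0 in float64
    print(np.log(1 - h))                           # log(0) -> -inf, the cost becomes inf/nan

    theta_zeros = np.zeros((3, 1))                 # zero initialization keeps h at 0.5
    h = 1.0/(1+np.exp(-np.dot(x, theta_zeros)))
    print(np.log(1 - h))                           # log(0.5) is finite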

    Here is my plot:

    As you can see, the resulting separation line matches the plot shown in the tutorial very well.

    Here is my code. It is a little different from yours, but they are very similar.

    import numpy as np
    import matplotlib.pyplot as plt
    
    def loadDataSet():
        dataMat = []; labelMat = []
        frX = open('../ex4x.dat')
        frY = open('../ex4y.dat')
        for line1 in frX.readlines():
            lineArr1 = line1.strip().split()
            dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])
    
        for line2 in frY.readlines():
            lineArr2 = line2.strip().split()
            labelMat.append([float(lineArr2[0])])
        return dataMat,labelMat
    
    def sigmoid(inX):
        return 1.0/(1+np.exp(-inX))    
    
    def gradDescent(dataMatIn, classLabels, alpha, maxCycles):
        x = np.mat(dataMatIn)
        y = np.mat(classLabels)
        m,n = np.shape(x)
        n = n - 1               #usually n is the number of features (without the 1's)
    
        theta = np.zeros((n+1,1))
    
        cost_history = []       #list to accumulate the cost values
    
        for k in range(maxCycles):
    
            h = sigmoid(x*theta)
    
            cost = ((-np.multiply(y, np.log(h)) -np.multiply(1-y, np.log(1-h))).sum(axis=0)/m)[0, 0]
    
            if ((k % 1000) == 0):
                cost_history.append(cost)   #on each 1000th iteration the cost is saved to a list
    
            grad = (x.transpose() * (h - y))/m
    
            theta = theta - alpha*grad
    
        plot_cost = 1 
        if (plot_cost == 1):
            plt.plot(cost_history)
            plt.title("Cost")
            plt.show()
    
        return theta   
    
    def plotBestFit(dataMat, classLabel, weights):
        arrY = np.asarray(classLabel)
        arrX = np.asarray(dataMat)
        ind1 = np.where(arrY == 1)[0]
        ind0 = np.where(arrY == 0)[0]
    
        min_x1 = arrX[:, 1].min()
        max_x1 = arrX[:, 1].max()
        x1_val = np.arange(min_x1, max_x1, 1)
        x2_val = (-weights[0, 0]-weights[1, 0]*x1_val)/weights[2, 0]
    
        plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
        plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
        plt.plot(x1_val, x2_val)
        plt.xlabel('X1', fontsize=18)
        plt.ylabel('X2', fontsize=18)
        plt.title("Separation border")
        plt.show()
    
    
    dataMat, classLabel = loadDataSet()
    weights = gradDescent(dataMat, classLabel, 0.0014, 500000) 
    
    print(weights)
    plotBestFit(dataMat, classLabel, weights)
    
    UPDATE

    After reading the questions in the comments to the first version of this post, I tried to optimize the code to achieve convergence of the cost function with far fewer iterations.

    Indeed, feature standardization works wonders :)

    An even better result was achieved after only 30 iterations!

    Here are the new plots:

    Because of the standardization, you need to scale each new test example in the same way in order to classify it.

    Here is the new code. I changed some data types to avoid unnecessary data type conversions.

    import numpy as np
    import matplotlib.pyplot as plt
    
    def loadDataSet():
        dataMat = []; labelMat = []
        frX = open('../ex4x.dat')
        frY = open('../ex4y.dat')
        for line1 in frX.readlines():
            lineArr1 = line1.strip().split()
            dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])
    
        for line2 in frY.readlines():
            lineArr2 = line2.strip().split()
            labelMat.append([float(lineArr2[0])])
    
        return np.asarray(dataMat), np.asarray(labelMat)
    
    def sigmoid(inX):
        return 1.0/(1+np.exp(-inX))    
    
    def gradDescent(x, y, alpha, maxCycles):
    
        m,n = np.shape(x)
        n = n - 1               #usually n is the number of features (without the 1's)
    
        theta = np.zeros((n+1,1))
    
        cost_history = []       #list to accumulate the cost values
        cost_iter = []
    
        for k in range(maxCycles):
    
            h = sigmoid(np.dot(x, theta))
    
            cost = np.sum(-np.multiply(y, np.log(h)) -np.multiply(1-y, np.log(1-h)))/m
    
    
            cost_history.append(cost)   #the cost is now saved on every iteration
            cost_iter.append(k)
    
            grad = np.dot(x.transpose(), (h - y))/m
    
            theta = theta - alpha*grad
    
        plot_cost = 1 
        if (plot_cost == 1):
            plt.plot(cost_iter, cost_history)
            plt.title("Cost")
            plt.show()
    
        return theta   
    
    def plotBestFit(arrX, arrY, weights):
    
        ind1 = np.where(arrY == 1)[0]
        ind0 = np.where(arrY == 0)[0]
    
        min_x1 = arrX[:, 1].min()
        max_x1 = arrX[:, 1].max()
        x1_val = np.arange(min_x1, max_x1, 0.1)
        x2_val = (-weights[0, 0]-weights[1, 0]*x1_val)/weights[2, 0]
    
        plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
        plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
        plt.plot(x1_val, x2_val)
        plt.xlabel('X1', fontsize=18)
        plt.ylabel('X2', fontsize=18)
        plt.title("Separation border")
        plt.show()
    
    
    dataMat, classLabel = loadDataSet()
    m = np.shape(dataMat)[0]
    
    #standardization
    dataMatMean = np.mean(dataMat, axis=0)
    dataMatStd = np.std(dataMat, axis=0)
    
    dataMatMean_m = np.tile(dataMatMean, (m, 1))
    dataMatStd_m = np.tile(dataMatStd, (m, 1))
    
    dataMatStand = np.copy(dataMat)
    dataMatStand[:, 1:3] = np.divide(  (dataMatStand[:, 1:3] - dataMatMean_m[:, 1:3]),   dataMatStd_m[:, 1:3])
    
    weights = gradDescent(dataMatStand, classLabel, 1.0, 30) 
    
    print(weights)
    plotBestFit(dataMatStand, classLabel, weights)
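
    As mentioned above, a new test example must be scaled with the training mean and std before classification. A minimal sketch, reusing dataMatMean, dataMatStd, weights and sigmoid from the script above (the exam scores 60 and 70 are made-up values):

    new_example = np.array([[1.0, 60.0, 70.0]])    # intercept plus two raw exam scores
    new_scaled = np.copy(new_example)
    new_scaled[:, 1:3] = (new_example[:, 1:3] - dataMatMean[1:3]) / dataMatStd[1:3]

    prob = sigmoid(np.dot(new_scaled, weights))[0, 0]
    print("predicted class:", 1 if prob > 0.5 else 0)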
    


    I did try it in Octave, and there I could get a good result within 100 iterations; the only difference is that I used the built-in fminunc function. Is needing 500,000 iterations for a good result reasonable or acceptable when feature scaling is not used? I changed the learning rate to 0.001 and the number of iterations to 500,000, and then I got "error count is: 15.000000, error rate is: 0.187500". Is that a good result, or is there something I can do to improve it? By the way, the size of the training set is 80.

    @iamcodylee, I updated my post. I tried scaling the features, and now only 30 iterations are needed to get the result. Using 500,000 iterations is indeed a bad thing, but maybe it is a good example for getting some insight into these matters and their optimization. Professional systems use much more efficient algorithms; I guess nobody uses the pure gradient method.

    I am just a newbie in machine learning, so I want to study it step by step. With feature scaling (I used standardization here myself, whichever method I use to scale the features), although it converges much faster than before, I don't know whether you noticed the error count/error rate: the error count went from 15 to 40, which means I get half of the results right and half wrong. I don't think that is a good result. What can I do to improve it?

    I think I did not get the problem. Could you describe the mentioned issue about the error count/error rate with an example? Or you could edit your question and add some new comments and examples.