Python: several questions about Logistic regression


I am now using the training set from OpenClassroom () to try out logistic regression. I am only using plain LR, unlike that page, which uses LR together with Newton's method. Here is my code:

from numpy import *
import matplotlib.pyplot as plt

def loadDataSet():
    dataMat = []; labelMat = []
    frX = open('../ex4x.dat')
    frY = open('../ex4y.dat')
    for line1 in frX.readlines():
        lineArr1 = line1.strip().split()
        dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])

    for line2 in frY.readlines():
        lineArr2 = line2.strip().split()
        labelMat.append(float(lineArr2[0]))
    return dataMat,labelMat

def sigmoid(inX):
    return 1.0/(1+exp(-inX))

# def autoNorm(dataSet):
# #   newValue = (oldValue-min)/(max-min)
#     minVals = min(dataSet)
#     maxVals = max(dataSet)
#     ranges = list(map(lambda x: x[0]-x[1], zip(maxVals, minVals)))
#     normDataSet = zeros(shape(dataSet))
#     m,n = shape(dataSet)
#     normDataSet = list(map(lambda x: x[0]-x[1], zip(dataSet,tile(minVals, (m,1)))))
#     normDataSet = normDataSet/tile(ranges, (m,1))
#     return normDataSet, ranges, minVals

def gradDescent(dataMatIn, classLabels):
    x = mat(dataMatIn)
    y = mat(classLabels).transpose()
    m,n = shape(x)
    alpha = 0.001
    maxCycles = 100000
    theta = ones((n,1))
    for k in range(maxCycles):
        h = sigmoid(x*theta)
        error = h - y
        cost = (-dot(log(h).T, y) - dot((1 - y).T, log(1 - h)))[0, 0]  # extract the scalar from the 1x1 matrix
        print("Iteration %d | Cost: %f" % (k, cost))
        theta = theta - alpha * (x.transpose() * error /m)
    return theta

def plotBestFit(weights):
    dataMat,labelMat=loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i])== 1:
            xcord1.append(dataArr[i,1]);ycord1.append(dataArr[i,2])
        else:
            xcord2.append(dataArr[i,1]);ycord2.append(dataArr[i,2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    min_x = dataArr[:, 1].min()
    max_x = dataArr[:, 1].max()
    x = arange(min_x, max_x, 1)
    y = (-weights[0]-weights[1]*x)/weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2');
    plt.show()

dataMat, classLabel = loadDataSet()
weights = gradDescent(dataMat, classLabel)
print(weights)
plotBestFit(weights.getA())
Here are my questions:

1. I trained it for 100,000 iterations, printing the cost on every iteration, but I do not see it converging; in fact I am not sure whether it converges at all.
2. I do not know how to plot the classifier correctly with matplotlib. When maxCycles is 200,000 I can get a somewhat reasonable classifier, while with maxCycles at 100,000 the plotted line does not seem reasonable at all.

Updated code:

count = 0
for i in range(80):
    result = sigmoid(dataMat[i] * weights)
    if result > 0.5:
        a = 1
    else:
        a = 0

    if float(a) != classLabel[i]:
        count += 1
errorRate = (float(count)/80)
print "error count is: %f, error rate is: %f" %(count,errorRate)

Your code is actually fine! Here are some comments:

  • You have initialized the thetas to all ones. I would not do that in this example: the first call of the sigmoid function will return values close to 1, because the product of theta and x gives very large numbers. The computation of log(1-h) can then cause an error, because log is not defined at 0. I prefer to initialize the thetas with 0's (see the short sketch after this list).

  • When computing the cost function you missed the division by m; the full expression is J(theta) = -(1/m) * sum(y*log(h) + (1-y)*log(1-h)). It does not matter for the algorithm, but it is better to follow the theory.

  • It is a good idea to plot the cost function rather than just printing its values; the convergence trend can then be seen very clearly.

  • This particular example needs many more iterations to converge. I got a good result at 500,000 iterations.

  • The post has been updated, see the UPDATE below.
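
    To illustrate the first point, here is a minimal sketch of the overflow; the feature values are made up, but they have the same magnitude as the exam scores in the ex4 data:

    import numpy as np

    x = np.array([[1.0, 55.0, 69.0]])              # one sample: intercept plus two exam scores
    theta_ones = np.ones((3, 1))                   # all-ones initialization
    h = 1.0/(1+np.exp(-np.dot(x, theta_ones)))     # x.theta = 125, so h is exactly 1.0 in float64
    print(np.log(1 - h))                           # log(0) -> -inf, the cost becomes inf/nan

    theta_zeros = np.zeros((3, 1))                 # zero initialization keeps h at 0.5
    h = 1.0/(1+np.exp(-np.dot(x, theta_zeros)))
    print(np.log(1 - h))                           # log(0.5) is finite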

    Here is my plot:

    As you can see, the resulting separation line matches the plot shown in the tutorial very well.

    Here is my code. It is a little different from yours, but they are very similar.

    import numpy as np
    import matplotlib.pyplot as plt
    
    def loadDataSet():
        dataMat = []; labelMat = []
        frX = open('../ex4x.dat')
        frY = open('../ex4y.dat')
        for line1 in frX.readlines():
            lineArr1 = line1.strip().split()
            dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])
    
        for line2 in frY.readlines():
            lineArr2 = line2.strip().split()
            labelMat.append([float(lineArr2[0])])
        return dataMat,labelMat
    
    def sigmoid(inX):
        return 1.0/(1+np.exp(-inX))    
    
    def gradDescent(dataMatIn, classLabels, alpha, maxCycles):
        x = np.mat(dataMatIn)
        y = np.mat(classLabels)
        m,n = np.shape(x)
        n = n - 1               #usually n is the number of features (without the 1's)
    
        theta = np.zeros((n+1,1))
    
        cost_history = []       #list to accumulate the cost values
    
        for k in range(maxCycles):
    
            h = sigmoid(x*theta)
    
            cost = ((-np.multiply(y, np.log(h)) -np.multiply(1-y, np.log(1-h))).sum(axis=0)/m)[0, 0]
    
            if ((k % 1000) == 0):
                cost_history.append(cost)   #on each 1000th iteration the cost is saved to a list
    
            grad = (x.transpose() * (h - y))/m
    
            theta = theta - alpha*grad
    
        plot_cost = 1 
        if (plot_cost == 1):
            plt.plot(cost_history)
            plt.title("Cost")
            plt.show()
    
        return theta   
    
    def plotBestFit(dataMat, classLabel, weights):
        arrY = np.asarray(classLabel)
        arrX = np.asarray(dataMat)
        ind1 = np.where(arrY == 1)[0]
        ind0 = np.where(arrY == 0)[0]
    
        min_x1 = arrX[:, 1].min()
        max_x1 = arrX[:, 1].max()
        x1_val = np.arange(min_x1, max_x1, 1)
        x2_val = (-weights[0, 0]-weights[1, 0]*x1_val)/weights[2, 0]
    
        plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
        plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
        plt.plot(x1_val, x2_val)
        plt.xlabel('X1', fontsize=18)
        plt.ylabel('X2', fontsize=18)
        plt.title("Separation border")
        plt.show()
    
    
    dataMat, classLabel = loadDataSet()
    weights = gradDescent(dataMat, classLabel, 0.0014, 500000) 
    
    print(weights)
    plotBestFit(dataMat, classLabel, weights)
    
    UPDATE

    After reading the questions in the comments to the first version of this post, I tried to optimize the code to achieve convergence of the cost function with far fewer iterations.

    Indeed, feature standardization works wonders :)

    An even better result was achieved after only 30 iterations!

    Here are the new plots:

    Because of the standardization, you need to scale each new test example in the same way in order to classify it.

    Here is the new code. I changed some data types to avoid unnecessary data type conversions.

    import numpy as np
    import matplotlib.pyplot as plt
    
    def loadDataSet():
        dataMat = []; labelMat = []
        frX = open('../ex4x.dat')
        frY = open('../ex4y.dat')
        for line1 in frX.readlines():
            lineArr1 = line1.strip().split()
            dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])
    
        for line2 in frY.readlines():
            lineArr2 = line2.strip().split()
            labelMat.append([float(lineArr2[0])])
    
        return np.asarray(dataMat), np.asarray(labelMat)
    
    def sigmoid(inX):
        return 1.0/(1+np.exp(-inX))    
    
    def gradDescent(x, y, alpha, maxCycles):
    
        m,n = np.shape(x)
        n = n - 1               #usually n is the number of features (without the 1's)
    
        theta = np.zeros((n+1,1))
    
        cost_history = []       #list to accumulate the cost values
        cost_iter = []
    
        for k in range(maxCycles):
    
            h = sigmoid(np.dot(x, theta))
    
            cost = np.sum(-np.multiply(y, np.log(h)) -np.multiply(1-y, np.log(1-h)))/m
    
    
            cost_history.append(cost)   #the cost is now saved on every iteration
            cost_iter.append(k)
    
            grad = np.dot(x.transpose(), (h - y))/m
    
            theta = theta - alpha*grad
    
        plot_cost = 1 
        if (plot_cost == 1):
            plt.plot(cost_iter, cost_history)
            plt.title("Cost")
            plt.show()
    
        return theta   
    
    def plotBestFit(arrX, arrY, weights):
    
        ind1 = np.where(arrY == 1)[0]
        ind0 = np.where(arrY == 0)[0]
    
        min_x1 = arrX[:, 1].min()
        max_x1 = arrX[:, 1].max()
        x1_val = np.arange(min_x1, max_x1, 0.1)
        x2_val = (-weights[0, 0]-weights[1, 0]*x1_val)/weights[2, 0]
    
        plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
        plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
        plt.plot(x1_val, x2_val)
        plt.xlabel('X1', fontsize=18)
        plt.ylabel('X2', fontsize=18)
        plt.title("Separation border")
        plt.show()
    
    
    dataMat, classLabel = loadDataSet()
    m = np.shape(dataMat)[0]
    
    #standardization
    dataMatMean = np.mean(dataMat, axis=0)
    dataMatStd = np.std(dataMat, axis=0)
    
    dataMatMean_m = np.tile(dataMatMean, (m, 1))
    dataMatStd_m = np.tile(dataMatStd, (m, 1))
    
    dataMatStand = np.copy(dataMat)
    dataMatStand[:, 1:3] = np.divide(  (dataMatStand[:, 1:3] - dataMatMean_m[:, 1:3]),   dataMatStd_m[:, 1:3])
    
    weights = gradDescent(dataMatStand, classLabel, 1.0, 30) 
    
    print(weights)
    plotBestFit(dataMatStand, classLabel, weights)
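
    As mentioned above, a new test example must be scaled with the training mean and std before classification. A minimal sketch, reusing dataMatMean, dataMatStd, weights and sigmoid from the script above (the exam scores 60 and 70 are made-up values):

    new_example = np.array([[1.0, 60.0, 70.0]])    # intercept plus two raw exam scores
    new_scaled = np.copy(new_example)
    new_scaled[:, 1:3] = (new_example[:, 1:3] - dataMatMean[1:3]) / dataMatStd[1:3]

    prob = sigmoid(np.dot(new_scaled, weights))[0, 0]
    print("predicted class:", 1 if prob > 0.5 else 0)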
    


    I did try it in Octave, and there I could get a good result within 100 iterations; the only difference is that I used the built-in fminunc function. Is needing 500,000 iterations for a good result reasonable or acceptable when feature scaling is not used? I changed the learning rate to 0.001 and the number of iterations to 500,000, and then I got "error count is: 15.000000, error rate is: 0.187500". Is that a good result, or is there something I can do to improve it? By the way, the size of the training set is 80.

    @iamcodylee, I updated my post. I tried scaling the features, and now only 30 iterations are needed to get the result. Using 500,000 iterations is indeed a bad thing, but maybe it is a good example for getting some insight into these matters and their optimization. Professional systems use much more efficient algorithms; I guess nobody uses the pure gradient method.

    I am just a newbie in machine learning, so I want to study it step by step. With feature scaling (I used standardization here myself, whichever method I use to scale the features), although it converges much faster than before, I don't know whether you noticed the error count/error rate: the error count went from 15 to 40, which means I get half of the results right and half wrong. I don't think that is a good result. What can I do to improve it?

    I think I did not get the problem. Could you describe the mentioned issue about the error count/error rate with an example? Or you could edit your question and add some new comments and examples.