Gradient descent algorithm for linear regression does not optimize the y-intercept parameter

I am following Andrew Ng's machine learning course on Coursera and tried to implement the gradient descent algorithm in Python. I am having trouble with the y-intercept parameter because its resulting value does not look optimal. Here is my code:

# IMPORTS
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Acquiring Data
# Source: https://github.com/mattnedrich/GradientDescentExample
data = pd.read_csv('data.csv')

def cost_function(a, b, x_values, y_values):
    '''
    Calculates the mean squared error (divided by 2) for a given dataset
    with (x, y) pairs and the model y' = a + bx

    a: y-intercept for the model
    b: slope of the curve
    x_values, y_values: points (x, y) of the dataset
    '''
    data_len = len(x_values)
    total_error = sum([((a + b * x_values[i]) - y_values[i])**2
                       for i in range(data_len)])
    return total_error / (2 * float(data_len))

def a_gradient(a, b, x_values, y_values):
    '''
    Partial derivative of the cost_function with respect to 'a'

    a, b: values for 'a' and 'b'
    x_values, y_values: points (x, y) of the dataset
    '''
    data_len = len(x_values)
    a_gradient = sum([((a + b * x_values[i]) - y_values[i])
                      for i in range(data_len)])
    return a_gradient / float(data_len)

def b_gradient(a, b, x_values, y_values):
    '''
    Partial derivative of the cost_function with respect to 'b'

    a, b: values for 'a' and 'b'
    x_values, y_values: points (x, y) of the dataset
    '''
    data_len = len(x_values)
    b_gradient = sum([(((a + b * x_values[i]) - y_values[i]) * x_values[i])
                      for i in range(data_len)])
    return b_gradient / float(data_len)

def gradient_descent_step(a_current, b_current, x_values, y_values, alpha):
    '''
    Takes a step in the direction of the minimum of the cost_function
    using the 'a' and 'b' gradients. Returns new values for 'a' and 'b'.

    a_current, b_current: the current values for 'a' and 'b'
    x_values, y_values: points (x, y) of the dataset
    '''
    new_a = a_current - alpha * a_gradient(a_current, b_current, x_values, y_values)
    new_b = b_current - alpha * b_gradient(a_current, b_current, x_values, y_values)
    return (new_a, new_b)

def run_gradient_descent(a, b, x_values, y_values, alpha, precision, plot=False, verbose=False):
    '''
    Runs the gradient_descent_step function and updates (a, b) until
    the value of the cost function varies less than 'precision'.

    a, b: initial values for the parameters a and b of the cost_function
    x_values, y_values: points (x, y) of the dataset
    alpha: learning rate for the algorithm
    precision: value for the algorithm to stop calculating
    '''
    iterations = 0
    delta_cost = cost_function(a, b, x_values, y_values)
    error_list = [delta_cost]
    iteration_list = [0]

    # The loop runs until delta_cost reaches the defined precision.
    # When the variation of the cost_function is small, the function is
    # near its minimum and the parameters 'a' and 'b' are a good guess
    # for modeling the dataset.
    while delta_cost > precision:
        iterations += 1
        iteration_list.append(iterations)

        # Calculates the initial error with the current a, b values
        prev_cost = cost_function(a, b, x_values, y_values)

        # Calculates new values for a and b
        a, b = gradient_descent_step(a, b, x_values, y_values, alpha)

        # Updates the value of the error
        actual_cost = cost_function(a, b, x_values, y_values)
        error_list.append(actual_cost)

        # Calculates the difference between the previous and current error values
        delta_cost = prev_cost - actual_cost

    # Plot the error at each iteration to see how it decreases
    # and print some information about our final results
    if plot:
        plt.plot(iteration_list, error_list, '-')
        plt.title('Error Minimization')
        plt.xlabel('Iteration', fontsize=12)
        plt.ylabel('Error', fontsize=12)
        plt.show()

    if verbose:
        print('Iterations = ' + str(iterations))
        print('Cost Function Value = ' + str(cost_function(a, b, x_values, y_values)))
        print('a = ' + str(a) + ' and b = ' + str(b))

    return (actual_cost, a, b)


When I run the algorithm with:

run_gradient_descent(0, 0, data['x'], data['y'], 0.0001, 0.01)

I get (a = 0.0496688656535, b = 1.47825808018).

But the optimal value for 'a' is around 7.9 (tried another resource for linear regression).

Also, if I change the initial guess for the parameter 'a', the algorithm only really tries to adjust the parameter 'b'.

For example, if I set a = 200 and b = 0:

run_gradient_descent(200, 0, data['x'], data['y'], 0.0001, 0.01)

I get (a = 199.933763331 and b = 2.44824996193).


I couldn't find anything wrong with the code itself, and I realized that the problem is the initial guess for the parameter a. See my own answer below, where I define a helper function that produces a range of values to search for the initial a guess.
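For reference, these are the cost function and the partial derivatives that the code above implements, written out; note that the factor 1/2 in the cost cancels the 2 produced by differentiating the square:

J(a, b) = \frac{1}{2m} \sum_{i=1}^{m} \big( (a + b x_i) - y_i \big)^2

\frac{\partial J}{\partial a} = \frac{1}{m} \sum_{i=1}^{m} \big( (a + b x_i) - y_i \big)

\frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^{m} \big( (a + b x_i) - y_i \big) \, x_i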

Gradient descent does not guarantee finding the global optimum. Your chances of finding the global optimum depend on your starting values. To get the true values of the parameters, I first solved the least squares problem, which guarantees the global minimum:

data = pd.read_csv('data.csv', header=None)  # header=None: the file has no header row

x, y = data[0], data[1]

from scipy.stats import linregress

linregress(x, y)
which gave the following statistics:

LinregressResult(slope=1.32243102275536, intercept=7.9910209822703848, rvalue=0.77372849988782377, pvalue=3.855655536990139e-21, stderr=0.109377979589804)
Thus b = 1.32243102275536 and a = 7.9910209822703848.
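Those numbers can be cross-checked in closed form with the normal equations; here is a minimal sketch of mine using numpy (assuming the same headerless data.csv), not part of the original answer:

import numpy as np
import pandas as pd

data = pd.read_csv('data.csv', header=None)
x, y = data[0].values, data[1].values

# Design matrix with a column of ones for the intercept 'a'
X = np.column_stack([np.ones_like(x), x])

# Solve the least squares problem X @ [a, b] ~= y directly
(a, b), *_ = np.linalg.lstsq(X, y, rcond=None)
print(a, b)  # should agree with linregress: a ~ 7.991, b ~ 1.322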
Knowing this, I solved the problem several times with your code, using random starting values for a and b:

a, b = np.random.rand() * 10, np.random.rand() * 10

print("Initial values of parameters: ")
print("a=%f\tb=%f" % (a, b))

run_gradient_descent(a, b, x, y, 1e-4, 1e-2)
Here is the solution I got:

Initial values of parameters: 
a=6.100305  b=2.606448

Iterations = 21
Cost Function Value = 55.2093808263
a = 6.07601889437 and b = 1.36310312751
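A small loop could automate those random restarts and keep the best run; this is a hypothetical sketch of mine on top of run_gradient_descent (the restart count of 20 is an arbitrary choice):

import numpy as np

best = (np.inf, None, None)  # (cost, a, b) of the best run so far

for _ in range(20):
    a0, b0 = np.random.rand() * 10, np.random.rand() * 10
    cost, a, b = run_gradient_descent(a0, b0, x, y, 1e-4, 1e-2)
    if cost < best[0]:
        best = (cost, a, b)

print("Best found: cost=%f, a=%f, b=%f" % best)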
So the reason for not being able to get close to the minimum seems to be the choice of the initial parameter values. You will see it yourself if you plug the a and b obtained from least squares into your gradient descent algorithm: it will iterate only once and stay where it is.
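That claim is easy to verify with a snippet like this (my own, using the linregress values from above):

# Starting from the least squares optimum, delta_cost drops below the
# precision after a single step, so the loop exits immediately
run_gradient_descent(7.9910209822703848, 1.32243102275536, x, y, 1e-4, 1e-2, verbose=True)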


At some point, delta_cost > precision becomes False, the loop stops there, and the result is taken to be a (local) optimum. If you lower the precision and run it long enough, you may be able to find the global optimum.
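An alternative is to stop on the size of the gradient itself rather than on the change in cost; here is a sketch of that variant (run_gradient_descent_v2 and its max_iterations cap are my additions, not part of the original code):

from math import hypot

def run_gradient_descent_v2(a, b, x_values, y_values, alpha,
                            precision, max_iterations=100000):
    '''Variant that stops when the gradient is nearly zero,
    with an iteration cap as a safety net.'''
    for _ in range(max_iterations):
        grad_a = a_gradient(a, b, x_values, y_values)
        grad_b = b_gradient(a, b, x_values, y_values)

        # Stop when the gradient vector is close to zero, i.e. when
        # we are near a stationary point of the cost function
        if hypot(grad_a, grad_b) < precision:
            break

        a, b = a - alpha * grad_a, b - alpha * grad_b

    return (cost_function(a, b, x_values, y_values), a, b)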

The full code of my gradient descent implementation can be found in my Github repository:

Considering what @relay said, namely that the gradient descent algorithm does not guarantee finding the global minima, I tried to come up with a helper function that limits the guesses for the parameter a within a certain search range, as follows:

def search_range(x, y, plot=False):
    '''
    Given a dataset with points (x, y), searches for a best guess for
    the initial values of 'a'.
    '''
    data_length = len(x)             # Total size of the dataset
    q_length = int(data_length / 4)  # Size of a quartile of the dataset

    # Finding the max and min values for y in the first quartile
    min_Q1 = (x[0], y[0])
    max_Q1 = (x[0], y[0])

    for i in range(q_length):
        temp_point = (x[i], y[i])
        if temp_point[1] < min_Q1[1]:
            min_Q1 = temp_point
        if temp_point[1] > max_Q1[1]:
            max_Q1 = temp_point

    # Finding the max and min values for y in the 4th quartile
    min_Q4 = (x[data_length - 1], y[data_length - 1])
    max_Q4 = (x[data_length - 1], y[data_length - 1])

    for i in range(data_length - 1, data_length - q_length, -1):
        temp_point = (x[i], y[i])
        if temp_point[1] < min_Q4[1]:
            min_Q4 = temp_point
        if temp_point[1] > max_Q4[1]:
            max_Q4 = temp_point

    mean_Q4 = (((min_Q4[0] + max_Q4[0]) / 2), ((min_Q4[1] + max_Q4[1]) / 2))

    # Finding max_y and min_y given the points found above.
    # Two lines need to be defined, L1 and L2.
    # L1 will pass through min_Q1 and mean_Q4
    # L2 will pass through max_Q1 and mean_Q4

    # Calculating the slopes of L1 and L2, given m = Delta(y) / Delta(x)
    slope_L1 = (min_Q1[1] - mean_Q4[1]) / (min_Q1[0] - mean_Q4[0])
    slope_L2 = (max_Q1[1] - mean_Q4[1]) / (max_Q1[0] - mean_Q4[0])

    # Calculating the y-intercepts of L1 and L2, given the line equation y = mx + b.
    # Float numbers are converted to int because they will be used as a range for iteration.
    y_L1 = int(min_Q1[1] - min_Q1[0] * slope_L1)
    y_L2 = int(max_Q1[1] - max_Q1[0] * slope_L2)

    # Plotting L1 and L2 over the dataset passed in as arguments
    if plot:
        L1 = [(y_L1 + slope_L1 * x_i) for x_i in x]
        L2 = [(y_L2 + slope_L2 * x_i) for x_i in x]

        plt.plot(x, y, '.')
        plt.plot(x, L1, '-', color='r')
        plt.plot(x, L2, '-', color='r')
        plt.title('Scatterplot of Sample Data')
        plt.xlabel('x', fontsize=12)
        plt.ylabel('y', fontsize=12)
        plt.show()

    return y_L1, y_L2
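As a quick usage illustration (my own example, not part of the original answer), the helper can be called directly on the dataset from the question to see the two bounding lines and the resulting range:

a_low, a_high = search_range(data['x'], data['y'], plot=True)
print('Initial guesses for a will be searched in [%d, %d)' % (a_low, a_high))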
Running the code with

run_search_gradient_descent(data['x'], data['y'], 0.0001, 0.001, verbose=True)

(the run_search_gradient_descent driver is defined at the end of this answer) I got:

Cost Function = 55.1294483959
a = 8.0259599606 and b = 1.3209768383

For comparison, the linear regression with scipy.stats returned:

a = 7.99102098227 and b = 1.32243102276

Comments on the question:

"I am thinking about implementing something to test some initial values for a and b. From what I remember, the cost_function in this example has only one global minimum, so it should not be possible to get stuck in another local minimum. I think there is a bug in the code, but I can't see it yet. Thanks."

"I changed my answer, please look above."

"Did your solution work? I tried it here and ran into the same problem. I don't think adding that constant in front of the derivative changes the solution. Thanks."

"I suggest you check what a derivative is and try taking the derivative of the error function yourself. Those are not just some random constants; a wrong derivative will change your results."

"No need to get upset. I know what a derivative is and where the 2 comes from; I did the derivatives with pen and paper before writing the code. If you look at my cost function, you will notice it is divided by 2, which cancels the 2 in the derivatives with respect to a and b. I tried your solution anyway, in case I was missing something, but the effect is the same: the optimization behaves no differently and the y-intercept still does not move properly toward the correct value. Did you try the code? Did it work for you? Please post your code and results here. Thanks!"

"I am working on the same problem as you, and... I am stuck at the same step. I tried your program with my input: surprisingly, mine and yours produce the same output. Beyond code-related issues, we may both be forgetting something. If there were a way for me to contact you, maybe we could solve this together. I did not find a way to message you directly here, and I would rather not post my email :D Do you have any suggestions? Thanks."

The run_search_gradient_descent driver used above is defined as follows:
def run_search_gradient_descent(x_values, y_values, alpha, precision, verbose=False):
    '''
    Runs run_gradient_descent once for each integer initial guess of 'a'
    in the range returned by search_range(x_values, y_values), always
    starting from b = 0, and keeps the (a, b) pair with the lowest cost.

    x_values, y_values: points (x,y) of the dataset
    alpha: learning rate for the algorithm
    precision: value for the algorithm to stop calculation
    '''
    from math import inf

    a1, a2 = search_range(x_values, y_values)

    best_guess = [inf, 0, 0]

    for a in range(a1, a2):

        cost, linear_coef, slope = run_gradient_descent(a, 0, x_values, y_values, alpha, precision)

        # Saving value for cost_function and parameters (a,b)        
        if cost < best_guess[0]:
            best_guess = [cost, linear_coef, slope]
    if verbose:        
        print('Cost Function = ' + str(best_guess[0]))
        print('a = ' + str(best_guess[1]) + ' and b = ' + str(best_guess[2]))

    return (best_guess[0], best_guess[1], best_guess[2])
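As a final sanity check (my own addition), the cost at the parameters found by the search can be compared with the cost at the least squares solution reported above, using cost_function from the question:

cost, a, b = run_search_gradient_descent(data['x'], data['y'], 0.0001, 0.001)
print(cost)

# Cost at the linregress solution, for comparison
print(cost_function(7.99102098227, 1.32243102276, data['x'], data['y']))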