Incorrect results from a Python regression


I'm a beginner in machine learning (ML) and I don't understand why my regression is wrong, or why the plot is wrong. Here is what I have so far; most of it is taken from an earlier example in the book I'm working from. It would also be great if someone could explain where the cost-function formula comes from.

import time
import csv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np


def read_csv(filepath, bucket=7):

    days_in_year = 365

    freq = {}
    for period in range(0, int(days_in_year / bucket)):
        freq[period] = 0

    with open(filepath, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)  # skip the header row
        for row in csvreader:
            if row[0] == '':
                continue
            t = time.strptime(row[0], "%m/%d/%Y")
            if t.tm_year == 2014 and t.tm_yday < (days_in_year-1):
                freq[int(t.tm_yday / bucket)] += 1
    print("finished reading input")
    return freq


freq = read_csv("311_data.csv")

x_dataset = list(freq.keys())
y_dataset = list(freq.values())

print(x_dataset)
print(y_dataset)


def normalize(data):
    y = np.empty(len(data))
    count = 0
    for x in data:
        y[count] = (x-min(data))/(max(data)-min(data))
        count += 1
    return y


x_dataset = normalize(x_dataset)
y_dataset = normalize(y_dataset)

(x_train, x_test, y_train, y_test) = train_test_split(x_dataset, y_dataset, train_size=0.7, test_size=0.3)
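# (train_test_split shuffles by default, so x_train / y_train come back in random order)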

learning_rate = 0.01
training_epochs = 40
num_coeffs = 9
reg_lambda = 0.

X = tf.placeholder(tf.float32)
Y = tf.placeholder(tf.float32)


def model(X, w):
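    # Sum the terms w[i] * X**i, i.e. evaluate a polynomial of degree num_coeffs - 1.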
    terms = []
    for i in range(num_coeffs):
        term = tf.multiply(w[i], tf.pow(X, i))
        terms.append(term)
    return tf.add_n(terms)


w = tf.Variable([0.] * num_coeffs, name="parameters")
y_model = model(X, w)

cost = tf.div(tf.add(tf.reduce_sum(tf.square(Y - y_model)),
                     tf.multiply(reg_lambda, tf.reduce_sum(tf.square(w)))),
              2 * x_train.size)
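# The cost above is (sum of squared errors + reg_lambda * sum of squared weights)
# divided by 2n: mean squared error with an L2 (ridge) penalty on the coefficients.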

train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

for reg_lambda in np.linspace(0, 1, 100):
    for epoch in range(training_epochs):
        sess.run(train_op, feed_dict={X: x_train, Y: y_train})
    final_cost = sess.run(cost, feed_dict={X: x_test, Y: y_test})
    print('reg lambda ', reg_lambda)
    print('final cost ', final_cost)

w_val = sess.run(w)
print(w_val)
sess.close()

plt.scatter(x_train, y_train)
y_learned = 0

for i in range(num_coeffs):
    y_learned += w_val[i] * np.power(x_train, i)

plt.plot(x_train, y_learned, 'r')
plt.show()
This is what I get when I plot it:


It looks like you're getting the right X, Y coordinates; you're just plotting them out of order. Try plt.scatter instead of plt.plot to draw points instead of a line and verify this. If you still want the line, you can sort the pairs like this:

points = zip(x_train, y_learned)
points = sorted(points, key=lambda p: p[0])
x_plot, y_plot = zip(*points)
plt.plot(x_plot, y_plot, 'r')
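
An equivalent way to get that ordering with NumPy is np.argsort: sort the x values once and reuse the index order everywhere. A minimal sketch, reusing w_val, x_train, y_train and num_coeffs from your script:

# indices that put x_train in ascending order
order = np.argsort(x_train)
x_sorted = x_train[order]

# evaluate the learned polynomial on the sorted x values
y_sorted = sum(w_val[i] * np.power(x_sorted, i) for i in range(num_coeffs))

plt.scatter(x_train, y_train)      # raw points; order doesn't matter here
plt.plot(x_sorted, y_sorted, 'r')  # fitted curve, drawn left to right
plt.show()

On the cost-function question: the expression in your code is the ordinary least-squares cost, sum((y - y_model)^2) / (2n), plus an L2 ("ridge") penalty reg_lambda * sum(w^2) / (2n) that discourages large coefficients; the factor of 2 is just a convention that makes the gradient simpler.

One more thing, unrelated to the plot: the loop for reg_lambda in np.linspace(0, 1, 100) never actually changes the cost. reg_lambda was a plain Python float when cost was built, so the value 0 was baked into the graph as a constant, and rebinding the Python name afterwards has no effect. To really sweep the regularization strength, make it a tf.placeholder, feed it through feed_dict, and re-run the variable initializer between lambdas so each run starts from fresh weights.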