Python SKNN回归考勤预测

Python SKNN回归考勤预测,python,numpy,scikit-learn,neural-network,prediction,Python,Numpy,Scikit Learn,Neural Network,Prediction,我之前问过一个关于同一个问题的问题,但是因为我的方法改变了,我现在有了不同的问题 我当前的代码: from sklearn import preprocessing from openpyxl import load_workbook import numpy as np from numpy import exp, array, random, dot from sklearn.model_selection import train_test_split from sklearn.neura

我之前问过一个关于同一个问题的问题,但是因为我的方法改变了,我现在有了不同的问题

我当前的代码:

from sklearn import preprocessing
from openpyxl import load_workbook
import numpy as np
from numpy import exp, array, random, dot
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Dataset dimensions: rows to read from the sheet and number of feature columns.
rowSize = 200
numColumns = 4

# Read from excel file. Columns are: day-of-year, weekday flag, rain flag,
# temperature (F), and attendance (the regression target).
wb = load_workbook('python_excel_read.xlsx')
sheet_1 = wb["Sheet1"]

date = np.zeros(rowSize)
day = np.zeros(rowSize)
rain = np.zeros(rowSize)
temp = np.zeros(rowSize)
out = np.zeros(rowSize)

# openpyxl cells are 1-indexed, hence the +1 offsets.
for i in range(0, rowSize):
    date[i] = sheet_1.cell(row=i + 1, column=1).value
    day[i] = sheet_1.cell(row=i + 1, column=2).value
    rain[i] = sheet_1.cell(row=i + 1, column=3).value
    temp[i] = sheet_1.cell(row=i + 1, column=4).value
    out[i] = sheet_1.cell(row=i + 1, column=5).value

# Assemble the (rowSize x numColumns) feature matrix and (rowSize x 1) target.
train = np.zeros(shape=(rowSize, numColumns))
t_o = np.zeros(shape=(rowSize, 1))

for i in range(0, rowSize):
    train[i] = [date[i], day[i], rain[i], temp[i]]
    t_o[i] = [out[i]]


X = train
# Output
y = t_o

X_train, X_test, y_train, y_test = train_test_split(X, y)

#### Neural Net
nn = MLPRegressor(
    hidden_layer_sizes=(3,),  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=10000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
# ravel() because MLPRegressor expects a 1-D target, not an (n, 1) column.
nn.fit(X_train, y_train.ravel())


y_pred = nn.predict(X_test)

### Linear Regression (alternative baseline, kept for comparison)
# lm = LinearRegression()
# lm.fit(X_train,y_train)
# y_pred = lm.predict(X_test)

fig = plt.figure()
ax1 = fig.add_subplot(111)
# BUG FIX: the labels were swapped — y_test is the real data and y_pred is
# the network's prediction. Also render the legend so the labels show.
ax1.scatter(X_test[:,0], y_test, s=10, c='r', marker="o", label='real')
ax1.scatter(X_test[:,0], y_pred, s=1, c='b', marker="s", label='NN Prediction')
ax1.legend()
plt.show()

#Calc MSE
# BUG FIX: y_test has shape (n, 1) while y_pred has shape (n,); subtracting
# them directly broadcasts to an (n, n) matrix and yields a meaningless MSE.
# Flatten y_test so the difference is element-wise.
mse = np.square(y_test.ravel() - y_pred).mean()

print(mse)
由此得出的结果表明,对测试数据的预测相当糟糕。因为我是新手,我不确定这是我的数据、模型还是我的编码。根据图,我认为模型对数据是错误的(模型似乎预测了接近线性或平方的东西,而实际数据似乎分散得多)

以下是一些数据点: 格式为一年中的某一天(2是1月2日)、工作日(1)/周末(0)、有雨(1)/无雨(0)、F中的温度、出勤率(这是输出)

我总共有一千多个数据点,但我并没有将它们全部用于培训/测试。一个想法是我需要更多,另一个想法是我需要更多的因素,因为温度/降雨/一周中的某一天对出勤率的影响不够

图为:

我该怎么做才能使我的模型更准确,并给出更好的预测

谢谢

编辑:我添加了更多的数据点和另一个因素。我似乎无法上传excel文件,所以我把数据放在这里,更好地解释了它的格式

编辑: 以下是最新的代码:

from sklearn import preprocessing
from openpyxl import load_workbook
import numpy as np
from numpy import exp, array, random, dot
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut
#Set sizes
rowSize = 500
numColumns = 254

# read  from excel file
wb = load_workbook('python_excel_read.xlsx')
sheet_1 = wb["Sheet1"]

# BUG FIX: renamed `input` -> `features`; `input` shadows the Python builtin.
features = np.zeros(shape=(rowSize,numColumns))
out = np.zeros(rowSize)
# openpyxl cells are 1-indexed; the target sits in the column right after
# the last feature column.
for i in range(0, rowSize):
    for j in range(0,numColumns):
        features[i,j] = sheet_1.cell(row=i + 1, column=j+1).value
    out[i] = sheet_1.cell(row=i + 1, column=numColumns+1).value

# Reshape the target into a column vector (replaces the element-by-element
# copy loop, which did the same thing in O(n) Python iterations).
output = out.reshape(rowSize, 1)


X = features
# Output
y = output

print(X)
print(y)
# Discretize attendance into 4 ordinal classes. The ascending bin order is
# load-bearing: every label written (0..3) is below the next threshold, so
# later masks never re-match already-relabeled entries.
y[y < 500] = 0
y[np.logical_and(y >= 500, y <= 1000)] = 1
y[np.logical_and(y > 1000, y <= 1200)] = 2
y[y > 1200] = 3

# Use cross-validation
#kf = KFold(n_splits = 10, random_state=0)
loo = LeaveOneOut()
# Try different models
clf = svm.SVC()
scaler = StandardScaler()
# Scale inside the pipeline so each CV fold fits the scaler on its own
# training split only (no leakage).
pipe = Pipeline([('scaler', scaler), ('svc', clf)])

accuracy = cross_val_score(pipe, X, y.ravel(), cv = loo, scoring = "accuracy")
print(accuracy.mean())

#y_pred = cross_val_predict(clf, X, y.ravel(), cv = kf)
#cm = confusion_matrix(y, y_pred)
从sklearn导入预处理
从openpyxl导入加载工作簿
将numpy作为np导入
从numpy导入exp、数组、随机、点
从sklearn.model\u选择导入列车\u测试\u拆分
从sklearn.neural_网络导入MLPREGESSOR
从sklearn.preprocessing导入StandardScaler
从sklearn.metrics导入分类报告、混淆矩阵
将matplotlib.pyplot作为plt导入
从sklearn.linear\u模型导入线性回归
从sklearn.model_选择导入KFold
从sklearn.model\u选择导入交叉值\u预测
从sk学习输入svm
从sklearn.model_选择导入交叉值_分数
从sklearn.metrics导入混淆矩阵
从sklearn.preprocessing导入StandardScaler
从sklearn.pipeline导入管道
从sklearn.model_选择导入LeaveOnOut
#设定尺寸
行大小=500
numColumns=254
#从excel文件中读取
wb=load\u工作簿('python\u excel\u read.xlsx')
表1=wb[“表1”]
输入=np.0(形状=(行大小,numColumns))
out=np.零(行大小)
对于范围内的i(0,行大小):
对于范围内的j(0,numColumns):
输入[i,j]=表1.单元格(行=i+1,列=j+1).值
out[i]=表1.单元格(行=i+1,列=numColumns+1).值
输出=np.0(形状=(行大小,1))
对于范围内的i(0,行大小):
输出[i]=[out[i]]
X=输入
#输出
y=输出
打印(X)
打印(y)
y[y<500]=0
y[np.logical_and(y >= 500, y <= 1000)] = 1
y[np.logical_and(y > 1000, y <= 1200)] = 2
y[y > 1200] = 3
#使用交叉验证
#kf=KFold(n_分割=10,随机状态=0)
loo=发酵剂()
#尝试不同的模式
clf=svm.SVC()
scaler=StandardScaler()
管道=管道([('scaler',scaler),('svc',clf)])
准确度=交叉评分(管道、X、y.拉威尔(),cv=loo,评分=“准确度”)
打印(精度.mean())
#y_pred=交叉值预测(clf,X,y.ravel(),cv=kf)
#cm=混淆矩阵(y,y\u pred)
这是最新的数据,我可以添加尽可能多的功能。请注意,这是来自完整数据的随机样本:

电流输出: 0.6230954290296712


我的最终目标是达到90%或更高的准确率……我不相信我能找到更多的功能,但如果有用的话,我会继续收集尽可能多的功能。您的问题非常笼统,但我有一些建议。您可以使用
交叉验证
并尝试不同的模型。我个人会尝试
SVR
随机森林
作为最后一个选择,我将使用一个
MLPR

我对您的代码进行了一些修改,以显示一个简单的示例:

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut
import pandas as pd
from sklearn.decomposition import PCA

# read the data: every column except the last is a feature, the last column
# is the attendance target
df = pd.read_excel('python_excel_read.xlsx', header = None)
rows, cols = df.shape

X = df.iloc[: , 0:(cols - 1)]
# BUG FIX: take an explicit copy — df.iloc[...] yields a view, and the
# in-place binning below would otherwise hit pandas' SettingWithCopy
# behavior and write back into `df`.
y = df.iloc[: , cols - 1 ].copy()
print(X.shape)
print(y.shape)

# Discretize attendance into 4 classes; ascending order keeps each mask
# from re-matching already-assigned labels (labels 0..3 are all < 500).
y[y < 500] = 0
y[np.logical_and(y >= 500, y <= 1000)] = 1
y[np.logical_and(y > 1000, y <= 1200)] = 2
y[y > 1200] = 3
print(np.unique(y))

# We can apply PCA to reduce the dimensions of the data
# pca = PCA(n_components=2)
# pca.fit(X)
# X = pca.fit_transform(X)

# Use cross-validation
#kf = KFold(n_splits = 10, random_state=0)
loo = LeaveOneOut()
# Try different models
clf = svm.SVC(kernel = 'linear')
scaler = StandardScaler()
# Scaling inside the pipeline prevents train/test leakage across CV folds.
pipe = Pipeline([('scaler', scaler), ('svc', clf)])

# BUG FIX: Series.ravel() is deprecated and removed in modern pandas;
# convert to a flat NumPy array explicitly instead.
accuracy = cross_val_score(pipe, X, y.to_numpy().ravel(), cv = loo, scoring = "accuracy")
print(accuracy.mean())

#y_pred = cross_val_predict(clf, X, y.ravel(), cv = kf)
#cm = confusion_matrix(y, y_pred)
将numpy导入为np
从sklearn.preprocessing导入StandardScaler
从sklearn.metrics导入分类报告、混淆矩阵
将matplotlib.pyplot作为plt导入
从sklearn.linear\u模型导入线性回归
从sklearn.model_选择导入KFold
从sklearn.model\u选择导入交叉值\u预测
从sk学习输入svm
从sklearn.model_选择导入交叉值_分数
从sklearn.metrics导入混淆矩阵
从sklearn.preprocessing导入StandardScaler
从sklearn.pipeline导入管道
从sklearn.model_选择导入LeaveOnOut
作为pd进口熊猫
从sklearn.decomposition导入PCA
#读取数据
df=pd.read\u excel('python\u excel\u read.xlsx',header=None)
行,cols=df.shape
X=df.iloc[:,0:(cols-1)]
y=df.iloc[:,cols-1]
打印(X.shape)
打印(y形)
y[y<500]=0
y[np.logical_and(y >= 500, y <= 1000)] = 1
y[np.logical_and(y > 1000, y <= 1200)] = 2
y[y > 1200] = 3
打印(np.唯一(y))
#我们可以应用PCA来降低数据的维数
#pca=pca(n_分量=2)
#pca.fit(X)
#X=pca.fit_变换(X)
#使用交叉验证
#kf=KFold(n_分割=10,随机状态=0)
loo=发酵剂()
#尝试不同的模式
clf=svm.SVC(内核=‘线性’)
scaler=StandardScaler()
管道=管道([('scaler',scaler),('svc',clf)])
准确度=交叉评分(管道、X、y.拉威尔(),cv=loo,评分=“准确度”)
打印(精度.mean())
#y_pred=交叉值预测(clf,X,y.ravel(),cv=kf)
#cm=混淆矩阵(y,y\u pred)

这个任务可能很适合使用线性回归,什么使你做神经网络?这似乎是学习它们的好机会。而且,我假设这不是线性的,但自从我的统计课已经2年了,也许我误解了it@ChrisM你能添加
python\u excel\u read.xlsx
数据吗?然后我可以提供一个如何提高预测性能的答案。@seralouk我在问题中添加了更多数据,并上传了我最新的代码/结果。我不确定如何上传文件,因为SO没有文件托管服务。我想我可以使用一个,并在需要时提供完美的链接。你的目标是预测输出。你尝试过使用任何东西吗lse期待MLPREGESSOR?我试过了,按照你的备忘单(非常有用的顺便说一句),我也试过rigidregression、ensembleregression和lasso。所有的
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import (KFold, LeaveOneOut, cross_val_predict,
                                     cross_val_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the spreadsheet; the final column holds the attendance target and
# everything before it is a feature.
frame = pd.read_excel('python_excel_read.xlsx', header = None)
n_rows, n_cols = frame.shape

X = frame.iloc[:, :n_cols - 1]
y = frame.iloc[:, n_cols - 1]
print(X.shape)
print(y.shape)

# Bucket attendance counts into four ordinal classes, lowest bin first.
y[y < 500] = 0
y[np.logical_and(y >= 500, y <= 1000)] = 1
y[np.logical_and(y > 1000, y <= 1200)] = 2
y[y > 1200] = 3
print(np.unique(y))

# Optional dimensionality reduction, currently disabled.
# pca = PCA(n_components=2)
# pca.fit(X)
# X = pca.fit_transform(X)

# Leave-one-out cross-validation over a scale-then-classify pipeline.
#kf = KFold(n_splits = 10, random_state=0)
splitter = LeaveOneOut()
# Linear-kernel SVM as the candidate model.
model = svm.SVC(kernel = 'linear')
pipeline = Pipeline([('scaler', StandardScaler()), ('svc', model)])

scores = cross_val_score(pipeline, X, y.ravel(), cv = splitter, scoring = "accuracy")
print(scores.mean())

#y_pred = cross_val_predict(clf, X, y.ravel(), cv = kf)
#cm = confusion_matrix(y, y_pred)