Python SKNN回归考勤预测
我之前问过一个关于同一个问题的问题,但是因为我的方法改变了,我现在有了不同的问题 我当前的代码:Python SKNN回归考勤预测,python,numpy,scikit-learn,neural-network,prediction,Python,Numpy,Scikit Learn,Neural Network,Prediction,我之前问过一个关于同一个问题的问题,但是因为我的方法改变了,我现在有了不同的问题 我当前的代码: from sklearn import preprocessing from openpyxl import load_workbook import numpy as np from numpy import exp, array, random, dot from sklearn.model_selection import train_test_split from sklearn.neura
from sklearn import preprocessing
from openpyxl import load_workbook
import numpy as np
from numpy import exp, array, random, dot
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Number of data rows and feature columns read from the spreadsheet.
rowSize = 200
numColumns = 4

# Read the raw columns from the Excel sheet (openpyxl cells are 1-indexed).
wb = load_workbook('python_excel_read.xlsx')
sheet_1 = wb["Sheet1"]
date = np.zeros(rowSize)
day = np.zeros(rowSize)
rain = np.zeros(rowSize)
temp = np.zeros(rowSize)
out = np.zeros(rowSize)
for i in range(rowSize):
    date[i] = sheet_1.cell(row=i + 1, column=1).value
    day[i] = sheet_1.cell(row=i + 1, column=2).value
    rain[i] = sheet_1.cell(row=i + 1, column=3).value
    temp[i] = sheet_1.cell(row=i + 1, column=4).value
    out[i] = sheet_1.cell(row=i + 1, column=5).value

# Assemble the (rowSize, numColumns) feature matrix and (rowSize, 1) target.
train = np.zeros(shape=(rowSize, numColumns))
t_o = np.zeros(shape=(rowSize, 1))
for i in range(rowSize):
    train[i] = [date[i], day[i], rain[i], temp[i]]
    t_o[i] = [out[i]]

X = train
# Output: attendance (regression target).
y = t_o
X_train, X_test, y_train, y_test = train_test_split(X, y)

#### Neural Net
nn = MLPRegressor(
    hidden_layer_sizes=(3,), activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=10000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
nn.fit(X_train, y_train.ravel())
y_pred = nn.predict(X_test)

### Linear Regression (kept for comparison)
# lm = LinearRegression()
# lm.fit(X_train, y_train)
# y_pred = lm.predict(X_test)

# Plot predictions vs. real attendance against day-of-year (column 0).
# BUG FIX: the legend labels were swapped (predictions were labelled
# 'real' and vice versa), and legend() was never called so no legend
# was drawn at all.
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.scatter(X_test[:, 0], y_pred, s=1, c='b', marker="s", label='NN Prediction')
ax1.scatter(X_test[:, 0], y_test, s=10, c='r', marker="o", label='real')
ax1.legend()
plt.show()

# Calc MSE.
# BUG FIX: y_test has shape (n, 1) while y_pred has shape (n,); the
# original `y_test - y_pred` broadcast to an (n, n) matrix, so the
# printed "MSE" was the mean over n*n cross-differences, not the MSE.
# Flatten y_test so the subtraction is element-wise.
mse = np.square(y_test.ravel() - y_pred).mean()
print(mse)
由此得出的结果表明,对测试数据的预测相当糟糕。因为我是新手,我不确定这是我的数据、模型还是我的编码。根据图,我认为模型对数据是错误的(模型似乎预测了接近线性或平方的东西,而实际数据似乎分散得多)
以下是一些数据点:
格式为一年中的某一天(2是1月2日)、工作日(1)/周末(0)、有雨(1)/无雨(0)、F中的温度、出勤率(这是输出)
我总共有一千多个数据点,但我并没有将它们全部用于培训/测试。一个想法是我需要更多,另一个想法是我需要更多的因素,因为温度/降雨/一周中的某一天对出勤率的影响不够
图为:
我该怎么做才能使我的模型更准确,并给出更好的预测
谢谢
编辑:我添加了更多的数据点和另一个因素。我似乎无法上传excel文件,所以我把数据放在这里,更好地解释了它的格式
编辑:
以下是最新的代码:
from sklearn import preprocessing
from openpyxl import load_workbook
import numpy as np
from numpy import exp, array, random, dot
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut

# Number of data rows and feature columns read from the spreadsheet.
rowSize = 500
numColumns = 254

# Read the feature matrix and the attendance column from the Excel sheet.
wb = load_workbook('python_excel_read.xlsx')
sheet_1 = wb["Sheet1"]
# BUG FIX: the feature array was named `input`, shadowing the builtin.
features = np.zeros(shape=(rowSize, numColumns))
out = np.zeros(rowSize)
for i in range(rowSize):
    for j in range(numColumns):
        features[i, j] = sheet_1.cell(row=i + 1, column=j + 1).value
    out[i] = sheet_1.cell(row=i + 1, column=numColumns + 1).value

# (rowSize, 1) column vector of the target; reshape+copy replaces the
# original element-by-element copy loop with identical result.
output = out.reshape(rowSize, 1).copy()

X = features
# Output
y = output
print(X)
print(y)

# Bin raw attendance into 4 ordinal classes, in place.
# NOTE: the order of these assignments matters — it only works because
# every label written (0..3) is < 500, so later conditions (all on
# values >= 500) never re-match already-labelled rows.
y[y < 500] = 0
y[np.logical_and(y >= 500, y <= 1000)] = 1
y[np.logical_and(y > 1000, y <= 1200)] = 2
y[y > 1200] = 3

# Use leave-one-out cross-validation to estimate accuracy.
#kf = KFold(n_splits = 10, random_state=0)
loo = LeaveOneOut()
# Try different models: SVC inside a pipeline so scaling is fit per fold.
clf = svm.SVC()
scaler = StandardScaler()
pipe = Pipeline([('scaler', scaler), ('svc', clf)])
accuracy = cross_val_score(pipe, X, y.ravel(), cv=loo, scoring="accuracy")
print(accuracy.mean())
#y_pred = cross_val_predict(clf, X, y.ravel(), cv = kf)
#cm = confusion_matrix(y, y_pred)
从sklearn导入预处理
从openpyxl导入加载工作簿
将numpy作为np导入
从numpy导入exp、数组、随机、点
从sklearn.model\u选择导入列车\u测试\u拆分
从sklearn.neural_网络导入MLPREGESSOR
从sklearn.preprocessing导入StandardScaler
从sklearn.metrics导入分类报告、混淆矩阵
将matplotlib.pyplot作为plt导入
从sklearn.linear\u模型导入线性回归
从sklearn.model_选择导入KFold
从sklearn.model\u选择导入交叉值\u预测
从sk学习输入svm
从sklearn.model_选择导入交叉值_分数
从sklearn.metrics导入混淆矩阵
从sklearn.preprocessing导入StandardScaler
从sklearn.pipeline导入管道
从sklearn.model_选择导入LeaveOnOut
#设定尺寸
行大小=500
numColumns=254
#从excel文件中读取
wb=load\u工作簿('python\u excel\u read.xlsx')
表1=wb[“表1”]
输入=np.0(形状=(行大小,numColumns))
out=np.零(行大小)
对于范围内的i(0,行大小):
对于范围内的j(0,numColumns):
输入[i,j]=表1.单元格(行=i+1,列=j+1).值
out[i]=表1.单元格(行=i+1,列=numColumns+1).值
输出=np.0(形状=(行大小,1))
对于范围内的i(0,行大小):
输出[i]=[out[i]]
X=输入
#输出
y=输出
打印(X)
打印(y)
y[y<500]=0
y[np.logical_and(y>=500, y<=1000)]=1
y[np.logical_and(y>1000, y<=1200)]=2
y[y>1200]=3
#使用交叉验证
#kf=KFold(n_分割=10,随机状态=0)
loo=发酵剂()
#尝试不同的模式
clf=svm.SVC()
scaler=StandardScaler()
管道=管道([('scaler',scaler),('svc',clf)])
准确度=交叉评分(管道、X、y.拉威尔(),cv=loo,评分=“准确度”)
打印(精度.mean())
#y_pred=交叉值预测(clf,X,y.ravel(),cv=kf)
#cm=混淆矩阵(y,y\u pred)
这是最新的数据,我可以添加尽可能多的功能。请注意,这是来自完整数据的随机样本:
电流输出:
0.6230954290296712
我的最终目标是达到90%或更高的准确率……我不相信我能找到更多的功能,但如果有用的话,我会继续收集尽可能多的功能。您的问题非常笼统,但我有一些建议。您可以使用
交叉验证并尝试不同的模型。我个人会尝试SVR
,随机森林
作为最后一个选择,我将使用一个MLPR
我对您的代码进行了一些修改,以显示一个简单的示例:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut
import pandas as pd
from sklearn.decomposition import PCA

# Read the data: all columns but the last are features, the last column
# is the attendance target.
df = pd.read_excel('python_excel_read.xlsx', header=None)
rows, cols = df.shape
X = df.iloc[:, 0:(cols - 1)]
# BUG FIX: take an explicit copy — mutating a slice of df in place is
# chained assignment (SettingWithCopyWarning, undefined whether df is
# also modified).
y = df.iloc[:, cols - 1].copy()
print(X.shape)
print(y.shape)

# Bin raw attendance into 4 ordinal classes. np.select evaluates all
# conditions against the ORIGINAL values in one pass, replacing the
# original sequence of in-place overwrites whose correctness silently
# depended on assignment order.
y = pd.Series(
    np.select(
        [y < 500,
         (y >= 500) & (y <= 1000),
         (y > 1000) & (y <= 1200),
         y > 1200],
        [0, 1, 2, 3]),
    index=y.index)
print(np.unique(y))

# We can apply PCA to reduce the dimensions of the data
# pca = PCA(n_components=2)
# pca.fit(X)
# X = pca.fit_transform(X)

# Use cross-validation
#kf = KFold(n_splits = 10, random_state=0)
loo = LeaveOneOut()
# Try different models: linear-kernel SVC in a pipeline so scaling is
# fit per fold.
clf = svm.SVC(kernel='linear')
scaler = StandardScaler()
pipe = Pipeline([('scaler', scaler), ('svc', clf)])
# BUG FIX: Series.ravel() is deprecated (removed in pandas 3.0);
# convert to a NumPy array explicitly.
accuracy = cross_val_score(pipe, X, y.to_numpy(), cv=loo, scoring="accuracy")
print(accuracy.mean())
#y_pred = cross_val_predict(clf, X, y.ravel(), cv = kf)
#cm = confusion_matrix(y, y_pred)
将numpy导入为np
从sklearn.preprocessing导入StandardScaler
从sklearn.metrics导入分类报告、混淆矩阵
将matplotlib.pyplot作为plt导入
从sklearn.linear\u模型导入线性回归
从sklearn.model_选择导入KFold
从sklearn.model\u选择导入交叉值\u预测
从sk学习输入svm
从sklearn.model_选择导入交叉值_分数
从sklearn.metrics导入混淆矩阵
从sklearn.preprocessing导入StandardScaler
从sklearn.pipeline导入管道
从sklearn.model_选择导入LeaveOnOut
作为pd进口熊猫
从sklearn.decomposition导入PCA
#读取数据
df=pd.read\u excel('python\u excel\u read.xlsx',header=None)
行,cols=df.shape
X=df.iloc[:,0:(cols-1)]
y=df.iloc[:,cols-1]
打印(X.shape)
打印(y形)
y[y<500]=0
y[np.logical_and(y>=500, y<=1000)]=1
y[np.logical_and(y>1000, y<=1200)]=2
y[y>1200]=3
打印(np.唯一(y))
#我们可以应用PCA来降低数据的维数
#pca=pca(n_分量=2)
#pca.fit(X)
#X=pca.fit_变换(X)
#使用交叉验证
#kf=KFold(n_分割=10,随机状态=0)
loo=发酵剂()
#尝试不同的模式
clf=svm.SVC(内核=‘线性’)
scaler=StandardScaler()
管道=管道([('scaler',scaler),('svc',clf)])
准确度=交叉评分(管道、X、y.拉威尔(),cv=loo,评分=“准确度”)
打印(精度.mean())
#y_pred=交叉值预测(clf,X,y.ravel(),cv=kf)
#cm=混淆矩阵(y,y\u pred)
这个任务可能很适合使用线性回归,什么使你做神经网络?这似乎是学习它们的好机会。而且,我假设这不是线性的,但自从我的统计课已经2年了,也许我误解了it@ChrisM你能添加python\u excel\u read.xlsx
数据吗?然后我可以提供一个如何提高预测性能的答案。@seralouk我在问题中添加了更多数据,并上传了我最新的代码/结果。我不确定如何上传文件,因为SO没有文件托管服务。我想我可以使用一个,并在需要时提供完美的链接。你的目标是预测输出。你尝试过使用任何东西吗lse期待MLPREGESSOR?我试过了,按照你的备忘单(非常有用的顺便说一句),我也试过rigidregression、ensembleregression和lasso。所有的
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut
import pandas as pd
from sklearn.decomposition import PCA

# Load the spreadsheet; every column except the last is a feature and
# the final column holds the attendance figure we want to classify.
df = pd.read_excel('python_excel_read.xlsx', header=None)
rows, cols = df.shape
X = df.iloc[:, 0:(cols - 1)]
# BUG FIX: copy the target column so the relabelling below operates on
# an independent Series instead of a view of df (avoids pandas'
# chained-assignment warning and any accidental mutation of df).
y = df.iloc[:, cols - 1].copy()
print(X.shape)
print(y.shape)

# Map attendance counts to 4 ordinal class labels in a single pass.
# BUG FIX (robustness): the original rewrote y in place, condition by
# condition, which only worked because the labels happened to fall
# outside later conditions; np.select classifies against the original
# values so the bins cannot interfere with one another.
conditions = [
    y < 500,
    (y >= 500) & (y <= 1000),
    (y > 1000) & (y <= 1200),
    y > 1200,
]
y = pd.Series(np.select(conditions, [0, 1, 2, 3]), index=y.index)
print(np.unique(y))

# We can apply PCA to reduce the dimensions of the data
# pca = PCA(n_components=2)
# pca.fit(X)
# X = pca.fit_transform(X)

# Use cross-validation
#kf = KFold(n_splits = 10, random_state=0)
loo = LeaveOneOut()
# Try different models: linear-kernel SVC, with StandardScaler fitted
# inside each CV fold via the pipeline.
clf = svm.SVC(kernel='linear')
scaler = StandardScaler()
pipe = Pipeline([('scaler', scaler), ('svc', clf)])
# BUG FIX: pandas removed Series.ravel(); pass a NumPy array instead.
accuracy = cross_val_score(pipe, X, y.to_numpy(), cv=loo, scoring="accuracy")
print(accuracy.mean())
#y_pred = cross_val_predict(clf, X, y.ravel(), cv = kf)
#cm = confusion_matrix(y, y_pred)