Deep learning PyTorch: network doesn't learn at all + weights are too low

About the input (sorry about the formatting): for every two lines below, the first line lists the column names and the second line lists the values. 18~20_ride is the label and is not included in the input. Below is one sample; the training set consists of 400,000 rows like this.

bus_route_id    station_code    latitude    longitude   6~7_ride    
0               4270000         344         33.48990    126.49373
7~8_ride    8~9_ride    9~10_ride   10~11_ride  11~12_ride  6~7_takeoff  
0.0         1.0         2.0         5.0         2.0         6.0
7~8_takeoff 8~9_takeoff 9~10_takeoff    10~11_takeoff   11~12_takeoff    
0.0         0.0         0.0             0.0             0.0 
18~20_ride  weekday dis_jejusi  dis_seoquipo            
0.0         6       2.954920    26.256744
Example weights: captured at the 4th epoch. After training for 20 epochs I get far smaller values (e.g. -7e-44 or 1e-55).
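
(The weight snapshot itself was an image in the original post. For reference, a minimal sketch of how such weights can be inspected, assuming a model instance named net; this snippet is not from the original:)

# Hypothetical inspection snippet: print the mean absolute value of every
# parameter tensor to spot weights collapsing toward zero (e.g. 1e-44 scale).
for name, param in net.named_parameters():
    print("{}: mean |w| = {:.3e}".format(name, param.detach().abs().mean().item()))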

Model predictions vs. targets

#Target
[2.],
[0.],
[0.]

#Prediction
[1.4187],
[1.4187],
[1.4187]
MyDataset.py

from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import torch
import os

class MyDataset(Dataset):
  def __init__(self, csv_filename):
    # Load the CSV, then split off the "18~20_ride" column as the label
    self.dataset = pd.read_csv(csv_filename, index_col=0)
    self.labels = self.dataset.pop("18~20_ride")
    # Keep features as a NumPy array; reshape labels to (N, 1) for MSELoss
    self.dataset = self.dataset.values
    self.labels = np.reshape(self.labels.values, (-1, 1))

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    return self.dataset[idx], self.labels[idx]
Model
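
The model definition itself was posted as an image and is not reproduced here. Based on how it is used below (Network(19), with batch normalization and ReLU mentioned among the experiments), a minimal sketch of what it might have looked like; this reconstruction is hypothetical:

import torch.nn as nn

# Hypothetical reconstruction: a small fully connected regressor taking
# 19 input features and producing a single output, matching Network(19).
class Network(nn.Module):
    def __init__(self, input_num):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_num, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 1),  # single regression output
        )

    def forward(self, x):
        return self.layers(x)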

Training and validation

def train(model, device, train_loader, optimizer, loss_fn, log_interval, epoch):
  print("Training")
  model.train()
  for batch_idx, (data, target) in enumerate(train_loader):
    data, target = data.float().to(device), target.float().to(device)
    optimizer.zero_grad()
    output = model(data)
    loss = loss_fn(output, target)
    loss.backward()
    optimizer.step()
    if batch_idx % log_interval == 0:
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, (batch_idx+1) * len(data), len(train_loader.dataset),
            100. * batch_idx / len(train_loader), loss.item()))

def validate(model, device, loader, loss_fn):
  print("\nValidating")
  model.eval()
  test_loss = 0
  correct = 0
  with torch.no_grad():
    for batch_idx, (data, target) in enumerate(loader):
      data, target = data.float().to(device), target.float().to(device)
      output = model(data)
      test_loss += loss_fn(output, target).item()  # sum up batch loss

  test_loss /= len(loader)

  print('Validation average loss: {:.4f}\n'.format(
      test_loss))
  return test_loss
import torch
import torch.optim as optim  # needed for optim.AdamW below
from MyDataset import MyDataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from datetime import datetime

train_dataset_path = "/content/drive/My Drive/root/bus/dataset/train.csv"
val_dataset_path = "/content/drive/My Drive/root/bus/dataset/val.csv"
model_base_path = "/content/drive/My Drive/root/bus/models/"

model_file = "/content/drive/My Drive/root/bus/models/checkpoints/1574427776.202017.pt"

"""
Training Config
"""
epochs = 10
batch_size = 32
learning_rate = 0.5

check_interval = 4

log_interval = int(40000/batch_size)
gamma = 0.1

load_model = False
save_model = True
make_checkpoint = True
"""
End of config
"""

# Read train and validation sets
train_set = MyDataset(train_dataset_path)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_set = MyDataset(val_dataset_path)
val_loader = DataLoader(val_set, batch_size=1)
print("Data READY")

device = torch.device("cuda")
net = Network(19).float().to(device)
if load_model:
  net.load_state_dict(torch.load(model_file))
loss_fn = torch.nn.MSELoss()
optimizer = optim.AdamW(net.parameters(), lr=learning_rate)

best_loss = float('inf')
isAbort = False
for epoch in range(1, epochs+1):
  train(net, device, train_loader, optimizer, loss_fn, log_interval, epoch)
  val_loss = validate(net, device, val_loader, loss_fn)
  if epoch%check_interval==0:
    if make_checkpoint:
      print("Saving new checkpoint")
      torch.save(net.state_dict(), model_base_path+"checkpoints/"+str(datetime.today().timestamp())+".pt")
      """
  if val_loss < best_loss and epoch%check_interval==0:
    best_loss = val_loss
    if make_checkpoint:
      print("Saving new checkpoint")
      torch.save(net.state_dict(), model_base_path+"checkpoints/"+str(datetime.today().timestamp())+".pt")
  else:
    print("Model overfit detected. Aborting training")
    isAbort = True
    break
    """
if save_model and not isAbort:
    torch.save(net.state_dict(), model_base_path+"finals/"+str(datetime.today().timestamp())+".pt")
So I'm trying to train a fully connected model for a regression problem on Google Colab, but it isn't training well: the loss doesn't decrease at all. Digging deeper, I found that the weights are really small. Any idea why this happens, and how can I avoid it? Thanks a lot. I use MSE for the loss and the AdamW optimizer. Here is what I have tried:

  • Tried other architectures (changing the number and size of layers, changing activation functions: ReLU, GELU), but the loss didn't decrease
  • Tried changing the learning rate from 3e-1 down to 1e-3; even tried 1
  • Tried other preprocessing of the data (using day/month/year instead of weekday)
  • Included the label itself in the input data as a sanity check, but the loss still didn't decrease
  • Tried different batch sizes (4, 10, 32, 64)
  • Removed batch normalization
  • Tried other optimizers such as SGD and Adam
  • Trained for 20 epochs, but the loss didn't decrease
  • The weights do change as the loss changes
  • TL;DR: the input data was invalid!! Check for NaN or NULL values (see the sketch after this list)
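
A single NaN in the input is enough to poison the model: it propagates through the forward pass into the loss, and one backward() call then turns the gradients, and with them the weights after optimizer.step(), into NaN or garbage. A minimal sketch illustrating the mechanism (not code from the original post):

import torch
import torch.nn as nn

# One corrupted feature value is enough to break training.
model = nn.Linear(3, 1)
x = torch.tensor([[1.0, float("nan"), 2.0]])  # one NaN in the inputs
y = torch.tensor([[0.0]])

loss = nn.MSELoss()(model(x), y)
print(loss)               # tensor(nan, grad_fn=...)
loss.backward()
print(model.weight.grad)  # all NaN: the next optimizer.step() corrupts the weights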


    Well, the problem has been solved. I had tried almost everything and suspected I had messed up the project setup, so I deleted the project and tried again: same result. I deleted it once more and migrated to TF2: same result! That told me the installation wasn't the problem, so I looked elsewhere. Finally I found the cause: I had modified the input columns myself (removing some highly correlated features), so they were not the originals. During that modification I corrupted some float values and ended up with NaN values in the data. So check whether your dataset contains invalid values.
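
    A quick way to run that check on the CSVs used above (a sketch, not code from the original answer):

    import numpy as np
    import pandas as pd

    # Count NaN/NULL values per column; any nonzero count means corrupted rows.
    df = pd.read_csv("/content/drive/My Drive/root/bus/dataset/train.csv", index_col=0)
    print(df.isna().sum())

    # Also catch non-finite floats (inf/-inf), which isna() does not report.
    numeric = df.select_dtypes(include=[np.number])
    print((~np.isfinite(numeric)).sum())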

    This is quite unexpected. Could you share more details? For example, when did you check these weights, before training or partway through training? Those details will help us narrow down your problem. @ShagunSodhani The loss wasn't decreasing, so I decided to stop training and see what was going on. The weights were captured after training for 4 epochs. How many data samples do you have? What does your class distribution look like? Did you train for more than 4 epochs? What does "other data preprocessing" include? What are the current steps? Which different architectures have you tried? Please review and include all the necessary information, including data samples. @Denninger I have added more information. Thanks a lot.