Deep learning gradient accumulation problem

Below is the code in which I am trying to run gradient accumulation:

import torch
import torch.nn as nn
import numpy as np
from DataInitialization import initialize_data
from Architecture import BlackScholesMertonModel1


torch.manual_seed(123)
exercise_price = 100
sigma = 0.4
r = 0.03
dividend = 0.07
tau = 3

N_b = 100
N_init = 100
N_f = 30000

lb = [0, 0]
ub = [500, tau]

X_f, f_collocation, u_collocation, X_b, u_boundary, X_init, u_init = initialize_data(N_b, N_init, N_f, lb, ub)

# initializing the pde solver
model1 = BlackScholesMertonModel1()

# Original weight initialization didn't change the weights.
# Initialize the weights
for m in model1.modules():
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight, gain=1.0)
        torch.nn.init.constant_(m.bias, 0)

# perform backprop
MAX_EPOCHS_1 = int(210)
LRATE = 8e-3

# use Adam for training
optimizer = torch.optim.Adam(model1.parameters(), lr=LRATE, eps=1e-7)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 140, gamma=0.5, last_epoch=-1, verbose=False)
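# note that scheduler.step() is never called in the training loop below,
# so this schedule never actually reduces the learning rate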

# the collocation inputs need gradients so that torch.autograd.grad
# can differentiate the network output with respect to them
X_f.requires_grad = True

mini_batch_size = 32

# send everything to the GPU, if one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
X_f = X_f.to(device)
X_b = X_b.to(device)
X_init = X_init.to(device)
model1 = model1.to(device)
u_boundary = u_boundary.to(device)
u_init = u_init.to(device)
u_collocation = u_collocation.to(device)
f_collocation = f_collocation.to(device)

# gradients are accumulated over this many samples before each optimizer step
accumulation_steps = 10

# per-epoch loss histories
loss_history_f_1 = []
loss_history_boundary_1 = []
loss_history_init_1 = []
loss_history_function_1 = []

for epoch in range(MAX_EPOCHS_1):
    # boundary loss
    u_b_pred = model1(X_b)
    mse_u_b = torch.nn.MSELoss()(u_b_pred, u_boundary)

    # initial time loss
    u_init_pred = model1(X_init)
    mse_u_init = torch.nn.MSELoss()(u_init_pred, u_init)
    
    # accumulate gradients over individual collocation points
    for i in range(len(X_f)):
        instance = X_f[i, :].clone()

        u_pred = model1(instance)

        # first partial derivatives of u with respect to the inputs
        u_pred_first_partials = torch.autograd.grad(u_pred.sum(), instance, allow_unused=True, retain_graph=True)[0].clone()
        u_pred_dt = u_pred_first_partials[0].clone()
        u_pred_ds = u_pred_first_partials[1].clone()

        f_pred = u_pred_dt

        f_true = f_collocation[i] / accumulation_steps

        mse_f = 100 * torch.nn.MSELoss()(f_pred, f_true)
        loss = mse_f + mse_u_init + mse_u_b
        loss.backward(retain_graph=True)
        # step the optimizer once every accumulation_steps samples
        if ((i + 1) % accumulation_steps) == 0:
            optimizer.step()
            optimizer.zero_grad()
            
    # full-batch evaluation of the residual and function losses for monitoring
    u_pred = model1(X_f)

    u_pred_first_partials = torch.autograd.grad(u_pred.sum(), X_f, create_graph=True, allow_unused=True)[0]
    u_pred_dt = u_pred_first_partials[:, 0:1]
    u_pred_ds = u_pred_first_partials[:, 1:2]

    f_pred = u_pred_dt
    
    mse_f = torch.nn.MSELoss()(f_pred, f_collocation)
    mse_function = torch.nn.MSELoss()(u_pred, u_collocation)
    
    # store scalar values so the computation graphs are not retained across epochs
    loss_history_f_1.append(mse_f.item())
    loss_history_boundary_1.append(mse_u_b.item())
    loss_history_init_1.append(mse_u_init.item())
    loss_history_function_1.append(mse_function.item())

    if (epoch % 10) == 0:
        print("- - - - - - - - - - - - - - -")
        print("Epoch : ", epoch)
        print(f"Loss Residual:\t{loss_history_f_1[-1]:.4f}")
        print(f"Loss Boundary:\t{loss_history_boundary_1[-1]:.4f}")
        print(f"Loss Initial Time:\t{loss_history_init_1[-1]:.4f}")
        print(f"Loss Function:\t{loss_history_function_1[-1]:.4f}")
print("----------------------------------------------------------")
The code produces the error shown below:

> RuntimeError: one of the variables needed for gradient computation has
> been modified by an inplace operation: [torch.cuda.FloatTensor [50,
> 1]], which is output 0 of TBackward, is at version 3; expected version
> 2 instead. Hint: enable anomaly detection to find the operation that
> failed to compute its gradient, with
> torch.autograd.set_detect_anomaly(True).

I am not sure exactly what is causing this problem. I know it has something to do with how variables are modified (which is why I tried using .clone()), but that had no effect. However, I am having trouble finding the in-place operation responsible, because no values are being updated; they are only used to compute the loss.
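Following the hint at the end of the error message, the most direct way to locate the offending operation is PyTorch's anomaly mode, which records where each forward operation was created and reports that location when the corresponding backward computation fails. A minimal sketch, enabled once before the training loop:

import torch

# record forward-pass stack traces so that a failing backward op can be
# traced back to the line that created it (debugging only: this is slow)
torch.autograd.set_detect_anomaly(True)

With this enabled, the RuntimeError above comes with a second traceback pointing at the forward operation whose saved tensor was modified in place.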

You should post the entire stack trace; that would make it much easier to see where the error is coming from.
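For reference, the usual gradient-accumulation pattern divides the loss by the number of accumulation steps (rather than the target, as the code above does with f_true) and rebuilds the graph on every forward pass, so backward() needs no retain_graph=True. A minimal sketch, with the model, optimizer, and data as hypothetical stand-ins:

import torch
import torch.nn as nn

# hypothetical stand-ins for the real model, optimizer, and data
model = nn.Linear(2, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()
xs = torch.randn(100, 2)
ys = torch.randn(100, 1)

accumulation_steps = 10
mini_batch_size = 10

optimizer.zero_grad()
for i in range(0, len(xs), mini_batch_size):
    # scale the loss, not the target: the summed gradients then equal the
    # gradient of the average loss over the accumulated mini-batches
    loss = criterion(model(xs[i:i + mini_batch_size]), ys[i:i + mini_batch_size])
    (loss / accumulation_steps).backward()
    # each forward pass builds a fresh graph, so no retain_graph is needed
    if (i // mini_batch_size + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

Because no graph built before an optimizer.step() is ever reused in a later backward() here, the in-place parameter updates cannot invalidate saved tensors, which is a common trigger for the kind of error shown above.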