Python 运行时错误:CUDA内存不足:无法训练SEGAN

Python 运行时错误:CUDA内存不足:无法训练SEGAN,python,deep-learning,neural-network,pytorch,Python,Deep Learning,Neural Network,Pytorch,我目前正在尝试运行SEGAN进行语音增强,但似乎无法让网络开始训练,因为它运行以下错误: 运行时错误:CUDA内存不足:尝试分配30.00个MiB(GPU 0;3.00 GiB总容量;2.00 GiB已分配;5.91个MiB空闲;PyTorch总共保留2.03 GiB 我已经尝试包含torch.cuda.empty_cache(),但这似乎并没有解决问题 这是我当前正在运行的脚本 import argparse import os import torch import torch.nn as

我目前正在尝试运行SEGAN进行语音增强,但似乎无法让网络开始训练,因为它运行以下错误:

运行时错误:CUDA内存不足:尝试分配30.00个MiB(GPU 0;3.00 GiB总容量;2.00 GiB已分配;5.91个MiB空闲;PyTorch总共保留2.03 GiB

我已经尝试包含torch.cuda.empty_cache(),但这似乎并没有解决问题

这是我当前正在运行的脚本

import argparse
import os

import torch
import torch.nn as nn
from scipy.io import wavfile
from torch import optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from tqdm import tqdm

from data_preprocess import sample_rate
from model import Generator, Discriminator
from utils import AudioDataset, emphasis

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train Audio Enhancement')
    parser.add_argument('--batch_size', default=50, type=int, help='train batch size')
    parser.add_argument('--num_epochs', default=86, type=int, help='train epochs number')

    opt = parser.parse_args()
    BATCH_SIZE = opt.batch_size
    NUM_EPOCHS = opt.num_epochs

    # load data
    torch.cuda.empty_cache()

    print('loading data...')
    train_dataset = AudioDataset(data_type='train')
    test_dataset = AudioDataset(data_type='test')
    train_data_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    test_data_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
    # generate reference batch
    ref_batch = train_dataset.reference_batch(BATCH_SIZE)

    # create D and G instances
    discriminator = Discriminator()
    generator = Generator()
    if torch.cuda.is_available():
        discriminator.cuda()
        generator.cuda()
        ref_batch = ref_batch.cuda()
    ref_batch = Variable(ref_batch)
    print("# generator parameters:", sum(param.numel() for param in generator.parameters()))
    print("# discriminator parameters:", sum(param.numel() for param in discriminator.parameters()))
    # optimizers
    g_optimizer = optim.RMSprop(generator.parameters(), lr=0.0001)
    d_optimizer = optim.RMSprop(discriminator.parameters(), lr=0.0001)


    for epoch in range(NUM_EPOCHS):
        train_bar = tqdm(train_data_loader)
        for train_batch, train_clean, train_noisy in train_bar:

            # latent vector - normal distribution
            z = nn.init.normal(torch.Tensor(train_batch.size(0), 1024, 8))
            if torch.cuda.is_available():
                train_batch, train_clean, train_noisy = train_batch.cuda(), train_clean.cuda(), train_noisy.cuda()
                z = z.cuda()
            train_batch, train_clean, train_noisy = Variable(train_batch), Variable(train_clean), Variable(train_noisy)
            z = Variable(z)

            # TRAIN D to recognize clean audio as clean
            # training batch pass
            discriminator.zero_grad()
            outputs = discriminator(train_batch, ref_batch)
            clean_loss = torch.mean((outputs - 1.0) ** 2)  # L2 loss - we want them all to be 1
            clean_loss.backward()

            # TRAIN D to recognize generated audio as noisy
            generated_outputs = generator(train_noisy, z)
            outputs = discriminator(torch.cat((generated_outputs, train_noisy), dim=1), ref_batch)
            noisy_loss = torch.mean(outputs ** 2)  # L2 loss - we want them all to be 0
            noisy_loss.backward()

            # d_loss = clean_loss + noisy_loss
            d_optimizer.step()  # update parameters

            # TRAIN G so that D recognizes G(z) as real
            generator.zero_grad()
            generated_outputs = generator(train_noisy, z)
            gen_noise_pair = torch.cat((generated_outputs, train_noisy), dim=1)
            outputs = discriminator(gen_noise_pair, ref_batch)

            g_loss_ = 0.5 * torch.mean((outputs - 1.0) ** 2)
            # L1 loss between generated output and clean sample
            l1_dist = torch.abs(torch.add(generated_outputs, torch.neg(train_clean)))
            g_cond_loss = 100 * torch.mean(l1_dist)  # conditional loss
            g_loss = g_loss_ + g_cond_loss

            # backprop + optimize
            g_loss.backward()
            g_optimizer.step()

            train_bar.set_description(
                'Epoch {}: d_clean_loss {:.4f}, d_noisy_loss {:.4f}, g_loss {:.4f}, g_conditional_loss {:.4f}'
                    .format(epoch + 1, clean_loss.data[0], noisy_loss.data[0], g_loss.data[0], g_cond_loss.data[0]))

        # TEST model
        test_bar = tqdm(test_data_loader, desc='Test model and save generated audios')
        for test_file_names, test_noisy in test_bar:
            z = nn.init.normal(torch.Tensor(test_noisy.size(0), 1024, 8))
            if torch.cuda.is_available():
                test_noisy, z = test_noisy.cuda(), z.cuda()
            test_noisy, z = Variable(test_noisy), Variable(z)
            fake_speech = generator(test_noisy, z).data.cpu().numpy()  # convert to numpy array
            fake_speech = emphasis(fake_speech, emph_coeff=0.95, pre=False)

            for idx in range(fake_speech.shape[0]):
                generated_sample = fake_speech[idx]
                file_name = os.path.join('results',
                                         '{}_e{}.wav'.format(test_file_names[idx].replace('.npy', ''), epoch + 1))
                wavfile.write(file_name, sample_rate, generated_sample.T)

        # save the model parameters for each epoch
        g_path = os.path.join('epochs', 'generator-{}.pkl'.format(epoch + 1))
        d_path = os.path.join('epochs', 'discriminator-{}.pkl'.format(epoch + 1))
        torch.save(generator.state_dict(), g_path)
        torch.save(discriminator.state_dict(), d_path)


尝试降低批处理大小(如David S提到的)。也可以使用torch的
运行测试,而不使用梯度计算。no_grad():
语句。如果您希望使用更大的批处理运行培训,并且内存不足,一个解决方案是使用梯度累积。

尝试降低批处理大小(如David S提到的)。也可以使用torch使用
进行梯度计算来运行测试。no_grad():
语句。如果您希望以更大的批量运行培训,并且内存不足,一个解决方案是使用梯度累积。

您可以做几件事:

  • 较低的批量(如前所述)
  • 对于推理,使用torch.no_grad()可通过不在内存中保留渐变来节省CUDA内存
  • 使用自动混合精度
自动混合精度 您需要PyTorch层(
1.6.0
)来轻松完成此操作。请检查

在可行的情况下(例如,
torch.nn.Conv2d
),参数将被强制转换为
float16
,这将加快训练速度并减少内存需求(在某些情况下,由于运行平均值,像
BatchNorm
这样的层将保持为
float32

在您的鉴别器和生成器案例中,检查第节

多个模型的示例代码(有关详细信息及其工作原理,请参阅文档):


(要点是执行单个
定标器
更新)。

您可以做几件事:

  • 较低的批量(如前所述)
  • 对于推理,使用torch.no_grad()
可通过不在内存中保留渐变来节省CUDA内存
  • 使用自动混合精度
  • 自动混合精度 您需要PyTorch层(
    1.6.0
    )来轻松完成此操作。请检查

    在可行的情况下(例如,
    torch.nn.Conv2d
    ),参数将被强制转换为
    float16
    ,这将加快训练速度并减少内存需求(在某些情况下,由于运行平均值,像
    BatchNorm
    这样的层将保持为
    float32

    在您的鉴别器和生成器案例中,检查第节

    多个模型的示例代码(有关详细信息及其工作原理,请参阅文档):


    (主要是执行单个
    缩放器
    更新)。

    您可能应该降低starter的批大小。您可能应该降低starter的批大小。感谢您的建议。我已尝试降低批大小,但没有解决问题。如何使用torch实现。no_grad()在我的代码中?谢谢你的建议。我已尝试减少batchsize,但没有解决问题。如何在代码中使用torch.no_grad()实现?
    scaler = torch.cuda.amp.GradScaler()
    
    for epoch in epochs:
        for input, target in data:
            optimizer0.zero_grad()
            optimizer1.zero_grad()
            with autocast():
                output0 = model0(input)
                output1 = model1(input)
                loss0 = loss_fn(2 * output0 + 3 * output1, target)
                loss1 = loss_fn(3 * output0 - 5 * output1, target)
    
            scaler.scale(loss0).backward(retain_graph=True)
            scaler.scale(loss1).backward()
    
            # You can choose which optimizers receive explicit unscaling, if you
            # want to inspect or modify the gradients of the params they own.
            scaler.unscale_(optimizer0)
    
            scaler.step(optimizer0)
            scaler.step(optimizer1)
    
            scaler.update()