Python GPU内存不足

Python GPU内存不足,python,pytorch,Python,Pytorch,我正在尝试在pytorch中实现Yolo-v2。然而,仅仅通过网络传递数据,我的内存就好像用完了。模型较大,如下所示。然而,我觉得我在用我的网络做一些愚蠢的事情(比如不在某处释放内存)。网络在cpu上按预期工作 测试代码(内存耗尽时)为: 问题 我的模型有什么明显的问题吗 如何使用内存使其更高效 其他评论 模型: import torch from torch import nn import torch.nn.functional as F class Yolov2(nn.Module):

我正在尝试在pytorch中实现Yolo-v2。然而,仅仅通过网络传递数据,我的内存就好像用完了。模型较大,如下所示。然而,我觉得我在用我的网络做一些愚蠢的事情(比如不在某处释放内存)。网络在cpu上按预期工作

测试代码(内存耗尽时)为:

问题

  • 我的模型有什么明显的问题吗
  • 如何使用内存使其更高效
  • 其他评论
模型:

    import torch
    from torch import nn
    import torch.nn.functional as F
    
    class Yolov2(nn.Module):
        """Fully convolutional YOLOv2-style network with a passthrough branch.

        Layers ``conv1`` .. ``conv21`` are bias-free convolutions, each followed
        by its own ``BatchNorm2d`` and a leaky ReLU (negative slope 0.1).
        ``conv22`` is a plain 1x1 convolution (with bias) producing 125 channels
        (presumably 5 anchors x 25 values per anchor -- confirm against the loss).

        Five 2x2/stride-2 max-pools shrink the spatial size by a factor of 32.
        The 512-channel activation after ``conv13`` is additionally rearranged by
        :meth:`reorg_layer` into 2048 channels at half resolution and concatenated
        with the 1024-channel output of ``conv20``, which is why ``conv21`` takes
        3072 input channels.

        The feature map reaching :meth:`reorg_layer` must have even height and
        width; input images whose sides are multiples of 32 always satisfy this.
        """

        # Negative slope shared by every leaky ReLU in the network.
        _LEAKY_SLOPE = 0.1

        def __init__(self):
            super(Yolov2, self).__init__()

            # Stage 1 (followed by a max-pool in forward()).
            self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm1 = nn.BatchNorm2d(32)

            # Stage 2 (followed by a max-pool).
            self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm2 = nn.BatchNorm2d(64)

            # Stage 3: 3x3 / 1x1 / 3x3, then max-pool.
            self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm3 = nn.BatchNorm2d(128)
            self.conv4 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=1, stride=1, padding=0, bias=False)
            self.batchnorm4 = nn.BatchNorm2d(64)
            self.conv5 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm5 = nn.BatchNorm2d(128)

            # Stage 4: 3x3 / 1x1 / 3x3, then max-pool.
            self.conv6 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm6 = nn.BatchNorm2d(256)
            self.conv7 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1, stride=1, padding=0, bias=False)
            self.batchnorm7 = nn.BatchNorm2d(128)
            self.conv8 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm8 = nn.BatchNorm2d(256)

            # Stage 5: five alternating 3x3 / 1x1 layers; its output feeds both
            # the passthrough branch and the last max-pool.
            self.conv9 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm9 = nn.BatchNorm2d(512)
            self.conv10 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=0, bias=False)
            self.batchnorm10 = nn.BatchNorm2d(256)
            self.conv11 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm11 = nn.BatchNorm2d(512)
            self.conv12 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=0, bias=False)
            self.batchnorm12 = nn.BatchNorm2d(256)
            self.conv13 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm13 = nn.BatchNorm2d(512)

            # Stage 6: five alternating 3x3 / 1x1 layers at the coarsest resolution.
            self.conv14 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm14 = nn.BatchNorm2d(1024)
            self.conv15 = nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=0, bias=False)
            self.batchnorm15 = nn.BatchNorm2d(512)
            self.conv16 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm16 = nn.BatchNorm2d(1024)
            self.conv17 = nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=0, bias=False)
            self.batchnorm17 = nn.BatchNorm2d(512)
            self.conv18 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm18 = nn.BatchNorm2d(1024)

            # Two extra 3x3 layers applied before the concatenation.
            self.conv19 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm19 = nn.BatchNorm2d(1024)
            self.conv20 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm20 = nn.BatchNorm2d(1024)

            # 3072 = 2048 (reorganized conv13 branch) + 1024 (conv20 output).
            self.conv21 = nn.Conv2d(in_channels=3072, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
            self.batchnorm21 = nn.BatchNorm2d(1024)

            # Final 1x1 projection; the only convolution that keeps its bias.
            self.conv22 = nn.Conv2d(in_channels=1024, out_channels=125, kernel_size=1, stride=1, padding=0)

        def _conv_bn_leaky(self, conv, batchnorm, x):
            """Return ``leaky_relu(batchnorm(conv(x)))`` with the shared slope."""
            return F.leaky_relu(batchnorm(conv(x)), negative_slope=self._LEAKY_SLOPE)

        def reorg_layer(self, x):
            """Fold every 2x2 spatial patch of ``x`` into the channel dimension.

            Args:
                x: tensor of shape ``(batch, channels, height, width)``.

            Returns:
                A (non-contiguous) tensor of shape
                ``(batch, channels * 4, height // 2, width // 2)``. Output channel
                ``(row * 2 + col) * channels + c`` holds input channel ``c``
                sampled at offset ``(row, col)`` inside each 2x2 patch.

            Raises:
                ValueError: if ``height`` or ``width`` is not divisible by 2.
            """
            stride = 2
            batch_size, channels, height, width = x.size()
            # Reject sizes that do not tile exactly: with an inferred batch
            # dimension the leftover elements could otherwise be absorbed into
            # the batch axis and yield a silently scrambled tensor.
            if height % stride != 0 or width % stride != 0:
                raise ValueError(
                    "reorg_layer needs height and width divisible by %d, got %dx%d"
                    % (stride, height, width)
                )
            new_ht = height // stride
            new_wd = width // stride
            new_channels = channels * stride * stride

            # (B, C, H, W) -> (B, H, W, C) so channels are innermost.
            passthrough = x.permute(0, 2, 3, 1).contiguous()
            # Split H and W into (block index, offset inside block).
            passthrough = passthrough.view(batch_size, new_ht, stride, new_wd, stride, channels)
            # Bring both in-block offsets next to the channel axis ...
            passthrough = passthrough.permute(0, 1, 3, 2, 4, 5).contiguous()
            # ... and merge them with it.
            passthrough = passthrough.view(batch_size, new_ht, new_wd, new_channels)
            # Back to channels-first layout.
            return passthrough.permute(0, 3, 1, 2)

        def forward(self, x):
            """Run the network on ``x`` of shape ``(batch, 3, height, width)``.

            Returns a tensor with 125 channels; for inputs whose height and
            width are multiples of 32 its spatial size is the input size
            divided by 32.
            """
            block = self._conv_bn_leaky

            out = F.max_pool2d(block(self.conv1, self.batchnorm1, x), 2, stride=2)
            out = F.max_pool2d(block(self.conv2, self.batchnorm2, out), 2, stride=2)

            out = block(self.conv3, self.batchnorm3, out)
            out = block(self.conv4, self.batchnorm4, out)
            out = block(self.conv5, self.batchnorm5, out)
            out = F.max_pool2d(out, 2, stride=2)

            out = block(self.conv6, self.batchnorm6, out)
            out = block(self.conv7, self.batchnorm7, out)
            out = block(self.conv8, self.batchnorm8, out)
            out = F.max_pool2d(out, 2, stride=2)

            out = block(self.conv9, self.batchnorm9, out)
            out = block(self.conv10, self.batchnorm10, out)
            out = block(self.conv11, self.batchnorm11, out)
            out = block(self.conv12, self.batchnorm12, out)
            out = block(self.conv13, self.batchnorm13, out)
            # Keep a reorganized copy of the finer feature map for the later
            # concatenation, then continue down the main path.
            passthrough = self.reorg_layer(out)
            out = F.max_pool2d(out, 2, stride=2)

            out = block(self.conv14, self.batchnorm14, out)
            out = block(self.conv15, self.batchnorm15, out)
            out = block(self.conv16, self.batchnorm16, out)
            out = block(self.conv17, self.batchnorm17, out)
            out = block(self.conv18, self.batchnorm18, out)

            out = block(self.conv19, self.batchnorm19, out)
            out = block(self.conv20, self.batchnorm20, out)

            # Channel-wise concatenation: passthrough first, main path second.
            out = torch.cat([passthrough, out], 1)
            out = block(self.conv21, self.batchnorm21, out)
            out = self.conv22(out)

            return out
    
    其他信息:
    • PyTorch 版本是
      '0.4.1.post2'
    • 在aws p2.xlarge上运行(限制12gb GPU内存)

    • 该模型的参数数量为67137565;按float32(每个参数4字节)估算,参数本身约占270 MB。
      我会先尝试使用较小的批量大小:从1开始,然后逐步增大,看看最大能用到多少。你也可以尝试减小输入张量的尺寸。
      您的网络对于您的GPU来说并不算小。以下是您可以尝试的一组操作:

      def _report_cuda_memory():
          # Print the allocated byte count first, then the cached byte count.
          print(torch.cuda.memory_allocated())
          print(torch.cuda.memory_cached())

      # Measure, release the allocator's unused cached memory, measure again.
      _report_cuda_memory()
      torch.cuda.empty_cache()
      _report_cuda_memory()

      # Example output posted with this snippet (one value per print call):
      # 653475518
      # 952107008
      # 383533568
      # 385875968
      
      以此观察内存是如何被释放的。另一种技巧是按需把批次加载到GPU:不要把整个数据集放到GPU上,而是每次只放单个或少数几个批次。


      将单个批次加载到GPU的耗时基本上以微秒(μs)计,而矩阵乘法等张量运算则以毫秒(ms)计。

      我将批量大小减少到8,就可以正常运行了。其他尺寸没法真正改动,因为那是论文中指定的尺寸,而且我也不想影响图像大小。
      # Readings before clearing the cache: allocated bytes, then cached bytes.
      print(torch.cuda.memory_allocated())
      print(torch.cuda.memory_cached())

      torch.cuda.empty_cache()

      # The same two readings after the cache has been cleared.
      print(torch.cuda.memory_allocated())
      print(torch.cuda.memory_cached())

      # Example output posted with this snippet (one value per print call):
      # 653475518
      # 952107008
      # 383533568
      # 385875968