Google Colab 上的 MXNet Gluon:cudaMalloc retry failed: out of memory(内存不足)

Google Colab 上的 MXNet Gluon:cudaMalloc retry failed: out of memory(内存不足),google-colaboratory,mxnet,gluon,faster-rcnn,Google Colaboratory,Mxnet,Gluon,Faster Rcnn,我正在尝试使用 MXNet Gluon 和 VOC 数据集在 Google Colab 上训练 Faster R-CNN。 在训练网络之后,当我试图分析一张新的图像来测试它时,我得到了下面的错误。 无论 epoch 数、批次数量上限或批次大小如何设置,这个问题都会出现。 这是训练器(trainer): trainer = gluon.Trainer(net.collect_params(), 'sgd',{'learning_rate': 15, 'wd': 0.05, 'momentum': 0.9}) 这是训练循环: #net.hybridize() for

我正在尝试使用 MXNet Gluon 和 VOC 数据集在 Google Colab 上训练 Faster R-CNN。 在训练网络之后,当我试图分析一张新的图像来测试它时,我得到了下面的错误。 无论 epoch 数、批次数量上限或批次大小如何设置,这个问题都会出现。

这是训练器(trainer):

# Build the Gluon trainer over all parameters of `net`, using the 'sgd'
# optimizer with learning_rate=15, weight decay (wd)=0.05 and momentum=0.9.
# NOTE(review): a learning rate of 15 is unusually large for SGD — confirm it is intended.
trainer = gluon.Trainer(net.collect_params(), 'sgd',{'learning_rate': 15, 'wd': 0.05, 'momentum': 0.9})
这是训练循环:

#net.hybridize()
# Training loop: makes `epochs` passes over `train_loader`; for every element
# yielded by zip(*batch) it runs one forward pass, one backward pass and one
# trainer step.
for epoch in range(epochs):
    # Timer is defined elsewhere; presumably it reports the elapsed time of
    # the enclosed epoch under the label "epochTime" — confirm.
    with Timer("epochTime"):
        print("epoch = ", epoch, "----------------------------------------------------")
        for ib, batch in enumerate(train_loader):
            # Cap the work per epoch: only batches with index 0..batchesLimit
            # (i.e. batchesLimit + 1 batches) are processed.
            if ib > batchesLimit:
                break
            # zip(*batch) regroups the five components of the batch so that
            # each iteration receives one entry from each component.
            for dataa, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(*batch):

                # Place every array on GPU 0. `label` additionally gets a new
                # leading axis (presumably a batch axis of size 1).
                dataa = dataa.as_in_context(mx.gpu(0))
                label = label.as_in_context(mx.gpu(0)).expand_dims(0)
                rpn_cls_targets = rpn_cls_targets.as_in_context(mx.gpu(0))
                rpn_box_targets = rpn_box_targets.as_in_context(mx.gpu(0))
                rpn_box_masks = rpn_box_masks.as_in_context(mx.gpu(0))

                # Split the last axis of `label`: index 4 (kept as a length-1
                # slice) becomes gt_label and indices 0..3 become gt_box.
                # Presumably class id and box coordinates — confirm against the
                # dataset's label layout.
                gt_label = label[:, :, 4:5]
                gt_box = label[:, :, :4]

                with autograd.record():
                    # network forward
                    # The network returns 12 values here; the last one is discarded.
                    cls_preds, box_preds, roi, samples, matches, rpn_score, rpn_box, anchors, cls_targets, box_targets, box_masks, _ = net(dataa.expand_dims(0), gt_box, gt_label)

                    # losses of rpn (region proposal network)
                    # Drop the trailing axis of the RPN scores before computing the loss.
                    rpn_score = rpn_score.squeeze(axis=-1)
                    # NOTE(review): despite the name, this counts every target >= 0,
                    # not only positives; presumably negative values mark ignored
                    # anchors — confirm.
                    num_rpn_pos = (rpn_cls_targets >= 0).sum()
                    # Each loss is multiplied by an element count and divided by
                    # num_rpn_pos; presumably this undoes the loss's internal
                    # averaging and normalizes by the number of valid targets — confirm.
                    rpn_loss1 = rpn_cls_loss(rpn_score, rpn_cls_targets,rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
                    rpn_loss2 = rpn_box_loss(rpn_box, rpn_box_targets,rpn_box_masks) * rpn_box.size / num_rpn_pos

                    # losses of rcnn (region convolutional neuronal network)
                    # Same scheme as above, with an extra division by the size of axis 0.
                    num_rcnn_pos = (cls_targets >= 0).sum()
                    rcnn_loss1 = rcnn_cls_loss(cls_preds, cls_targets,cls_targets >= 0) * cls_targets.size / cls_targets.shape[0] / num_rcnn_pos
                    rcnn_loss2 = rcnn_box_loss(box_preds, box_targets, box_masks) * box_preds.size / box_preds.shape[0] / num_rcnn_pos

                # Backpropagation and parameter update
                # The backward pass over the four losses runs after leaving the
                # autograd.record() scope.
                autograd.backward([rpn_loss1, rpn_loss2, rcnn_loss1, rcnn_loss2])
                # The learning rate is reset on every iteration to
                # 100 / (epoch + 1) ** (4/3): 100 during the first epoch, then decaying.
                # NOTE(review): this is a very large learning rate for SGD — confirm
                # it is intended.
                trainer.set_learning_rate(100/((epoch+1)**(4/3)))
                # NOTE(review): step() is invoked once per element of the batch but is
                # passed `batch_size`; confirm the intended gradient normalization.
                trainer.step(batch_size)
这就是我加载要测试的图像的方式(我已经在另一个笔记本中测试过这个单元格(cell),使用的是该网络的预训练版本,一切正常)

我应该如何处理这个错误?有什么应当遵循的最佳实践吗?

---------------------------------------------------------------------------
MXNetError                                Traceback (most recent call last)
<ipython-input-23-bfcca7cbcfca> in <module>()
     10 box_ids, scores, bboxes = net(x)
     11 
---> 12 ax = utils.viz.plot_bbox(img, bboxes[0], scores[0], box_ids[0], thresh=0.3, class_names=net.classes)
     13 
     14 plt.show()

2 frames
/usr/local/lib/python3.6/dist-packages/gluoncv/utils/viz/bbox.py in plot_bbox(img, bboxes, scores, labels, thresh, class_names, colors, ax, reverse_rgb, absolute_coordinates)
     59 
     60     if isinstance(bboxes, mx.nd.NDArray):
---> 61         bboxes = bboxes.asnumpy()
     62     if isinstance(labels, mx.nd.NDArray):
     63         labels = labels.asnumpy()

/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py in asnumpy(self)
   2533             self.handle,
   2534             data.ctypes.data_as(ctypes.c_void_p),
-> 2535             ctypes.c_size_t(data.size)))
   2536         return data
   2537 

/usr/local/lib/python3.6/dist-packages/mxnet/base.py in check_call(ret)
    253     """
    254     if ret != 0:
--> 255         raise MXNetError(py_str(_LIB.MXGetLastError()))
    256 
    257 

MXNetError: [18:30:25] src/storage/./pooled_storage_manager.h:161: cudaMalloc retry failed: out of memory
Stack trace:
  [bt] (0) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x6d554b) [0x7ff0fc97e54b]
  [bt] (1) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x41a0c72) [0x7ff100449c72]
  [bt] (2) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x41a694f) [0x7ff10044f94f]
  [bt] (3) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x3972e10) [0x7ff0ffc1be10]
  [bt] (4) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x39730c7) [0x7ff0ffc1c0c7]
  [bt] (5) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x281) [0x7ff0ffc1c4d1]
  [bt] (6) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x38970cb) [0x7ff0ffb400cb]
  [bt] (7) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x38a3c31) [0x7ff0ffb4cc31]
  [bt] (8) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x38a7170) [0x7ff0ffb50170]
---------------------------------------------------------------------------
MXNetError                                Traceback (most recent call last)
<ipython-input-23-bfcca7cbcfca> in <module>()
     10 box_ids, scores, bboxes = net(x)
     11 
---> 12 ax = utils.viz.plot_bbox(img, bboxes[0], scores[0], box_ids[0], thresh=0.3, class_names=net.classes)
     13 
     14 plt.show()
2 frames
/usr/local/lib/python3.6/dist-packages/gluoncv/utils/viz/bbox.py in plot_bbox(img, bboxes, scores, labels, thresh, class_names, colors, ax, reverse_rgb, absolute_coordinates)
     59 
     60     if isinstance(bboxes, mx.nd.NDArray):
---> 61         bboxes = bboxes.asnumpy()
     62     if isinstance(labels, mx.nd.NDArray):
     63         labels = labels.asnumpy()
/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py in asnumpy(self)
   2533             self.handle,
   2534             data.ctypes.data_as(ctypes.c_void_p),
-> 2535             ctypes.c_size_t(data.size)))
   2536         return data
   2537 
/usr/local/lib/python3.6/dist-packages/mxnet/base.py in check_call(ret)
    253     """
    254     if ret != 0:
--> 255         raise MXNetError(py_str(_LIB.MXGetLastError()))
    256 
    257 
MXNetError: [18:30:25] src/storage/./pooled_storage_manager.h:161: cudaMalloc retry failed: out of memory
Stack trace:
[bt](0)/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x6d554b)[0x7ff0fc97e54b]
[bt](1)/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x41a0c72)[0x7ff100449c72]
[bt](2)/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x41a694f)[0x7ff10044f94f]
[bt](3)/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x3972e10)[0x7ff0ffc1be10]
[bt](4)/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x39730c7)[0x7ff0ffc1c0c7]
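[bt](5)/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x281)[0x7ff0ffc1c4d1]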
[bt](6)/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x38970cb)[0x7ff0ffb400cb]
[bt](7)/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x38a3c31)[0x7ff0ffb4cc31]
[bt](8)/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x38a7170)[0x7ff0ffb50170]

您使用哪种分辨率进行训练?此外,您可以删除 `img = mx.nd.array(img, ctx=ctx)` 这一行,因为 `plot_bbox` 可以直接使用预处理返回的 `img`。
我正在使用与我应用于图像的变换相同的变换:short, max_size = 600, 800。
您使用哪种分辨率进行训练?此外,您可以删除 `img = mx.nd.array(img, ctx=ctx)` 这一行,因为 `plot_bbox` 可以直接使用预处理返回的 `img`。
我使用的变换与我应用于图像的变换相同:short, max_size = 600, 800。
---------------------------------------------------------------------------
MXNetError                                Traceback (most recent call last)
<ipython-input-23-bfcca7cbcfca> in <module>()
     10 box_ids, scores, bboxes = net(x)
     11 
---> 12 ax = utils.viz.plot_bbox(img, bboxes[0], scores[0], box_ids[0], thresh=0.3, class_names=net.classes)
     13 
     14 plt.show()

2 frames
/usr/local/lib/python3.6/dist-packages/gluoncv/utils/viz/bbox.py in plot_bbox(img, bboxes, scores, labels, thresh, class_names, colors, ax, reverse_rgb, absolute_coordinates)
     59 
     60     if isinstance(bboxes, mx.nd.NDArray):
---> 61         bboxes = bboxes.asnumpy()
     62     if isinstance(labels, mx.nd.NDArray):
     63         labels = labels.asnumpy()

/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py in asnumpy(self)
   2533             self.handle,
   2534             data.ctypes.data_as(ctypes.c_void_p),
-> 2535             ctypes.c_size_t(data.size)))
   2536         return data
   2537 

/usr/local/lib/python3.6/dist-packages/mxnet/base.py in check_call(ret)
    253     """
    254     if ret != 0:
--> 255         raise MXNetError(py_str(_LIB.MXGetLastError()))
    256 
    257 

MXNetError: [18:30:25] src/storage/./pooled_storage_manager.h:161: cudaMalloc retry failed: out of memory
Stack trace:
  [bt] (0) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x6d554b) [0x7ff0fc97e54b]
  [bt] (1) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x41a0c72) [0x7ff100449c72]
  [bt] (2) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x41a694f) [0x7ff10044f94f]
  [bt] (3) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x3972e10) [0x7ff0ffc1be10]
  [bt] (4) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x39730c7) [0x7ff0ffc1c0c7]
  [bt] (5) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x281) [0x7ff0ffc1c4d1]
  [bt] (6) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x38970cb) [0x7ff0ffb400cb]
  [bt] (7) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x38a3c31) [0x7ff0ffb4cc31]
  [bt] (8) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x38a7170) [0x7ff0ffb50170]