Python 使用给定代码从头开始重新训练COCO，在保存检查点0时遇到问题_Python_Python 3.x_Tensorflow

Python 使用给定代码从头开始重新训练COCO，在保存检查点0时遇到问题

python python-3.x tensorflow

Python 使用给定代码从头开始重新训练COCO，在保存检查点0时遇到问题,python,python-3.x,tensorflow,Python,Python 3.x,Tensorflow,我想用model_main.py在MSCOCO数据集上从头开始重新训练Faster R-CNN。首先，我使用create_coco_tf_record.py和COCO2017 Detection生成tfrecord文件，得到了如下的train/val文件：coco_train.record-00000-of-00100。之后，我运行model_main.py，命令窗口输出了许多警告日志。然后程序卡在了将步骤0的检查点保存到/data/code/vision_ori/my_checkpoints

我想用 model_main.py 在 MSCOCO 数据集上从头开始重新训练 Faster R-CNN。首先，我使用 create_coco_tf_record.py 和 COCO2017 Detection 数据生成 tfrecord 文件，得到了如下的 train/val 文件：coco_train.record-00000-of-00100。之后，我运行 model_main.py，命令窗口输出了许多警告日志。然后程序卡在了"将步骤 0 的检查点保存到 /data/code/vision_ori/my_checkpoints/model.ckpt"这一步，无法继续。

我仔细检查，发现在创建新的MonitoredSession对象时进程被卡住了

源代码/日志：程序已经卡在这里好几天了，一直无法继续运行

生成 tfrecord 的命令、尝试训练的命令以及配置文件见下文。我想从头开始训练，所以从配置文件中删除了以下两行：

fine_tune_checkpoint: "PATH_TO_BE_CONFIGURED/model.ckpt"
  from_detection_checkpoint: true

没有任何错误信息，程序只是一直卡在这里。

# Generates the COCO TFRecord files from the 2017 train/val/test image
# directories and their annotation JSON files; output goes to ./cocodata
# (relative to the directory the command is run from).
# NOTE(review): the train annotations path has no "annotations/" subdirectory,
# unlike the val and test-dev paths — confirm this is intentional.
python3 create_coco_tf_record.py --logtostderr \
--train_image_dir="/data/code/vision_ori/dataset/train2017" \
--val_image_dir="/data/code/vision_ori/dataset/val2017" \
--test_image_dir="/data/code/vision_ori/dataset/test2017" \
--train_annotations_file="/data/code/vision_ori/dataset/anno/instances_train2017.json" \
--val_annotations_file="/data/code/vision_ori/dataset/anno/annotations/instances_val2017.json" \
--testdev_annotations_file="/data/code/vision_ori/dataset/anno/annotations/image_info_test-dev2017.json" \
--output_dir="cocodata"

# Launches object_detection/model_main.py with the Faster R-CNN
# Inception-ResNet-v2 (atrous) COCO pipeline config. --model_dir is the
# directory the step-0 checkpoint is being written to when the run stalls.
# --num_train_steps is 200000, the same value as num_steps in train_config.
python3 object_detection/model_main.py \
    --pipeline_config_path="/data/code/vision_ori/models/research/object_detection/samples/configs/faster_rcnn_inception_resnet_v2_atrous_coco.config" \
    --model_dir="/data/code/vision_ori/my_checkpoints" \
    --num_train_steps=200000 \
    --sample_1_of_n_eval_examples=1 \
    --alsologtostderr

# For reference only: these are the two lines that were deleted from the
# pipeline config in order to train from scratch. They are NOT part of the
# config that was actually used.
fine_tune_checkpoint: "PATH_TO_BE_CONFIGURED/model.ckpt"
  from_detection_checkpoint: true

# Model definition: Faster R-CNN using the
# 'faster_rcnn_inception_resnet_v2' feature extractor.
model {
  faster_rcnn {
    # presumably the class count expected for the MSCOCO label map referenced
    # by the input readers — TODO confirm against mscoco_label_map.pbtxt
    num_classes: 90
    # Input images are resized keeping aspect ratio, bounded by the
    # min/max dimensions below.
    image_resizer {
      keep_aspect_ratio_resizer {
        min_dimension: 600
        max_dimension: 1024
      }
    }
    feature_extractor {
      type: 'faster_rcnn_inception_resnet_v2'
      first_stage_features_stride: 8
    }
    # First-stage anchors: 4 scales x 3 aspect ratios; the anchor strides
    # (8) equal first_stage_features_stride above.
    first_stage_anchor_generator {
      grid_anchor_generator {
        scales: [0.25, 0.5, 1.0, 2.0]
        aspect_ratios: [0.5, 1.0, 2.0]
        height_stride: 8
        width_stride: 8
      }
    }
    first_stage_atrous_rate: 2
    first_stage_box_predictor_conv_hyperparams {
      op: CONV
      regularizer {
        l2_regularizer {
          # weight 0.0 — the L2 term contributes nothing (presumably
          # intentional in the sample config)
          weight: 0.0
        }
      }
      initializer {
        truncated_normal_initializer {
          stddev: 0.01
        }
      }
    }
    # First-stage proposal filtering and loss weights.
    first_stage_nms_score_threshold: 0.0
    first_stage_nms_iou_threshold: 0.7
    first_stage_max_proposals: 300
    first_stage_localization_loss_weight: 2.0
    first_stage_objectness_loss_weight: 1.0
    initial_crop_size: 17
    maxpool_kernel_size: 1
    maxpool_stride: 1
    second_stage_box_predictor {
      mask_rcnn_box_predictor {
        # Dropout is switched off (use_dropout false, keep probability 1.0).
        use_dropout: false
        dropout_keep_probability: 1.0
        fc_hyperparams {
          op: FC
          regularizer {
            l2_regularizer {
              weight: 0.0
            }
          }
          initializer {
            variance_scaling_initializer {
              factor: 1.0
              uniform: true
              mode: FAN_AVG
            }
          }
        }
      }
    }
    # Second-stage post-processing: batch NMS followed by a SOFTMAX score
    # converter; at most 100 detections per class and 100 in total.
    second_stage_post_processing {
      batch_non_max_suppression {
        score_threshold: 0.0
        iou_threshold: 0.6
        max_detections_per_class: 100
        max_total_detections: 100
      }
      score_converter: SOFTMAX
    }
    second_stage_localization_loss_weight: 2.0
    second_stage_classification_loss_weight: 1.0
  }
}

# Training settings: batch size 1, momentum optimizer with a manually stepped
# learning rate, gradient clipping, and random horizontal flip augmentation.
# No fine_tune_checkpoint is set here, so no pre-trained checkpoint is named
# by this block.
train_config: {
  batch_size: 1
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        manual_step_learning_rate {
          initial_learning_rate: 0.0003
          # Both schedule steps (900000 and 1200000) lie beyond num_steps
          # (200000) below, so neither is reached within this step budget.
          schedule {
            step: 900000
            learning_rate: .00003
          }
          schedule {
            step: 1200000
            learning_rate: .000003
          }
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  gradient_clipping_by_norm: 10.0
  # Note: The below line limits the training process to 200K steps. The
  # original sample comment says this was empirically found sufficient for the
  # pets dataset; NOTE(review): this pipeline reads COCO records, so the step
  # budget should be re-validated. The limit effectively bypasses the learning
  # rate schedule (the learning rate will never decay). Remove the below line
  # to train indefinitely.
  num_steps: 200000
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
}

# Training input: the COCO training TFRecord shards plus the MSCOCO label map.
train_input_reader: {
  tf_record_input_reader {
    # Glob over every training shard (coco_train.record-00000-of-00100 through
    # -00099-of-00100). Naming only shard 00000 would restrict training to a
    # single shard out of 100.
    input_path: "/data/code/vision_ori/models/research/object_detection/dataset_tools/cocodata/coco_train.record-?????-of-00100"
  }
  label_map_path: "/data/code/vision_ori/models/research/object_detection/data/mscoco_label_map.pbtxt"
}

# Evaluation settings: number of examples per evaluation and a cap on how
# many evaluations are run.
eval_config: {
  # NOTE(review): 5000 presumably corresponds to the full COCO val2017 set —
  # confirm the eval input reader actually provides that many examples.
  num_examples: 5000
  # Note: The below line limits the evaluation process to 10 evaluations.
  # Remove the below line to evaluate indefinitely.
  max_evals: 10
}

# Evaluation input: the COCO validation TFRecord shards plus the MSCOCO label
# map, read in order (no shuffling) by a single reader.
eval_input_reader: {
  tf_record_input_reader {
    # Glob over every validation shard (coco_val.record-00000-of-00010 through
    # -00009-of-00010). Naming only shard 00000 would supply a single shard
    # out of 10, while eval_config.num_examples is set to 5000.
    input_path: "/data/code/vision_ori/models/research/object_detection/dataset_tools/cocodata/coco_val.record-?????-of-00010"
  }
  label_map_path: "/data/code/vision_ori/models/research/object_detection/data/mscoco_label_map.pbtxt"
  shuffle: false
  num_readers: 1
}