Fine-tuning EfficientDet-D0 from the model zoo on PASCAL VOC doesn't recognize class label 1 (TensorFlow Object Detection API)

I've downloaded the EfficientDet D0 512x512 model from the Object Detection API model zoo, downloaded the PASCAL VOC dataset, and preprocessed it into TFRecord files with the conversion script. Next, I took one of the config files and adjusted it to fit the architecture and the VOC dataset. When I evaluate the resulting network with `pascal_voc_detection_metrics`, it gives me a near-zero mAP for the first class (airplane), while the other classes perform fine. I'm assuming one of the settings in my config file (pasted below) is wrong. Why does this happen, and how do I fix it?

# Detector definition: SSD meta-architecture with an EfficientNet-B0 + BiFPN
# feature extractor, configured for 20 object classes and 512x512 inputs.
model {
  ssd {
    inplace_batchnorm_update: true
    freeze_batchnorm: false
    num_classes: 20
    add_background_class: false
    box_coder {
      faster_rcnn_box_coder {
        y_scale: 10.0
        x_scale: 10.0
        height_scale: 5.0
        width_scale: 5.0
      }
    }
    matcher {
      argmax_matcher {
        matched_threshold: 0.5
        unmatched_threshold: 0.5
        ignore_thresholds: false
        negatives_lower_than_unmatched: true
        force_match_for_each_row: true
        use_matmul_gather: true
      }
    }
    similarity_calculator {
      iou_similarity {
      }
    }
    encode_background_as_zeros: true
    anchor_generator {
      multiscale_anchor_generator {
        min_level: 3
        max_level: 7
        anchor_scale: 4.0
        aspect_ratios: [1.0, 2.0, 0.5]
        scales_per_octave: 3
      }
    }
    image_resizer {
      keep_aspect_ratio_resizer {
        min_dimension: 512
        max_dimension: 512
        pad_to_max_dimension: true
      }
    }
    box_predictor {
      weight_shared_convolutional_box_predictor {
        depth: 64
        class_prediction_bias_init: -4.6
        conv_hyperparams {
          force_use_bias: true
          activation: SWISH
          regularizer {
            l2_regularizer {
              weight: 0.00004
            }
          }
          initializer {
            random_normal_initializer {
              stddev: 0.01
              mean: 0.0
            }
          }
          batch_norm {
            scale: true
            decay: 0.99
            epsilon: 0.001
          }
        }
        num_layers_before_predictor: 3
        kernel_size: 3
        use_depthwise: true
      }
    }
    feature_extractor {
      type: 'ssd_efficientnet-b0_bifpn_keras'
      # BiFPN levels match the anchor generator's min_level/max_level (3..7).
      bifpn {
        min_level: 3
        max_level: 7
        num_iterations: 3
        num_filters: 64
      }
      conv_hyperparams {
        force_use_bias: true
        activation: SWISH
        regularizer {
          l2_regularizer {
            weight: 0.00004
          }
        }
        initializer {
          truncated_normal_initializer {
            stddev: 0.03
            mean: 0.0
          }
        }
        batch_norm {
          scale: true
          decay: 0.99
          epsilon: 0.001
        }
      }
    }
    loss {
      classification_loss {
        weighted_sigmoid_focal {
          alpha: 0.25
          gamma: 1.5
        }
      }
      localization_loss {
        weighted_smooth_l1 {
        }
      }
      classification_weight: 1.0
      localization_weight: 1.0
    }
    normalize_loss_by_num_matches: true
    normalize_loc_loss_by_codesize: true
    post_processing {
      batch_non_max_suppression {
        score_threshold: 1e-8
        iou_threshold: 0.5
        max_detections_per_class: 100
        max_total_detections: 100
      }
      score_converter: SIGMOID
    }
  }
}

# Training setup: fine-tunes from a detection checkpoint for 30000 steps with
# momentum SGD on a cosine-decay schedule (warmup over the first 2500 steps).
train_config: {
  fine_tune_checkpoint: "oracle/efficientdet_d0/checkpoint/ckpt-0"
  fine_tune_checkpoint_version: V2
  fine_tune_checkpoint_type: "detection"
  batch_size: 3
  startup_delay_steps: 0
  use_bfloat16: false
  num_steps: 30000
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
  data_augmentation_options {
    random_scale_crop_and_pad_to_square {
      output_size: 512
      scale_min: 0.1
      scale_max: 2.0
    }
  }
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        cosine_decay_learning_rate {
          learning_rate_base: 8e-2
          # total_steps equals num_steps above, so the decay spans the whole run.
          total_steps: 30000
          warmup_learning_rate: .001
          warmup_steps: 2500
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  max_number_of_boxes: 100
  unpad_groundtruth_tensors: false
  update_trainable_variables: ["WeightSharedConvolutionalBoxPredictor"]
}

# Training data: label map plus the TFRecord file holding the training split.
train_input_reader: {
  label_map_path: "pascalVOC/pascal_label_map.pbtxt"
  tf_record_input_reader {
    input_path: "pascalVOC/pascal_train.record"
  }
}

# Evaluation setup: PASCAL VOC detection metrics, one image per batch.
eval_config: {
  metrics_set: "pascal_voc_detection_metrics"
  use_moving_averages: false
  batch_size: 1
}

# Evaluation data: a single unshuffled pass over the validation TFRecord file,
# using the same label map as the training input reader.
eval_input_reader: {
  label_map_path: "pascalVOC/pascal_label_map.pbtxt"
  shuffle: false
  num_epochs: 1
  tf_record_input_reader {
    input_path: "pascalVOC/pascal_val.record"
  }
}


  • There is a bug in the way `pascal_voc_detection_metrics` calculates the metric; a fix can be found here.