Tags: python, tensorflow, object-detection, object-detection-api, custom-training

Can you run the training and evaluation processes from a single Anaconda prompt?


I am having trouble evaluating my model while training a TensorFlow 2 custom object detector. After reading several issues related to this problem, I found that evaluation and training should be treated as two separate processes, so I should use a new Anaconda prompt to start the evaluation job. I am training the ssd_mobilenet_v2 640x640 model. My pipeline configuration:

model {
  ssd {
    num_classes: 6
    image_resizer {
      fixed_shape_resizer {
        height: 640
        width: 640
      }
    }
    feature_extractor {
      type: "ssd_mobilenet_v2_fpn_keras"
      depth_multiplier: 1.0
      min_depth: 16
      conv_hyperparams {
        regularizer {
          l2_regularizer {
            weight: 3.9999998989515007e-05
          }
        }
        initializer {
          random_normal_initializer {
            mean: 0.0
            stddev: 0.009999999776482582
          }
        }
        activation: RELU_6
        batch_norm {
          decay: 0.996999979019165
          scale: true
          epsilon: 0.0010000000474974513
        }
      }
      use_depthwise: true
      override_base_feature_extractor_hyperparams: true
      fpn {
        min_level: 3
        max_level: 7
        additional_layer_depth: 128
      }
    }
    box_coder {
      faster_rcnn_box_coder {
        y_scale: 10.0
        x_scale: 10.0
        height_scale: 5.0
        width_scale: 5.0
      }
    }
    matcher {
      argmax_matcher {
        matched_threshold: 0.5
        unmatched_threshold: 0.5
        ignore_thresholds: false
        negatives_lower_than_unmatched: true
        force_match_for_each_row: true
        use_matmul_gather: true
      }
    }
    similarity_calculator {
      iou_similarity {
      }
    }
    box_predictor {
      weight_shared_convolutional_box_predictor {
        conv_hyperparams {
          regularizer {
            l2_regularizer {
              weight: 3.9999998989515007e-05
            }
          }
          initializer {
            random_normal_initializer {
              mean: 0.0
              stddev: 0.009999999776482582
            }
          }
          activation: RELU_6
          batch_norm {
            decay: 0.996999979019165
            scale: true
            epsilon: 0.0010000000474974513
          }
        }
        depth: 128
        num_layers_before_predictor: 4
        kernel_size: 3
        class_prediction_bias_init: -4.599999904632568
        share_prediction_tower: true
        use_depthwise: true
      }
    }
    anchor_generator {
      multiscale_anchor_generator {
        min_level: 3
        max_level: 7
        anchor_scale: 4.0
        aspect_ratios: 1.0
        aspect_ratios: 2.0
        aspect_ratios: 0.5
        scales_per_octave: 2
      }
    }
    post_processing {
      batch_non_max_suppression {
        score_threshold: 9.99999993922529e-09
        iou_threshold: 0.6000000238418579
        max_detections_per_class: 100
        max_total_detections: 100
        use_static_shapes: false
      }
      score_converter: SIGMOID
    }
    normalize_loss_by_num_matches: true
    loss {
      localization_loss {
        weighted_smooth_l1 {
        }
      }
      classification_loss {
        weighted_sigmoid_focal {
          gamma: 2.0
          alpha: 0.25
        }
      }
      classification_weight: 1.0
      localization_weight: 1.0
    }
    encode_background_as_zeros: true
    normalize_loc_loss_by_codesize: true
    inplace_batchnorm_update: true
    freeze_batchnorm: false
  }
}
train_config {
  batch_size: 4
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
  #data_augmentation_options {
    #random_crop_image {
      #min_object_covered: 0.0
      #min_aspect_ratio: 0.75
      #max_aspect_ratio: 3.0
      #min_area: 0.75
      #max_area: 1.0
      #overlap_thresh: 0.0
    #}
  #}
  optimizer {
    momentum_optimizer {
      learning_rate {
        cosine_decay_learning_rate {
          learning_rate_base: 0.04999999821186066
          total_steps: 50000
          warmup_learning_rate: 0.0026666000485420227
          warmup_steps: 600
        }
      }
      momentum_optimizer_value: 0.8999999761581421
    }
    use_moving_average: false
  }
  fine_tune_checkpoint: "pre-trained-models\ssd_mobilenet_v2_fpnlite_640x640_coco17_tpu-8\checkpoint\ckpt-0"
  num_steps: 50000
  startup_delay_steps: 0.0
  replicas_to_aggregate: 8
  max_number_of_boxes: 100
  unpad_groundtruth_tensors: false
  fine_tune_checkpoint_type: "detection"
  fine_tune_checkpoint_version: V2
  from_detection_checkpoint: true
}
train_input_reader {
  label_map_path: "annotations/label_map.pbtxt"
  tf_record_input_reader {
    input_path: "data/train.record"
  }
}
eval_config {
  metrics_set: "coco_detection_metrics"
  use_moving_averages: false
}
eval_input_reader {
  label_map_path: "annotations/label_map.pbtxt"
  shuffle: false
  num_epochs: 1
  tf_record_input_reader {
    input_path: "data/test.record"
  }
}

I have started the training with the command:

python model_main_tf2.py --model_dir=models/my_ssd2_3/ --pipeline_config_path=models/my_ssd2_3/pipeline.config --sample_1_of_n_eval_examples 1 --logtostderr

I was hoping that setting the number of evaluation examples would cause an evaluation job to run alongside training. In any case, I tried running the evaluation in a separate terminal window with:

python model_main_tf2.py --model_dir=models/my_ssd2_3 --pipeline_config_path=models/my_ssd2_3/pipeline.config --checkpoint_dir=models/my_ssd2_3/ --alsologtostderr

As soon as the evaluation starts, the training job crashes with this error: error

I think the problem is that I do not have sufficient hardware:

  1. 8 GB RAM
  2. NVIDIA GTX 960M (2 GB VRAM)

Could it be a problem that all of my input images are 3000x3000, so the preprocessor has to load too much data? If so, is there any way to work around it? I would not want to resize all the images before generating the TFRecord file, because I would have to re-label them all. I clearly lack insight into how memory is allocated at the start of the training process, so some details would be much appreciated.
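(I know that in principle the boxes in Pascal VOC XML files could be rescaled with a small script rather than re-labelled by hand, roughly like the sketch below, where the folder names and target size are placeholders and labelImg-style XML is assumed; but I would prefer to keep the original resolution if possible.)

import glob
import os
import xml.etree.ElementTree as ET

from PIL import Image

SRC_DIR, DST_DIR, TARGET = "images_3000", "images_640", 640

os.makedirs(DST_DIR, exist_ok=True)
for xml_path in glob.glob(os.path.join(SRC_DIR, "*.xml")):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    img_name = root.find("filename").text
    img = Image.open(os.path.join(SRC_DIR, img_name))
    sx, sy = TARGET / img.width, TARGET / img.height

    # Resize the image itself.
    img.resize((TARGET, TARGET)).save(os.path.join(DST_DIR, img_name))

    # Rescale the stored size and every bounding box to the new resolution.
    root.find("size/width").text = str(TARGET)
    root.find("size/height").text = str(TARGET)
    for box in root.iter("bndbox"):
        for tag, scale in (("xmin", sx), ("xmax", sx), ("ymin", sy), ("ymax", sy)):
            node = box.find(tag)
            node.text = str(int(float(node.text) * scale))
    tree.write(os.path.join(DST_DIR, os.path.basename(xml_path)))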


A second question: while monitoring the training in TensorBoard, the images are displayed with varying brightness. Following this suggestion: https://github.com/tensorflow/models/issues/9115, I tried changing line 627 of model_lib_v2.py to:

data = (features[fields.InputDataFields.image] - np.min(features[fields.InputDataFields.image])) / (np.max(features[fields.InputDataFields.image]) - np.min(features[fields.InputDataFields.image]))

without any luck. Is there a solution to this problem? It would also be nice if I could monitor the bounding boxes the model proposes there. Thank you.


Solution

  • With some changes to the train_loop function in model_lib_v2.py, you can alternate between training and evaluation in the same application. See the example below.

    From what I understand, the TensorFlow Object Detection API is developed with a focus on distributed training: if you were using multiple GPUs/TPUs, you could have some devices doing training and other devices doing evaluation. So I suspect that the way model_lib_v2.py is currently implemented does not fully support doing training and evaluation on the same device.

    I'm not certain of the root cause of the error you are seeing; typically I have seen TensorFlow throw OOM errors when there is a memory issue. It may be that the way TensorFlow uses CUDA does not support two processes using the same device.
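    One workaround that may be worth trying (a sketch, assuming both processes share a single GPU): by default TensorFlow reserves almost all GPU memory in each process, so a second process on the same card often fails immediately. Enabling memory growth, or capping one process to a fixed slice of the GPU, sometimes lets training and evaluation coexist, although a 2 GB card may still run out of memory. This must run before the model is built:

    import tensorflow as tf

    # Allocate GPU memory on demand instead of reserving the whole card up
    # front; must be called before any GPU has been initialized.
    for gpu in tf.config.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    # Alternatively, cap this process to a fixed slice of the first GPU
    # (the 1024 MB figure is only illustrative):
    # tf.config.set_logical_device_configuration(
    #     tf.config.list_physical_devices('GPU')[0],
    #     [tf.config.LogicalDeviceConfiguration(memory_limit=1024)])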

    Regarding your second question, I followed the advice on the same thread and it worked for me; the code is duplicated in the third block below. Initially it did not appear to work because I naively edited the file in the Object Detection repository I had cloned, while my application was actually importing the Object Detection API installed in site-packages, so I would recommend confirming that the file you are changing is the same one being loaded by your import statements.
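    A quick way to check, for example, is to print where Python imports the package from and then edit that copy:

    # Print the location of the object_detection package that is actually
    # imported, i.e. the copy whose model_lib_v2.py needs to be edited.
    import object_detection
    from object_detection import model_lib_v2

    print(object_detection.__file__)
    print(model_lib_v2.__file__)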

    --

    This is outside of the training loop

      # Set up the evaluation data and summary writer
      eval_config = configs['eval_config']
      eval_input_configs = configs['eval_input_configs']
      eval_input_config = eval_input_configs[0]
      eval_input = strategy.experimental_distribute_dataset(
        inputs.eval_input(
            eval_config=eval_config,
            eval_input_config=eval_input_config,
            model_config=model_config,
            model=detection_model))
      
      summary_writer_eval = tf.compat.v2.summary.create_file_writer(os.path.join(model_dir, 'eval', eval_input_config.name))
    

    This is the modified train/evaluation loop. The evaluation happens near the end.

    for _ in range(global_step.value(), train_steps, num_steps_per_iteration):
      
      tf.logging.info('Performing Training')
      with summary_writer_train.as_default():
        with tf.compat.v2.summary.record_if(lambda: global_step % num_steps_per_iteration == 0):
    
          losses_dict = _dist_train_step(train_input_iter)
    
          time_taken = time.time() - last_step_time
          last_step_time = time.time()
          steps_per_sec = num_steps_per_iteration * 1.0 / time_taken
    
          tf.compat.v2.summary.scalar(
              'steps_per_sec', steps_per_sec, step=global_step)
    
          steps_per_sec_list.append(steps_per_sec)
    
          logged_dict = losses_dict.copy()
          logged_dict['learning_rate'] = learning_rate_fn()
    
          for key, val in logged_dict.items():
            tf.compat.v2.summary.scalar(key, val, step=global_step)
    
          if global_step.value() - logged_step >= 0:
            logged_dict_np = {name: value.numpy() for name, value in
                              logged_dict.items()}
            tf.logging.info(
                'Step {} per-step time {:.3f}s'.format(
                    global_step.value(), time_taken / num_steps_per_iteration))
            tf.logging.info(pprint.pformat(logged_dict_np, width=40))
            print_gpu_memory_usage()
            logged_step = global_step.value()
    
          if ((int(global_step.value()) - checkpointed_step) >=
              checkpoint_every_n):
            manager.save()
            checkpointed_step = int(global_step.value())
            
      tf.logging.info('Performing Evaluation')
      with summary_writer_eval.as_default():
        eager_eval_loop(
            detection_model,
            configs,
            eval_input,
            use_tpu=use_tpu,
            global_step=global_step,
            )
    

    Fixing image rendering in TensorBoard

    if record_summaries:
        # Min-max normalize the first three input images to [0, 1] so
        # TensorBoard renders them with consistent brightness.
        imgs = features[fields.InputDataFields.image][:3]
        imgs = tf.div(
            tf.subtract(imgs, tf.reduce_min(imgs)),
            tf.subtract(tf.reduce_max(imgs), tf.reduce_min(imgs)))
        tf.compat.v2.summary.image(
            name='train_input_images', step=global_step, data=imgs, max_outputs=3)
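
    As for monitoring the bounding boxes the model proposes, one rough sketch (assuming a `boxes` tensor of shape [batch, num_boxes, 4] with normalized [ymin, xmin, ymax, xmax] coordinates, e.g. groundtruth boxes from the input features or detections from model.postprocess) would be to draw them onto the same images before logging:

    # Draw the (assumed) boxes onto the normalized images and log the result.
    colors = tf.constant([[1.0, 0.0, 0.0, 1.0]])  # red, RGBA
    imgs_with_boxes = tf.image.draw_bounding_boxes(imgs, boxes[:3], colors)
    tf.compat.v2.summary.image(
        name='train_input_images_with_boxes', step=global_step,
        data=imgs_with_boxes, max_outputs=3)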