I am having trouble evaluating my training process while training a TensorFlow 2 custom object detector. After reading several issues related to this problem, I found that evaluation and training should be treated as two separate processes, so I should start the evaluation job from a new Anaconda prompt. I am fine-tuning the SSD MobileNetV2 FPNLite 640x640 model. My pipeline configuration:
model {
  ssd {
    num_classes: 6
    image_resizer {
      fixed_shape_resizer {
        height: 640
        width: 640
      }
    }
    feature_extractor {
      type: "ssd_mobilenet_v2_fpn_keras"
      depth_multiplier: 1.0
      min_depth: 16
      conv_hyperparams {
        regularizer {
          l2_regularizer {
            weight: 3.9999998989515007e-05
          }
        }
        initializer {
          random_normal_initializer {
            mean: 0.0
            stddev: 0.009999999776482582
          }
        }
        activation: RELU_6
        batch_norm {
          decay: 0.996999979019165
          scale: true
          epsilon: 0.0010000000474974513
        }
      }
      use_depthwise: true
      override_base_feature_extractor_hyperparams: true
      fpn {
        min_level: 3
        max_level: 7
        additional_layer_depth: 128
      }
    }
    box_coder {
      faster_rcnn_box_coder {
        y_scale: 10.0
        x_scale: 10.0
        height_scale: 5.0
        width_scale: 5.0
      }
    }
    matcher {
      argmax_matcher {
        matched_threshold: 0.5
        unmatched_threshold: 0.5
        ignore_thresholds: false
        negatives_lower_than_unmatched: true
        force_match_for_each_row: true
        use_matmul_gather: true
      }
    }
    similarity_calculator {
      iou_similarity {
      }
    }
    box_predictor {
      weight_shared_convolutional_box_predictor {
        conv_hyperparams {
          regularizer {
            l2_regularizer {
              weight: 3.9999998989515007e-05
            }
          }
          initializer {
            random_normal_initializer {
              mean: 0.0
              stddev: 0.009999999776482582
            }
          }
          activation: RELU_6
          batch_norm {
            decay: 0.996999979019165
            scale: true
            epsilon: 0.0010000000474974513
          }
        }
        depth: 128
        num_layers_before_predictor: 4
        kernel_size: 3
        class_prediction_bias_init: -4.599999904632568
        share_prediction_tower: true
        use_depthwise: true
      }
    }
    anchor_generator {
      multiscale_anchor_generator {
        min_level: 3
        max_level: 7
        anchor_scale: 4.0
        aspect_ratios: 1.0
        aspect_ratios: 2.0
        aspect_ratios: 0.5
        scales_per_octave: 2
      }
    }
    post_processing {
      batch_non_max_suppression {
        score_threshold: 9.99999993922529e-09
        iou_threshold: 0.6000000238418579
        max_detections_per_class: 100
        max_total_detections: 100
        use_static_shapes: false
      }
      score_converter: SIGMOID
    }
    normalize_loss_by_num_matches: true
    loss {
      localization_loss {
        weighted_smooth_l1 {
        }
      }
      classification_loss {
        weighted_sigmoid_focal {
          gamma: 2.0
          alpha: 0.25
        }
      }
      classification_weight: 1.0
      localization_weight: 1.0
    }
    encode_background_as_zeros: true
    normalize_loc_loss_by_codesize: true
    inplace_batchnorm_update: true
    freeze_batchnorm: false
  }
}
train_config {
  batch_size: 4
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
  #data_augmentation_options {
  #  random_crop_image {
  #    min_object_covered: 0.0
  #    min_aspect_ratio: 0.75
  #    max_aspect_ratio: 3.0
  #    min_area: 0.75
  #    max_area: 1.0
  #    overlap_thresh: 0.0
  #  }
  #}
  optimizer {
    momentum_optimizer {
      learning_rate {
        cosine_decay_learning_rate {
          learning_rate_base: 0.04999999821186066
          total_steps: 50000
          warmup_learning_rate: 0.0026666000485420227
          warmup_steps: 600
        }
      }
      momentum_optimizer_value: 0.8999999761581421
    }
    use_moving_average: false
  }
  fine_tune_checkpoint: "pre-trained-models\ssd_mobilenet_v2_fpnlite_640x640_coco17_tpu-8\checkpoint\ckpt-0"
  num_steps: 50000
  startup_delay_steps: 0.0
  replicas_to_aggregate: 8
  max_number_of_boxes: 100
  unpad_groundtruth_tensors: false
  fine_tune_checkpoint_type: "detection"
  fine_tune_checkpoint_version: V2
  from_detection_checkpoint: true
}
train_input_reader {
  label_map_path: "annotations/label_map.pbtxt"
  tf_record_input_reader {
    input_path: "data/train.record"
  }
}
eval_config {
  metrics_set: "coco_detection_metrics"
  use_moving_averages: false
}
eval_input_reader {
  label_map_path: "annotations/label_map.pbtxt"
  shuffle: false
  num_epochs: 1
  tf_record_input_reader {
    input_path: "data/test.record"
  }
}
I started the training with the command:

python model_main_tf2.py --model_dir=models/my_ssd2_3/ --pipeline_config_path=models/my_ssd2_3/pipeline.config --sample_1_of_n_eval_examples=1 --logtostderr

I was hoping that setting the number of evaluation examples would trigger an evaluation job. In any case, I tried running the evaluation in a separate terminal window with:

python model_main_tf2.py --model_dir=models/my_ssd2_3 --pipeline_config_path=models/my_ssd2_3/pipeline.config --checkpoint_dir=models/my_ssd2_3/ --alsologtostderr
As soon as the evaluation job starts, the training job crashes with an error. I suspect the problem is that I simply do not have sufficient hardware. Could it be that all of my input images are 3000x3000, so the preprocessor has to load too much data? If so, is there a way to work around it? I would not want to resize all the images before generating the TFRecord file, because I would have to re-label them all. I clearly lack insight into how memory is allocated at the start of the training process, so some details would be much appreciated.
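For reference, if the annotations are Pascal VOC XML files with absolute pixel coordinates, they could in principle be rescaled together with the images instead of re-labelled. A rough, untested sketch, assuming a hypothetical images/ folder with one .xml file per .jpg:

import glob
import os
import xml.etree.ElementTree as ET

from PIL import Image

SCALE = 640 / 3000  # hypothetical: shrink 3000x3000 inputs to 640x640

for img_path in glob.glob('images/*.jpg'):
    xml_path = os.path.splitext(img_path)[0] + '.xml'

    # Resize the image in place.
    img = Image.open(img_path)
    img.resize((int(img.width * SCALE), int(img.height * SCALE)),
               Image.BILINEAR).save(img_path)

    # Apply the same scale factor to the matching VOC annotation.
    tree = ET.parse(xml_path)
    for tag in ('width', 'height'):
        node = tree.find('size/' + tag)
        node.text = str(int(int(node.text) * SCALE))
    for box in tree.iter('bndbox'):
        for tag in ('xmin', 'ymin', 'xmax', 'ymax'):
            node = box.find(tag)
            node.text = str(int(float(node.text) * SCALE))
    tree.write(xml_path)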
I also tried normalizing the images that are written to TensorBoard in model_lib_v2.py,

data = (features[fields.InputDataFields.image] - np.min(features[fields.InputDataFields.image])) / (np.max(features[fields.InputDataFields.image]) - np.min(features[fields.InputDataFields.image]))

according to this solution: https://github.com/tensorflow/models/issues/9115, without any luck. Is there a solution to this problem? It would also be nice if I could monitor the bounding boxes the model proposes in TensorBoard. Thank you.
With some changes to the train_loop function in model_lib_v2.py, you can alternate between training and evaluation in the same process. See the example below.
From what I understand, the TensorFlow Object Detection API was developed with a focus on distributed training: if you are using multiple GPUs/TPUs, some devices can do the training while others do the evaluation. So I suspect that model_lib_v2.py, as currently implemented, does not fully support training and evaluating on the same device.
I am not certain of the root cause of the error you are seeing; typically I have seen TensorFlow throw OOM errors when there is a memory issue, and by default each TensorFlow process tries to reserve most of the GPU's memory for itself. It may also be that the way TensorFlow uses CUDA does not support two applications sharing the same device.
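If it is a memory conflict, one workaround worth trying (I have not verified it against your exact crash) is to cap how much GPU memory the training process may allocate, leaving room for the evaluation process. A minimal sketch, assuming a single GPU and a hypothetical 4096 MB cap; it has to run before any TensorFlow op touches the GPU, e.g. near the top of model_main_tf2.py:

import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Cap this process at ~4 GB so a second process (the evaluation job)
    # can fit on the same card; adjust the split for your GPU.
    # Alternative: tf.config.experimental.set_memory_growth(gpus[0], True)
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=4096)])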
Regarding your second question, I followed the advice in that same thread and it worked for me; the code is duplicated in the third block below. Initially it did not appear to work, because I had naively updated the file in the Object Detection repository I had cloned. Your application may actually be using the Object Detection API installed in your site-packages, so I recommend confirming that the file you are changing is the same one being loaded by your import statements.
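A quick way to check which copy of the module your process actually imports:

import object_detection.model_lib_v2 as model_lib_v2

# If this prints a path under site-packages, that is the copy to edit
# (or reinstall the package from your local repository after editing).
print(model_lib_v2.__file__)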
This first block goes outside of the training loop; it sets up the evaluation data and a separate summary writer:
## Set up evaluation data and writer
eval_config = configs['eval_config']
eval_input_configs = configs['eval_input_configs']
eval_input_config = eval_input_configs[0]
eval_input = strategy.experimental_distribute_dataset(
    inputs.eval_input(
        eval_config=eval_config,
        eval_input_config=eval_input_config,
        model_config=model_config,
        model=detection_model))
summary_writer_eval = tf.compat.v2.summary.create_file_writer(
    os.path.join(model_dir, 'eval', eval_input_config.name))
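Creating the evaluation dataset and its writer once, outside the loop, avoids rebuilding the input pipeline on every evaluation pass, and writing under model_dir/eval/ puts the summaries where TensorBoard already expects evaluation runs.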
This is the modified train/evaluation loop; the evaluation happens at the end of each iteration.
for _ in range(global_step.value(), train_steps, num_steps_per_iteration):
    tf.logging.info('Performing Training')
    with summary_writer_train.as_default():
        with tf.compat.v2.summary.record_if(
            lambda: global_step % num_steps_per_iteration == 0):
            losses_dict = _dist_train_step(train_input_iter)

            time_taken = time.time() - last_step_time
            last_step_time = time.time()
            steps_per_sec = num_steps_per_iteration * 1.0 / time_taken

            tf.compat.v2.summary.scalar(
                'steps_per_sec', steps_per_sec, step=global_step)
            steps_per_sec_list.append(steps_per_sec)

            logged_dict = losses_dict.copy()
            logged_dict['learning_rate'] = learning_rate_fn()
            for key, val in logged_dict.items():
                tf.compat.v2.summary.scalar(key, val, step=global_step)

            if global_step.value() - logged_step >= 0:
                logged_dict_np = {name: value.numpy() for name, value in
                                  logged_dict.items()}
                tf.logging.info(
                    'Step {} per-step time {:.3f}s'.format(
                        global_step.value(),
                        time_taken / num_steps_per_iteration))
                tf.logging.info(pprint.pformat(logged_dict_np, width=40))
                # Custom helper for tracking GPU memory; not part of the API.
                print_gpu_memory_usage()
                logged_step = global_step.value()

    if ((int(global_step.value()) - checkpointed_step) >=
        checkpoint_every_n):
        manager.save()
        checkpointed_step = int(global_step.value())

    # Run a full evaluation pass and write its metrics to the eval writer.
    tf.logging.info('Performing Evaluation')
    with summary_writer_eval.as_default():
        eager_eval_loop(
            detection_model,
            configs,
            eval_input,
            use_tpu=use_tpu,
            global_step=global_step,
        )
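Note that with this loop, evaluation runs once per outer iteration, i.e. every num_steps_per_iteration training steps, so you may want to raise that value if a full COCO evaluation is slow on your dataset. As a side benefit for your last question, eager_eval_loop also writes side-by-side evaluation images with the predicted boxes drawn (controlled by num_visualizations in eval_config, if I remember correctly), so you can inspect the model's proposals in TensorBoard.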
This last block fixes the image rendering in TensorBoard by normalizing the images to [0, 1] before they are written as summaries:
if record_summaries:
    imgs = features[fields.InputDataFields.image][:3]
    # Min-max normalize the pixel values so TensorBoard renders the
    # images correctly.
    imgs = tf.div(
        tf.subtract(imgs, tf.reduce_min(imgs)),
        tf.subtract(tf.reduce_max(imgs), tf.reduce_min(imgs)))
    tf.compat.v2.summary.image(
        name='train_input_images', step=global_step, data=imgs,
        max_outputs=3)