
Caffe - aborted training


I am trying to train a Caffe model from scratch (inside Docker).

pwd:

root@982adaaca24f:~/sharedfolder/caffe/docker/image/happyNet# 

Relevant file paths:

models/
      Custom_Model/
                  deploy.prototxt
                  solver.prototxt
                  train.prototxt
datasets/
        training_set_lmdb/
                         data.mdb (5.01 GB)
                         lock.mdb
        validation_set_lmdb/
                         data.mdb (163.8 GB)
                         lock.mdb

To do that, I'm running:

# ~/caffe/build/tools/caffe train -solver models/Custom_Model/solver.prototxt

but after the net is initialized from its parameters, the mean file is loaded, the LMDB datasets are opened, and everything is set up, the process aborts with:

    I0428 22:51:03.340870    59 caffe.cpp:178] Use CPU.
    I0428 22:51:03.343197    59 solver.cpp:48] Initializing solver from parameters: 
    test_iter: 1
    test_interval: 20
    base_lr: 0.001
    display: 10
    max_iter: 3000
    lr_policy: "fixed"
    momentum: 0.9
    snapshot: 100
    snapshot_prefix: "snapshot"
    solver_mode: CPU
    net: "models/Custom_Model/train.prototxt"
    momentum2: 0.999
    type: "Adam"
    I0428 22:51:03.348469    59 solver.cpp:91] Creating training net from net file: models/Custom_Model/train.prototxt
    I0428 22:51:03.351524    59 upgrade_proto.cpp:52] Attempting to upgrade input file specified using deprecated V1LayerParameter: models/Custom_Model/train.prototxt
    I0428 22:51:03.352391    59 upgrade_proto.cpp:60] Successfully upgraded file specified using deprecated V1LayerParameter
    I0428 22:51:03.353207    59 net.cpp:313] The NetState phase (0) differed from the phase (1) specified by a rule in layer training_test
    I0428 22:51:03.353914    59 net.cpp:49] Initializing net from parameters: 
    name: "CaffeNet"
    state {
      phase: TRAIN
    }
    layer {
      name: "training_train"
      type: "Data"
      top: "data"
      top: "label"
      include {
        phase: TRAIN
      }
      transform_param {
        mean_file: "datasets/mean_training_image.binaryproto"
      }
      data_param {
        source: "datasets/training_set_lmdb"
        batch_size: 400
        backend: LMDB
      }
    }
    layer {
      name: "conv1"
      type: "Convolution"
      bottom: "data"
      top: "conv1"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 96
        kernel_size: 7
        stride: 2
      }
    }
    layer {
      name: "relu1"
      type: "ReLU"
      bottom: "conv1"
      top: "conv1"
    }
    layer {
      name: "norm1"
      type: "LRN"
      bottom: "conv1"
      top: "norm1"
      lrn_param {
        local_size: 5
        alpha: 0.0005
        beta: 0.75
      }
    }
    layer {
      name: "pool1"
      type: "Pooling"
      bottom: "norm1"
      top: "pool1"
      pooling_param {
        pool: MAX
        kernel_size: 3
        stride: 3
      }
    }
    layer {
      name: "conv2"
      type: "Convolution"
      bottom: "pool1"
      top: "conv2"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 256
        pad: 2
        kernel_size: 5
      }
    }
    layer {
      name: "relu2"
      type: "ReLU"
      bottom: "conv2"
      top: "conv2"
    }
    layer {
      name: "pool2"
      type: "Pooling"
      bottom: "conv2"
      top: "pool2"
      pooling_param {
        pool: MAX
        kernel_size: 2
        stride: 2
      }
    }
    layer {
      name: "conv3"
      type: "Convolution"
      bottom: "pool2"
      top: "conv3"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 512
        pad: 1
        kernel_size: 3
      }
    }
    layer {
      name: "relu3"
      type: "ReLU"
      bottom: "conv3"
      top: "conv3"
    }
    layer {
      name: "conv4"
      type: "Convolution"
      bottom: "conv3"
      top: "conv4"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 512
        pad: 1
        kernel_size: 3
      }
    }
    layer {
      name: "relu4"
      type: "ReLU"
      bottom: "conv4"
      top: "conv4"
    }
    layer {
      name: "conv5"
      type: "Convolution"
      bottom: "conv4"
      top: "conv5"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 512
        pad: 1
        kernel_size: 3
      }
    }
    layer {
      name: "relu5"
      type: "ReLU"
      bottom: "conv5"
      top: "conv5"
    }
    layer {
      name: "pool5"
      type: "Pooling"
      bottom: "conv5"
      top: "pool5"
      pooling_param {
        pool: MAX
        kernel_size: 3
        stride: 3
      }
    }
    layer {
      name: "fc6"
      type: "InnerProduct"
      bottom: "pool5"
      top: "fc6"
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 1
      }
      inner_product_param {
        num_output: 4048
      }
    }
    layer {
      name: "relu6"
      type: "ReLU"
      bottom: "fc6"
      top: "fc6"
    }
    layer {
      name: "drop6"
      type: "Dropout"
      bottom: "fc6"
      top: "fc6"
      dropout_param {
        dropout_ratio: 0.5
      }
    }
    layer {
      name: "fc7"
      type: "InnerProduct"
      bottom: "fc6"
      top: "fc7"
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 1
      }
      inner_product_param {
        num_output: 4048
      }
    }
    layer {
      name: "relu7"
      type: "ReLU"
      bottom: "fc7"
      top: "fc7"
    }
    layer {
      name: "drop7"
      type: "Dropout"
      bottom: "fc7"
      top: "fc7"
      dropout_param {
        dropout_ratio: 0.5
      }
    }
    layer {
      name: "fc8_cat"
      type: "InnerProduct"
      bottom: "fc7"
      top: "fc8"
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 1
      }
      inner_product_param {
        num_output: 6
      }
    }
    layer {
      name: "prob"
      type: "SoftmaxWithLoss"
      bottom: "fc8"
      bottom: "label"
    }
    I0428 22:51:03.356101    59 layer_factory.hpp:77] Creating layer training_train
    I0428 22:51:03.357806    59 net.cpp:91] Creating Layer training_train
    I0428 22:51:03.357897    59 net.cpp:399] training_train -> data
    I0428 22:51:03.359665    59 net.cpp:399] training_train -> label
    I0428 22:51:03.359840    59 data_transformer.cpp:25] Loading mean file from: datasets/mean_training_image.binaryproto
    I0428 22:51:03.376284    61 db_lmdb.cpp:35] Opened lmdb datasets/training_set_lmdb
    I0428 22:51:03.380998    59 data_layer.cpp:41] output data size: 400,3,224,224
    I0428 22:51:04.102387    59 net.cpp:141] Setting up training_train
    I0428 22:51:04.102494    59 net.cpp:148] Top shape: 400 3 224 224 (60211200)
    I0428 22:51:04.102694    59 net.cpp:148] Top shape: 400 (400)
    I0428 22:51:04.104347    59 net.cpp:156] Memory required for data: 240846400
    I0428 22:51:04.105435    59 layer_factory.hpp:77] Creating layer conv1
    I0428 22:51:04.107542    59 net.cpp:91] Creating Layer conv1
    I0428 22:51:04.108368    59 net.cpp:425] conv1 <- data
    I0428 22:51:04.109095    59 net.cpp:399] conv1 -> conv1
    I0428 22:51:04.109275    59 net.cpp:141] Setting up conv1
    I0428 22:51:04.109341    59 net.cpp:148] Top shape: 400 96 109 109 (456230400)
    I0428 22:51:04.109398    59 net.cpp:156] Memory required for data: 2065768000
    I0428 22:51:04.109553    59 layer_factory.hpp:77] Creating layer relu1
    I0428 22:51:04.109599    59 net.cpp:91] Creating Layer relu1
    I0428 22:51:04.109633    59 net.cpp:425] relu1 <- conv1
    I0428 22:51:04.109670    59 net.cpp:386] relu1 -> conv1 (in-place)
    I0428 22:51:04.110841    59 net.cpp:141] Setting up relu1
    I0428 22:51:04.111608    59 net.cpp:148] Top shape: 400 96 109 109 (456230400)
    I0428 22:51:04.111649    59 net.cpp:156] Memory required for data: 3890689600
    I0428 22:51:04.111726    59 layer_factory.hpp:77] Creating layer norm1
    I0428 22:51:04.111804    59 net.cpp:91] Creating Layer norm1
    I0428 22:51:04.111929    59 net.cpp:425] norm1 <- conv1
    I0428 22:51:04.111969    59 net.cpp:399] norm1 -> norm1
    I0428 22:51:04.112043    59 net.cpp:141] Setting up norm1
    I0428 22:51:04.112100    59 net.cpp:148] Top shape: 400 96 109 109 (456230400)
    I0428 22:51:04.112149    59 net.cpp:156] Memory required for data: 5715611200
    I0428 22:51:04.112201    59 layer_factory.hpp:77] Creating layer pool1
    I0428 22:51:04.112262    59 net.cpp:91] Creating Layer pool1
    I0428 22:51:04.112313    59 net.cpp:425] pool1 <- norm1
    I0428 22:51:04.112367    59 net.cpp:399] pool1 -> pool1
    I0428 22:51:04.112658    59 net.cpp:141] Setting up pool1
    I0428 22:51:04.112794    59 net.cpp:148] Top shape: 400 96 37 37 (52569600)
    I0428 22:51:04.112848    59 net.cpp:156] Memory required for data: 5925889600
    I0428 22:51:04.112884    59 layer_factory.hpp:77] Creating layer conv2
    I0428 22:51:04.112972    59 net.cpp:91] Creating Layer conv2
    I0428 22:51:04.113026    59 net.cpp:425] conv2 <- pool1
    I0428 22:51:04.113488    59 net.cpp:399] conv2 -> conv2
    I0428 22:51:04.115536    59 net.cpp:141] Setting up conv2
    I0428 22:51:04.115640    59 net.cpp:148] Top shape: 400 256 37 37 (140185600)
    I0428 22:51:04.115696    59 net.cpp:156] Memory required for data: 6486632000
    I0428 22:51:04.115751    59 layer_factory.hpp:77] Creating layer relu2
    I0428 22:51:04.115788    59 net.cpp:91] Creating Layer relu2
    I0428 22:51:04.115888    59 net.cpp:425] relu2 <- conv2
    I0428 22:51:04.115939    59 net.cpp:386] relu2 -> conv2 (in-place)
    I0428 22:51:04.116014    59 net.cpp:141] Setting up relu2
    I0428 22:51:04.116051    59 net.cpp:148] Top shape: 400 256 37 37 (140185600)
    I0428 22:51:04.116106    59 net.cpp:156] Memory required for data: 7047374400
    I0428 22:51:04.116142    59 layer_factory.hpp:77] Creating layer pool2
    I0428 22:51:04.116181    59 net.cpp:91] Creating Layer pool2
    I0428 22:51:04.116235    59 net.cpp:425] pool2 <- conv2
    I0428 22:51:04.116294    59 net.cpp:399] pool2 -> pool2
    I0428 22:51:04.116364    59 net.cpp:141] Setting up pool2
    I0428 22:51:04.116492    59 net.cpp:148] Top shape: 400 256 19 19 (36966400)
    I0428 22:51:04.116545    59 net.cpp:156] Memory required for data: 7195240000
    I0428 22:51:04.116581    59 layer_factory.hpp:77] Creating layer conv3
    I0428 22:51:04.116639    59 net.cpp:91] Creating Layer conv3
    I0428 22:51:04.116670    59 net.cpp:425] conv3 <- pool2
    I0428 22:51:04.116727    59 net.cpp:399] conv3 -> conv3
    I0428 22:51:04.134765    59 net.cpp:141] Setting up conv3
    I0428 22:51:04.134871    59 net.cpp:148] Top shape: 400 512 19 19 (73932800)
    I0428 22:51:04.134928    59 net.cpp:156] Memory required for data: 7490971200
    I0428 22:51:04.135994    59 layer_factory.hpp:77] Creating layer relu3
    I0428 22:51:04.136255    59 net.cpp:91] Creating Layer relu3
    I0428 22:51:04.136296    59 net.cpp:425] relu3 <- conv3
    I0428 22:51:04.136435    59 net.cpp:386] relu3 -> conv3 (in-place)
    I0428 22:51:04.137774    59 net.cpp:141] Setting up relu3
    I0428 22:51:04.139025    59 net.cpp:148] Top shape: 400 512 19 19 (73932800)
    I0428 22:51:04.139958    59 net.cpp:156] Memory required for data: 7786702400
    I0428 22:51:04.140475    59 layer_factory.hpp:77] Creating layer conv4
    I0428 22:51:04.141017    59 net.cpp:91] Creating Layer conv4
    I0428 22:51:04.141383    59 net.cpp:425] conv4 <- conv3
    I0428 22:51:04.141641    59 net.cpp:399] conv4 -> conv4
    I0428 22:51:04.165778    59 net.cpp:141] Setting up conv4
    I0428 22:51:04.165900    59 net.cpp:148] Top shape: 400 512 19 19 (73932800)
    I0428 22:51:04.165962    59 net.cpp:156] Memory required for data: 8082433600
    I0428 22:51:04.168637    59 layer_factory.hpp:77] Creating layer relu4
    I0428 22:51:04.171306    59 net.cpp:91] Creating Layer relu4
    I0428 22:51:04.171368    59 net.cpp:425] relu4 <- conv4
    I0428 22:51:04.171439    59 net.cpp:386] relu4 -> conv4 (in-place)
    I0428 22:51:04.175688    59 net.cpp:141] Setting up relu4
    I0428 22:51:04.175788    59 net.cpp:148] Top shape: 400 512 19 19 (73932800)
    I0428 22:51:04.175819    59 net.cpp:156] Memory required for data: 8378164800
    I0428 22:51:04.175881    59 layer_factory.hpp:77] Creating layer conv5
    I0428 22:51:04.175940    59 net.cpp:91] Creating Layer conv5
    I0428 22:51:04.175971    59 net.cpp:425] conv5 <- conv4
    I0428 22:51:04.176026    59 net.cpp:399] conv5 -> conv5
    I0428 22:51:04.194139    59 net.cpp:141] Setting up conv5
    I0428 22:51:04.194244    59 net.cpp:148] Top shape: 400 512 19 19 (73932800)
    I0428 22:51:04.196287    59 net.cpp:156] Memory required for data: 8673896000
    I0428 22:51:04.201050    59 layer_factory.hpp:77] Creating layer relu5
    I0428 22:51:04.201668    59 net.cpp:91] Creating Layer relu5
    I0428 22:51:04.206367    59 net.cpp:425] relu5 <- conv5
    I0428 22:51:04.206445    59 net.cpp:386] relu5 -> conv5 (in-place)
    I0428 22:51:04.208932    59 net.cpp:141] Setting up relu5
    I0428 22:51:04.209012    59 net.cpp:148] Top shape: 400 512 19 19 (73932800)
    I0428 22:51:04.209039    59 net.cpp:156] Memory required for data: 8969627200
    I0428 22:51:04.209074    59 layer_factory.hpp:77] Creating layer pool5
    I0428 22:51:04.209153    59 net.cpp:91] Creating Layer pool5
    I0428 22:51:04.209192    59 net.cpp:425] pool5 <- conv5
    I0428 22:51:04.210391    59 net.cpp:399] pool5 -> pool5
    I0428 22:51:04.211598    59 net.cpp:141] Setting up pool5
    I0428 22:51:04.216861    59 net.cpp:148] Top shape: 400 512 7 7 (10035200)
    I0428 22:51:04.217041    59 net.cpp:156] Memory required for data: 9009768000
    I0428 22:51:04.217103    59 layer_factory.hpp:77] Creating layer fc6
    I0428 22:51:04.219173    59 net.cpp:91] Creating Layer fc6
    I0428 22:51:04.219277    59 net.cpp:425] fc6 <- pool5
    I0428 22:51:04.219324    59 net.cpp:399] fc6 -> fc6
    I0428 22:51:04.773458    59 net.cpp:141] Setting up fc6
    I0428 22:51:04.777616    59 net.cpp:148] Top shape: 400 4048 (1619200)
    I0428 22:51:04.778857    59 net.cpp:156] Memory required for data: 9016244800
    I0428 22:51:04.781023    59 layer_factory.hpp:77] Creating layer relu6
    I0428 22:51:04.784178    59 net.cpp:91] Creating Layer relu6
    I0428 22:51:04.788236    59 net.cpp:425] relu6 <- fc6
    I0428 22:51:04.790361    59 net.cpp:386] relu6 -> fc6 (in-place)
    I0428 22:51:04.792532    59 net.cpp:141] Setting up relu6
    I0428 22:51:04.792620    59 net.cpp:148] Top shape: 400 4048 (1619200)
    I0428 22:51:04.792671    59 net.cpp:156] Memory required for data: 9022721600
    I0428 22:51:04.792724    59 layer_factory.hpp:77] Creating layer drop6
    I0428 22:51:04.792795    59 net.cpp:91] Creating Layer drop6
    I0428 22:51:04.793380    59 net.cpp:425] drop6 <- fc6
    I0428 22:51:04.793471    59 net.cpp:386] drop6 -> fc6 (in-place)
    I0428 22:51:04.794314    59 net.cpp:141] Setting up drop6
    I0428 22:51:04.795964    59 net.cpp:148] Top shape: 400 4048 (1619200)
    I0428 22:51:04.796800    59 net.cpp:156] Memory required for data: 9029198400
    I0428 22:51:04.797582    59 layer_factory.hpp:77] Creating layer fc7
    I0428 22:51:04.797665    59 net.cpp:91] Creating Layer fc7
    I0428 22:51:04.798545    59 net.cpp:425] fc7 <- fc6
    I0428 22:51:04.798630    59 net.cpp:399] fc7 -> fc7
    I0428 22:51:04.828491    62 blocking_queue.cpp:50] Waiting for data
    I0428 22:51:04.880416    59 net.cpp:141] Setting up fc7
    I0428 22:51:04.880659    59 net.cpp:148] Top shape: 400 4048 (1619200)
    I0428 22:51:04.880733    59 net.cpp:156] Memory required for data: 9035675200
    I0428 22:51:04.880820    59 layer_factory.hpp:77] Creating layer relu7
    I0428 22:51:04.880908    59 net.cpp:91] Creating Layer relu7
    I0428 22:51:04.880982    59 net.cpp:425] relu7 <- fc7
    I0428 22:51:04.881057    59 net.cpp:386] relu7 -> fc7 (in-place)
    I0428 22:51:04.881140    59 net.cpp:141] Setting up relu7
    I0428 22:51:04.881214    59 net.cpp:148] Top shape: 400 4048 (1619200)
    I0428 22:51:04.881286    59 net.cpp:156] Memory required for data: 9042152000
    I0428 22:51:04.881357    59 layer_factory.hpp:77] Creating layer drop7
    I0428 22:51:04.881438    59 net.cpp:91] Creating Layer drop7
    I0428 22:51:04.881507    59 net.cpp:425] drop7 <- fc7
    I0428 22:51:04.881594    59 net.cpp:386] drop7 -> fc7 (in-place)
    I0428 22:51:04.881676    59 net.cpp:141] Setting up drop7
    I0428 22:51:04.881752    59 net.cpp:148] Top shape: 400 4048 (1619200)
    I0428 22:51:04.881820    59 net.cpp:156] Memory required for data: 9048628800
    I0428 22:51:04.881891    59 layer_factory.hpp:77] Creating layer fc8_cat
    I0428 22:51:04.881965    59 net.cpp:91] Creating Layer fc8_cat
    I0428 22:51:04.882040    59 net.cpp:425] fc8_cat <- fc7
    I0428 22:51:04.882113    59 net.cpp:399] fc8_cat -> fc8
    I0428 22:51:04.882292    59 net.cpp:141] Setting up fc8_cat
    I0428 22:51:04.882369    59 net.cpp:148] Top shape: 400 6 (2400)
    I0428 22:51:04.882429    59 net.cpp:156] Memory required for data: 9048638400
    I0428 22:51:04.882500    59 layer_factory.hpp:77] Creating layer prob
    I0428 22:51:04.882591    59 net.cpp:91] Creating Layer prob
    I0428 22:51:04.882678    59 net.cpp:425] prob <- fc8
    I0428 22:51:04.886852    59 net.cpp:425] prob <- label
    I0428 22:51:04.886905    59 net.cpp:399] prob -> (automatic)
    I0428 22:51:04.887187    59 layer_factory.hpp:77] Creating layer prob
    I0428 22:51:04.888458    59 net.cpp:141] Setting up prob
    I0428 22:51:04.888552    59 net.cpp:148] Top shape: (1)
    I0428 22:51:04.888584    59 net.cpp:151]     with loss weight 1
    I0428 22:51:04.888667    59 net.cpp:156] Memory required for data: 9048638404
    I0428 22:51:04.888703    59 net.cpp:217] prob needs backward computation.
    I0428 22:51:04.888746    59 net.cpp:217] fc8_cat needs backward computation.
    I0428 22:51:04.888803    59 net.cpp:217] drop7 needs backward computation.
    I0428 22:51:04.888860    59 net.cpp:217] relu7 needs backward computation.
    I0428 22:51:04.888916    59 net.cpp:217] fc7 needs backward computation.
    I0428 22:51:04.888969    59 net.cpp:217] drop6 needs backward computation.
    I0428 22:51:04.889027    59 net.cpp:217] relu6 needs backward computation.
    I0428 22:51:04.889086    59 net.cpp:217] fc6 needs backward computation.
   (...)
    I0428 22:51:04.896559    59 net.cpp:274] Network initialization done.
    I0428 22:51:04.908800    59 upgrade_proto.cpp:52] Attempting to upgrade input file specified using deprecated V1LayerParameter: models/Custom_Model/train.prototxt
    I0428 22:51:04.909487    59 upgrade_proto.cpp:60] Successfully upgraded file specified using deprecated V1LayerParameter
    I0428 22:51:04.910534    59 solver.cpp:181] Creating test net (#0) specified by net file: models/Custom_Model/train.prototxt
    I0428 22:51:04.910686    59 net.cpp:313] The NetState phase (1) differed from the phase (0) specified by a rule in layer training_train
    I0428 22:51:04.912101    59 net.cpp:49] Initializing net from parameters: 
    name: "CaffeNet"
    state {
      phase: TEST
    }
    layer {
      name: "training_test"
      type: "Data"
      top: "data"
      top: "label"
      include {
        phase: TEST
      }
      transform_param {
        mean_file: "datasets/mean_training_image.binaryproto"
      }
      data_param {
        source: "datasets/validation_set_lmdb"
        batch_size: 14
        backend: LMDB
      }
    }
    layer {
      name: "conv1"
      type: "Convolution"
      bottom: "data"
      top: "conv1"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 96
        kernel_size: 7
        stride: 2
      }
    }
    layer {
      name: "relu1"
      type: "ReLU"
      bottom: "conv1"
      top: "conv1"
    }
    layer {
      name: "norm1"
      type: "LRN"
      bottom: "conv1"
      top: "norm1"
      lrn_param {
        local_size: 5
        alpha: 0.0005
        beta: 0.75
      }
    }
    layer {
      name: "pool1"
      type: "Pooling"
      bottom: "norm1"
      top: "pool1"
      pooling_param {
        pool: MAX
        kernel_size: 3
        stride: 3
      }
    }
    layer {
      name: "conv2"
      type: "Convolution"
      bottom: "pool1"
      top: "conv2"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 256
        pad: 2
        kernel_size: 5
      }
    }
    layer {
      name: "relu2"
      type: "ReLU"
      bottom: "conv2"
      top: "conv2"
    }
    layer {
      name: "pool2"
      type: "Pooling"
      bottom: "conv2"
      top: "pool2"
      pooling_param {
        pool: MAX
        kernel_size: 2
        stride: 2
      }
    }
    layer {
      name: "conv3"
      type: "Convolution"
      bottom: "pool2"
      top: "conv3"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 512
        pad: 1
        kernel_size: 3
      }
    }
    layer {
      name: "relu3"
      type: "ReLU"
      bottom: "conv3"
      top: "conv3"
    }
    layer {
      name: "conv4"
      type: "Convolution"
      bottom: "conv3"
      top: "conv4"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 512
        pad: 1
        kernel_size: 3
      }
    }
    layer {
      name: "relu4"
      type: "ReLU"
      bottom: "conv4"
      top: "conv4"
    }
    layer {
      name: "conv5"
      type: "Convolution"
      bottom: "conv4"
      top: "conv5"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 512
        pad: 1
        kernel_size: 3
      }
    }
    layer {
      name: "relu5"
      type: "ReLU"
      bottom: "conv5"
      top: "conv5"
    }
    layer {
      name: "pool5"
      type: "Pooling"
      bottom: "conv5"
      top: "pool5"
      pooling_param {
        pool: MAX
        kernel_size: 3
        stride: 3
      }
    }
    layer {
      name: "fc6"
      type: "InnerProduct"
      bottom: "pool5"
      top: "fc6"
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 1
      }
      inner_product_param {
        num_output: 4048
      }
    }
    layer {
      name: "relu6"
      type: "ReLU"
      bottom: "fc6"
      top: "fc6"
    }
    layer {
      name: "drop6"
      type: "Dropout"
      bottom: "fc6"
      top: "fc6"
      dropout_param {
        dropout_ratio: 0.5
      }
    }
    layer {
      name: "fc7"
      type: "InnerProduct"
      bottom: "fc6"
      top: "fc7"
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 1
      }
      inner_product_param {
        num_output: 4048
      }
    }
    layer {
      name: "relu7"
      type: "ReLU"
      bottom: "fc7"
      top: "fc7"
    }
    layer {
      name: "drop7"
      type: "Dropout"
      bottom: "fc7"
      top: "fc7"
      dropout_param {
        dropout_ratio: 0.5
      }
    }
    layer {
      name: "fc8_cat"
      type: "InnerProduct"
      bottom: "fc7"
      top: "fc8"
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 1
      }
      inner_product_param {
        num_output: 6
      }
    }
    layer {
      name: "prob"
      type: "SoftmaxWithLoss"
      bottom: "fc8"
      bottom: "label"
    }
    I0428 22:51:04.915211    59 layer_factory.hpp:77] Creating layer training_test
    I0428 22:51:04.916718    59 net.cpp:91] Creating Layer training_test
    I0428 22:51:04.916820    59 net.cpp:399] training_test -> data
    I0428 22:51:04.916895    59 net.cpp:399] training_test -> label
    I0428 22:51:04.916968    59 data_transformer.cpp:25] Loading mean file from: datasets/mean_training_image.binaryproto
    I0428 22:51:04.957635    63 db_lmdb.cpp:35] Opened lmdb datasets/validation_set_lmdb
    I0428 22:51:04.966471    59 data_layer.cpp:41] output data size: 14,3,224,224
    I0428 22:51:04.986405    59 net.cpp:141] Setting up training_test
    I0428 22:51:04.987761    59 net.cpp:148] Top shape: 14 3 224 224 (2107392)
    I0428 22:51:04.988591    59 net.cpp:148] Top shape: 14 (14)
    I0428 22:51:04.988828    59 net.cpp:156] Memory required for data: 8429624
    I0428 22:51:04.991192    59 layer_factory.hpp:77] Creating layer conv1
    I0428 22:51:04.992264    59 net.cpp:91] Creating Layer conv1
    I0428 22:51:04.992722    59 net.cpp:425] conv1 <- data
    I0428 22:51:04.993867    59 net.cpp:399] conv1 -> conv1
    I0428 22:51:04.994596    59 net.cpp:141] Setting up conv1
   (...)
    I0428 22:51:05.945319    59 net.cpp:274] Network initialization done.
    I0428 22:51:05.946696    59 solver.cpp:60] Solver scaffolding done.
    I0428 22:51:05.948148    59 caffe.cpp:219] Starting Optimization
    I0428 22:51:05.948653    59 solver.cpp:279] Solving CaffeNet
    I0428 22:51:05.949687    59 solver.cpp:280] Learning Rate Policy: fixed
    I0428 22:51:10.701836    59 solver.cpp:337] Iteration 0, Testing net (#0)
    I0428 22:51:10.705909    59 net.cpp:684] Ignoring source layer training_train
    Killed

Can somebody show me what I am doing wrong here?

EDIT: as a side note for future reference, the accepted answer solved the problem, but training was then aborted at the first snapshot.

Apparently memory was still an issue, and, to make it work, I had to add this line to solver.prototxt, probably because of the size of data.mdb (5.01 GB):

snapshot_format: HDF5

related: https://github.com/BVLC/caffe/pull/2836

Then it all worked.
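
For reference, the resulting solver.prototxt looks roughly like this. It is only a sketch reassembled from the solver parameters printed at the start of the log above; the snapshot_format line is the single addition:

    # Sketch of solver.prototxt, reconstructed from the parameters
    # logged at startup; only the snapshot_format line is new.
    net: "models/Custom_Model/train.prototxt"
    type: "Adam"
    base_lr: 0.001
    lr_policy: "fixed"
    momentum: 0.9
    momentum2: 0.999
    test_iter: 1
    test_interval: 20
    display: 10
    max_iter: 3000
    snapshot: 100
    snapshot_prefix: "snapshot"
    snapshot_format: HDF5  # HDF5 instead of the default BINARYPROTO
    solver_mode: CPU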


Solution

  • You're using solver_mode: CPU. Did you check CPU memory utilisation when this training starts?

    You are using a very large training batch_size:

    layer {
      name: "training_train"
      type: "Data"
      top: "data"
      top: "label"
      include {
        phase: TRAIN
      }
      transform_param {
        mean_file: "datasets/mean_training_image.binaryproto"
      }
      data_param {
        source: "datasets/training_set_lmdb"
        batch_size: 400
        backend: LMDB
      }
    }
    

    So it looks like the memory needed for batch_size: 400 is not available on your system.

    Reduce the batch_size and try training again; start with, say, batch_size: 20 (the sketch after this answer shows the change).

    Once you know how much memory is available on your system, you can roughly work out the largest batch_size you can use.
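
The log itself gives you the numbers for that rough calculation: the training net reports "Memory required for data: 9048638404", i.e. about 9 GB of top-blob activations alone at batch_size: 400, and the true footprint is higher still once parameter blobs and the backward-pass diff blobs are counted. Activation memory scales linearly with batch size, so going from 400 down to 20 cuts that 9 GB to roughly 450 MB. As a sketch, the only change needed in train.prototxt is the batch_size value:

    layer {
      name: "training_train"
      type: "Data"
      top: "data"
      top: "label"
      include {
        phase: TRAIN
      }
      transform_param {
        mean_file: "datasets/mean_training_image.binaryproto"
      }
      data_param {
        source: "datasets/training_set_lmdb"
        batch_size: 20   # was 400; activation memory scales linearly with this
        backend: LMDB
      }
    }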