matlab deep-learning regression convergence matconvnet

Deep Neural Network training, why is the network training not converging?

I'm using MATCONVNET DagNN. Using AlexNet architecture. The last few layers of my architecture are

  [![net = dagnn.DagNN() ;
  imdb_32 =load('imdb_all_32_pd_norm.mat');
  imdb_32=imdb_32.imdb;
  % some common options
  opts.train.batchSize = 100;
  opts.train.numEpochs = 100 ;
  opts.train.continue = true ;
  opts.train.gpus = \[\] ;
  opts.train.learningRate = 0.2;%\[0.1 * ones(1,30), 0.01*ones(1,30), 0.001*ones(1,30)\] ;%0.002;%\[2e-1*ones(1, 10),  2e-2*ones(1, 5)\];
  opts.train.momentum = 0.9;
  opts.train.expDir = expDir;
  opts.train.numSubBatches = 1;

  bopts.useGpu =0;%numel(opts.train.gpus) >  0 ;

  %% NET
  net.addLayer('conv1', dagnn.Conv('size', \[11 11 3 96\], 'hasBias', true, 'stride', \[4, 4\], 'pad', \[20 20 20 20\]), {'input'}, {'conv1'},  {'conv1f'  'conv1b'});
  net.addLayer('relu1', dagnn.ReLU(), {'conv1'}, {'relu1'}, {});
  net.addLayer('lrn1', dagnn.LRN('param', \[5 1 2.0000e-05 0.7500\]), {'relu1'}, {'lrn1'}, {});
  net.addLayer('pool1', dagnn.Pooling('method', 'max', 'poolSize', \[3, 3\], 'stride', \[2 2\], 'pad', \[0 0 0 0\]), {'lrn1'}, {'pool1'}, {});

  net.addLayer('conv2', dagnn.Conv('size', \[5 5 48 256\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[2 2 2 2\]), {'pool1'}, {'conv2'},  {'conv2f'  'conv2b'});
  net.addLayer('relu2', dagnn.ReLU(), {'conv2'}, {'relu2'}, {});
  net.addLayer('lrn2', dagnn.LRN('param', \[5 1 2.0000e-05 0.7500\]), {'relu2'}, {'lrn2'}, {});
  net.addLayer('pool2', dagnn.Pooling('method', 'max', 'poolSize', \[3, 3\], 'stride', \[2 2\], 'pad', \[0 0 0 0\]), {'lrn2'}, {'pool2'}, {});
  net.addLayer('drop2',dagnn.DropOut('rate',0.7),{'pool2'},{'drop2'});

  net.addLayer('conv3', dagnn.Conv('size', \[3 3 256 384\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[1 1 1 1\]), {'drop2'}, {'conv3'},  {'conv3f'  'conv3b'});
  net.addLayer('relu3', dagnn.ReLU(), {'conv3'}, {'relu3'}, {});

  net.addLayer('conv4', dagnn.Conv('size', \[3 3 192 384\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[1 1 1 1\]), {'relu3'}, {'conv4'},  {'conv4f'  'conv4b'});
  net.addLayer('relu4', dagnn.ReLU(), {'conv4'}, {'relu4'}, {});

  net.addLayer('conv5', dagnn.Conv('size', \[3 3 192 256\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[1 1 1 1\]), {'relu4'}, {'conv5'},  {'conv5f'  'conv5b'});
  net.addLayer('relu5', dagnn.ReLU(), {'conv5'}, {'relu5'}, {});
  net.addLayer('pool5', dagnn.Pooling('method', 'max', 'poolSize', \[3 3\], 'stride', \[2 2\], 'pad', \[0 0 0 0\]), {'relu5'}, {'pool5'}, {});
  net.addLayer('drop5',dagnn.DropOut('rate',0.5),{'pool5'},{'drop5'});

  net.addLayer('fc6', dagnn.Conv('size', \[1 1 256 4096\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[0 0 0 0\]), {'drop5'}, {'fc6'},  {'conv6f'  'conv6b'});
  net.addLayer('relu6', dagnn.ReLU(), {'fc6'}, {'relu6'}, {});

  net.addLayer('fc7', dagnn.Conv('size', \[1 1 4096 4096\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[0 0 0 0\]), {'relu6'}, {'fc7'},  {'conv7f'  'conv7b'});
  net.addLayer('relu7', dagnn.ReLU(), {'fc7'}, {'relu7'}, {});
  classLabels=max(unique(imdb_32.images.labels));
  net.addLayer('classifier', dagnn.Conv('size', \[1 1 4096 1\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[0 0 0 0\]), {'relu7'}, {'prediction'},  {'conv8f'  'conv8b'});
   net.addLayer('prob', dagnn.SoftMax(), {'prediction'}, {'prob'}, {});
  net.addLayer('l2_loss', dagnn.L2Loss(), {'prob', 'label'}, {'objective'});
  net.addLayer('error', dagnn.Loss('loss', 'classerror'), {'prob','label'}, 'error') ;

  opts.colorDeviation = zeros(3) ;
  net.meta.augmentation.jitterFlip = true ;
  net.meta.augmentation.jitterLocation = true ;
  net.meta.augmentation.jitterFlip = true ;
  net.meta.augmentation.jitterBrightness = double(0.1 * opts.colorDeviation) ;
  net.meta.augmentation.jitterAspect = \[3/4, 4/3\] ;
  net.meta.augmentation.jitterScale  = \[0.4, 1.1\] ;
  net.meta.augmentation.jitterSaturation = 0.4 ;
  net.meta.augmentation.jitterContrast = 0.4 ;
  % net.meta.augmentation.jitterAspect = \[2/3, 3/2\] ;
  net.meta.normalization.averageImage=imdb_32.images.data_mean;
  initNet_He(net);

  info = cnn_train_dag(net, imdb_32, @(i,b) getBatch(bopts,i,b), opts.train, 'val', find(imdb_32.images.set == 2)) ;][1]][1]

and The result of each epoch is shown in attachment. Why isn't the error and Objective converging? The regression loss is the MSE loss.

Solution

For each individual conv filters' bias and initialization, the parameters have to be chosen based on application at hand. This result is due to signal fading after passing through different filters.