I'm using MatConvNet's DagNN wrapper with an AlexNet architecture. The last few layers of my architecture are shown below:
[![net = dagnn.DagNN() ;
% Script overview: builds an AlexNet-style DagNN, attaches an L2 loss and a
% class-error layer, sets augmentation/normalization metadata, and trains
% with cnn_train_dag on the loaded image database.
% Load the image database from the MAT file and unwrap its 'imdb' field.
imdb_32 =load('imdb_all_32_pd_norm.mat');
imdb_32=imdb_32.imdb;
% some common options
opts.train.batchSize = 100;
opts.train.numEpochs = 100 ;
opts.train.continue = true ;
% Empty GPU list; bopts.useGpu below is also set to 0.
opts.train.gpus = \[\] ;
% A single constant learning rate of 0.2 is used; the per-epoch schedules
% in the trailing comment are disabled.
opts.train.learningRate = 0.2;%\[0.1 * ones(1,30), 0.01*ones(1,30), 0.001*ones(1,30)\] ;%0.002;%\[2e-1*ones(1, 10), 2e-2*ones(1, 5)\];
opts.train.momentum = 0.9;
% NOTE(review): expDir is not defined anywhere in this script as shown;
% it must already exist in the workspace - confirm.
opts.train.expDir = expDir;
opts.train.numSubBatches = 1;
% Options forwarded to getBatch by the training call at the end of the script.
bopts.useGpu =0;%numel(opts.train.gpus) > 0 ;
%% NET
% Block 1: 11x11 convolution (3 -> 96 channels, stride 4, pad 20 on every
% side), then ReLU, LRN and 3x3 max pooling with stride 2.
net.addLayer('conv1', dagnn.Conv('size', \[11 11 3 96\], 'hasBias', true, 'stride', \[4, 4\], 'pad', \[20 20 20 20\]), {'input'}, {'conv1'}, {'conv1f' 'conv1b'});
net.addLayer('relu1', dagnn.ReLU(), {'conv1'}, {'relu1'}, {});
net.addLayer('lrn1', dagnn.LRN('param', \[5 1 2.0000e-05 0.7500\]), {'relu1'}, {'lrn1'}, {});
net.addLayer('pool1', dagnn.Pooling('method', 'max', 'poolSize', \[3, 3\], 'stride', \[2 2\], 'pad', \[0 0 0 0\]), {'lrn1'}, {'pool1'}, {});
% Block 2: 5x5 convolution with 256 filters of depth 48.
% NOTE(review): conv1 emits 96 channels but the filter depth here is 48;
% presumably this relies on MatConvNet filter groups (96/48 = 2) - confirm.
net.addLayer('conv2', dagnn.Conv('size', \[5 5 48 256\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[2 2 2 2\]), {'pool1'}, {'conv2'}, {'conv2f' 'conv2b'});
net.addLayer('relu2', dagnn.ReLU(), {'conv2'}, {'relu2'}, {});
net.addLayer('lrn2', dagnn.LRN('param', \[5 1 2.0000e-05 0.7500\]), {'relu2'}, {'lrn2'}, {});
net.addLayer('pool2', dagnn.Pooling('method', 'max', 'poolSize', \[3, 3\], 'stride', \[2 2\], 'pad', \[0 0 0 0\]), {'lrn2'}, {'pool2'}, {});
% Dropout with rate 0.7 is inserted between pool2 and conv3.
net.addLayer('drop2',dagnn.DropOut('rate',0.7),{'pool2'},{'drop2'});
% Blocks 3-5: three 3x3 convolutions (stride 1, pad 1), each followed by ReLU.
% NOTE(review): conv4 and conv5 use filter depth 192 while their inputs come
% from 384-filter layers; presumably filter groups again - confirm.
net.addLayer('conv3', dagnn.Conv('size', \[3 3 256 384\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[1 1 1 1\]), {'drop2'}, {'conv3'}, {'conv3f' 'conv3b'});
net.addLayer('relu3', dagnn.ReLU(), {'conv3'}, {'relu3'}, {});
net.addLayer('conv4', dagnn.Conv('size', \[3 3 192 384\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[1 1 1 1\]), {'relu3'}, {'conv4'}, {'conv4f' 'conv4b'});
net.addLayer('relu4', dagnn.ReLU(), {'conv4'}, {'relu4'}, {});
net.addLayer('conv5', dagnn.Conv('size', \[3 3 192 256\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[1 1 1 1\]), {'relu4'}, {'conv5'}, {'conv5f' 'conv5b'});
net.addLayer('relu5', dagnn.ReLU(), {'conv5'}, {'relu5'}, {});
net.addLayer('pool5', dagnn.Pooling('method', 'max', 'poolSize', \[3 3\], 'stride', \[2 2\], 'pad', \[0 0 0 0\]), {'relu5'}, {'pool5'}, {});
% Dropout with rate 0.5 ahead of the 1x1 convolution layers.
net.addLayer('drop5',dagnn.DropOut('rate',0.5),{'pool5'},{'drop5'});
% fc6 / fc7 are expressed as 1x1 convolutions (256 -> 4096 -> 4096).
% NOTE(review): these act as fully connected layers only if pool5 is 1x1
% spatially for the input size in use - confirm.
net.addLayer('fc6', dagnn.Conv('size', \[1 1 256 4096\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[0 0 0 0\]), {'drop5'}, {'fc6'}, {'conv6f' 'conv6b'});
net.addLayer('relu6', dagnn.ReLU(), {'fc6'}, {'relu6'}, {});
net.addLayer('fc7', dagnn.Conv('size', \[1 1 4096 4096\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[0 0 0 0\]), {'relu6'}, {'fc7'}, {'conv7f' 'conv7b'});
net.addLayer('relu7', dagnn.ReLU(), {'fc7'}, {'relu7'}, {});
% Largest label value in the database.
% NOTE(review): classLabels is not used anywhere else in this script.
classLabels=max(unique(imdb_32.images.labels));
% The classifier maps 4096 channels to a single output channel.
net.addLayer('classifier', dagnn.Conv('size', \[1 1 4096 1\], 'hasBias', true, 'stride', \[1, 1\], 'pad', \[0 0 0 0\]), {'relu7'}, {'prediction'}, {'conv8f' 'conv8b'});
% NOTE(review): softmax is applied to a one-channel prediction. If
% dagnn.SoftMax normalizes across channels (as vl_nnsoftmax documents), the
% output is identically 1, 'prob' is constant, and no gradient reaches the
% layers above - a likely cause of a flat objective. For regression,
% consider feeding 'prediction' to the loss directly - confirm.
net.addLayer('prob', dagnn.SoftMax(), {'prediction'}, {'prob'}, {});
% NOTE(review): dagnn.L2Loss is not defined in this script; presumably a
% custom layer - verify its forward/backward implementation.
net.addLayer('l2_loss', dagnn.L2Loss(), {'prob', 'label'}, {'objective'});
% NOTE(review): 'classerror' is computed on the same one-channel 'prob';
% it is probably not meaningful for a single-output regression - confirm.
net.addLayer('error', dagnn.Loss('loss', 'classerror'), {'prob','label'}, 'error') ;
% Augmentation metadata stored on the network. colorDeviation is a 3x3 zero
% matrix, so jitterBrightness below is also all zeros.
opts.colorDeviation = zeros(3) ;
net.meta.augmentation.jitterFlip = true ;
net.meta.augmentation.jitterLocation = true ;
% Redundant: jitterFlip was already set to true two lines above.
net.meta.augmentation.jitterFlip = true ;
net.meta.augmentation.jitterBrightness = double(0.1 * opts.colorDeviation) ;
net.meta.augmentation.jitterAspect = \[3/4, 4/3\] ;
net.meta.augmentation.jitterScale = \[0.4, 1.1\] ;
net.meta.augmentation.jitterSaturation = 0.4 ;
net.meta.augmentation.jitterContrast = 0.4 ;
% net.meta.augmentation.jitterAspect = \[2/3, 3/2\] ;
% Normalization metadata: the mean image stored in the database.
net.meta.normalization.averageImage=imdb_32.images.data_mean;
% NOTE(review): initNet_He is defined elsewhere and its return value is not
% captured; presumably it initializes the parameters of net in place - confirm.
initNet_He(net);
% Train with cnn_train_dag; getBatch receives bopts plus the two arguments
% supplied by the trainer, and images with set == 2 form the validation subset.
info = cnn_train_dag(net, imdb_32, @(i,b) getBatch(bopts,i,b), opts.train, 'val', find(imdb_32.images.set == 2)) ;][1]][1]
The result of each epoch is shown in the attachment. Why aren't the error and the objective converging? The regression loss is the MSE loss.
The bias and the weight initialization of each individual conv filter have to be chosen based on the application at hand. This result is due to the signal fading as it passes through the successive filters.