So I'm designing a CNN in Java, and I'm at the point where I really want to parallelize the convolution and pooling. This is my approach (rows, columns, inputLayer, convLayer, poolLayer, and features have already been initialized in the constructor):
int padding = 3;
int filterSize = 2 * padding + 1;

// Flatten the 2D input into a 1D array (Aparapi kernels only accept 1D arrays).
int[] input = new int[rows * columns];
for (int r = 0; r < rows; r++)
    System.arraycopy(inputLayer[r], 0, input, r * columns, columns);

// Flatten the four filters the same way.
int[] filters = new int[4 * filterSize * filterSize];
for (int fl = 0; fl < 4; fl++)
    for (int fr = 0; fr < filterSize; fr++)
        System.arraycopy(features[fl][fr], 0, filters, fl * filterSize * filterSize + fr * filterSize, filterSize);

float[] conv = new float[4 * rows * columns];
float[] pool = new float[rows * columns];

Range convRange = Range.create3D(columns, rows, 4, 2, 2, 2);
Kernel convKernel = new Kernel() {
    int h = rows;
    int w = columns;
    int p = padding;
    int fs = filterSize;

    public void run() {
        int val = 0;
        int c = getGlobalId(0);
        int r = getGlobalId(1);
        int l = getGlobalId(2);
        // Clamp the filter window so it never reads outside the (zero-padded) input.
        int upper = max(0, p - r);
        int lower = min(fs, h + p - r);
        int left = max(0, p - c);
        int right = min(fs, w + p - c);
        for (int i = upper; i < lower; i++)
            for (int j = left; j < right; j++)
                val += input[(r + i - p) * w + c + j - p] * filters[l * fs * fs + i * fs + j];
        conv[l * h * w + r * w + c] = Math.round(100.00f * val / fs) / 100.00f;
    }
};
convKernel.setExplicit(true);
convKernel.put(input);
convKernel.put(conv);
convKernel.put(filters);
convKernel.execute(convRange);
convKernel.get(conv);

// Un-flatten the result back into the 3D conv layer.
for (int convL = 0; convL < 4; convL++)
    for (int convR = 0; convR < rows; convR++)
        System.arraycopy(conv, convL * rows * columns + convR * columns, convLayer[convL][convR], 0, columns);

Range poolRange = Range.create3D(columns / 2, rows / 2, 4, 2, 2, 2);
Kernel poolKernel = new Kernel() {
    public void run() {
        int wt = columns;
        int ht = rows;
        float val = 0.00f;
        int c = getGlobalId(0);
        int r = getGlobalId(1);
        int l = getGlobalId(2);
        // 2x2 max pooling over the activated convolution output.
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 2; j++)
                val = max(val, leakyReLU(conv[l * ht * wt + (2 * r + i) * wt + 2 * c + j]));
        pool[(l * ht * wt / 4) + (r * wt / 2) + c] = Math.round(100.00f * val) / 100.00f;
    }
};
poolKernel.setExplicit(true);
poolKernel.put(conv);
poolKernel.put(pool);
poolKernel.execute(poolRange);
poolKernel.get(pool);

for (int poolL = 0; poolL < 4; poolL++)
    for (int poolR = 0; poolR < rows / 2; poolR++)
        System.arraycopy(pool, (poolL * rows * columns / 4) + (poolR * columns / 2), poolLayer[poolL][poolR], 0, columns / 2);
Not the prettiest piece of code, but I haven't used Java in ages, let alone Aparapi.
Initially I passed the original 2D arrays in directly, but the API reported that it doesn't support them and switched to native mode. Converting everything to 1D arrays is supposed to work, but now I get this message:
VIII 09, 2022 9:03:02 PM com.aparapi.internal.model.MethodModel init
WARNING: Method max(FF)F does not contain a LocalVariableTable entry (source not compiled with -g) codegen will attempt to create a synthetic table based on bytecode. This is experimental!!
VIII 09, 2022 9:03:02 PM com.aparapi.internal.kernel.KernelRunner fallBackToNextDevice
WARNING: Device failed for NeuralNetwork$2, devices={NVIDIA|Intel|Java Alternative Algorithm|Java Thread Pool}: null
So it looks like poolKernel can't resolve the max function and the whole thing falls back to the CPU.
When debugging, I can confirm that it uses only 12 threads, which is the number my Intel Core i7 supports. The GPU is an NVIDIA GeForce GTX 1650 with 896 cores, so that's where I would expect the work to run instead.
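(For anyone wanting to confirm where a kernel actually ended up running, a quick diagnostic sketch is to query the kernel after execute(). getExecutionMode() is deprecated in recent Aparapi releases but, as far as I know, still present; you can also request a mode up front with -Dcom.aparapi.executionMode=GPU so failures are loud instead of silently falling back.)

// Diagnostic sketch: after poolKernel.execute(poolRange), print the mode
// Aparapi actually settled on. GPU means OpenCL on the graphics card; JTP
// means the Java thread pool fallback (which would explain the 12 CPU threads).
System.out.println("pool kernel ran in mode: " + poolKernel.getExecutionMode());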
Also, at the end it says:
WARNING: Aparapi is running on an untested OpenCL platform version: OpenCL 3.0 CUDA 11.3.123
WARNING: Aparapi is running on an untested OpenCL platform version: OpenCL 3.0
What am I missing?
P.S.: As you would imagine, I'm new to both conv nets and GPGPU. I know there's a library that contains all the needed CNN functions (cuDNN), but I want to implement it myself to really understand how it works.
Well... sometimes, apparently, one needs to write the question down to be able to answer it. I did some reworking (mainly replacing the Kernel.max(float, float) call in the pooling kernel with an explicit comparison, and passing the dimensions in through an explicitly put params buffer), and now all the errors seem to be gone:
int padding = 3;
int filterSize = 2 * padding + 1;
int[] params = {rows, columns, padding, filterSize};

int[] input = new int[rows * columns];
for (int r = 0; r < rows; r++)
    System.arraycopy(inputLayer[r], 0, input, r * columns, columns);

int[] filters = new int[4 * filterSize * filterSize];
for (int fl = 0; fl < 4; fl++)
    for (int fr = 0; fr < filterSize; fr++)
        System.arraycopy(features[fl][fr], 0, filters, fl * filterSize * filterSize + fr * filterSize, filterSize);

float[] conv = new float[4 * rows * columns];
float[] pool = new float[rows * columns];

Range convRange = Range.create3D(columns, rows, 4);
Kernel convKernel = new Kernel() {
    // Dimensions now come from the explicitly put params buffer instead of
    // captured local variables.
    final int h = params[0];
    final int w = params[1];
    final int p = params[2];
    final int fs = params[3];

    public void run() {
        int val = 0;
        final int c = getGlobalId(0);
        final int r = getGlobalId(1);
        final int l = getGlobalId(2);
        // Clamp the filter window so it never reads outside the (zero-padded) input.
        final int upper = max(0, p - r);
        final int lower = min(fs, h + p - r);
        final int left = max(0, p - c);
        final int right = min(fs, w + p - c);
        for (int i = upper; i < lower; i++)
            for (int j = left; j < right; j++)
                val += input[(r + i - p) * w + c + j - p] * filters[l * fs * fs + i * fs + j];
        conv[l * h * w + r * w + c] = Math.round(100.00f * val / fs) / 100.00f;
    }
};
convKernel.setExplicit(true);
convKernel.put(params);
convKernel.put(input);
convKernel.put(conv);
convKernel.put(filters);
convKernel.execute(convRange);
convKernel.get(conv);

for (int convL = 0; convL < 4; convL++)
    for (int convR = 0; convR < rows; convR++)
        System.arraycopy(conv, convL * rows * columns + convR * columns, convLayer[convL][convR], 0, columns);

Range poolRange = Range.create3D(columns / 2, rows / 2, 4);
Kernel poolKernel = new Kernel() {
    final int ht = params[0];
    final int wt = params[1];

    public void run() {
        float val = 0.00f;
        final int c = getGlobalId(0);
        final int r = getGlobalId(1);
        final int l = getGlobalId(2);
        // 2x2 max pooling with an explicit comparison instead of Kernel.max(float, float).
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 2; j++) {
                float tmp = NeuralNetwork.ReLU(conv[l * ht * wt + (2 * r + i) * wt + 2 * c + j]);
                if (val < tmp) val = tmp;
            }
        pool[(l * ht * wt / 4) + (r * wt / 2) + c] = Math.round(100.00f * val) / 100.00f;
    }
};
poolKernel.setExplicit(true);
poolKernel.put(params);
poolKernel.put(conv);
poolKernel.put(pool);
poolKernel.execute(poolRange);
poolKernel.get(pool);

for (int poolL = 0; poolL < 4; poolL++)
    for (int poolR = 0; poolR < rows / 2; poolR++)
        System.arraycopy(pool, (poolL * rows * columns / 4) + (poolR * columns / 2), poolLayer[poolL][poolR], 0, columns / 2);
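The NeuralNetwork.ReLU helper referenced inside the pooling kernel isn't shown above; it's just the standard clamp at zero. A minimal version looks roughly like this (a plain static method on primitive floats, which Aparapi is able to translate):

// Standard ReLU: pass positive values through, clamp negatives to zero.
public static float ReLU(float x) {
    return x > 0.0f ? x : 0.0f;
}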
Also, I came to the conclusion that I don't need LeakyReLU; regular ReLU is perfectly fine. That being said, I think the topic is more or less closed. I hope someone can learn from my rough path :D
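P.S.: One habit that can save some head-scratching with this kind of code: recompute a single output element on the plain Java side and compare it against what comes back from the kernel. A rough sketch using the same buffers and layout as above (the sampled position is arbitrary):

// Host-side sanity check (sketch): redo the conv kernel's math for one output
// element and compare it with the value read back from the GPU buffer.
int sl = 0, sr = rows / 2, sc = columns / 2; // arbitrary sample (layer, row, column)
int sum = 0;
for (int i = Math.max(0, padding - sr); i < Math.min(filterSize, rows + padding - sr); i++)
    for (int j = Math.max(0, padding - sc); j < Math.min(filterSize, columns + padding - sc); j++)
        sum += input[(sr + i - padding) * columns + sc + j - padding]
             * filters[sl * filterSize * filterSize + i * filterSize + j];
float expected = Math.round(100.00f * sum / filterSize) / 100.00f;
if (Math.abs(expected - conv[sl * rows * columns + sr * columns + sc]) > 1e-3f)
    System.err.println("conv mismatch at (" + sl + ", " + sr + ", " + sc + ")");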