Apply gaussian blur filter on 24/32 bits BMP images in C

I use the following structures to load in memory a bmp image:

// header
typedef struct {
    uint16_t type;  // Magic identifier
    uint32_t size;  // File size in bytes
    uint16_t reserved1;  // Not used
    uint16_t reserved2; // Not used
    uint32_t offset; 
    uint32_t header_size; // Header size in bytes
    uint32_t width;  // Width of the image
    uint32_t height; // Height of image
    uint16_t planes; // Number of color planes
    uint16_t bits; // Bits per pixel
    uint32_t compression;// Compression type
    uint32_t imagesize; // image size in bytes
    uint32_t xresol; // pixels per meter
    uint32_t yresol; // pixels per meter
    uint32_t ncolours;  // nr of colours
    uint32_t importantcolours; // important colours
} BMP_Header;

// image details
typedef struct {
    BMP_Header header;
    uint64_t data_size;
    uint64_t width;
    uint64_t height;
    uint8_t *data;// allocate memory based on width and height 
} BMP_Image;

Then I want to apply a gaussian filter over it. The functions associated is the following:

double gaussianModel(double x, double y, double variance) {
    return 1 / (2 * 3.14159 * pow(variance, 2)) * exp(-(x * x + y * y) / (2 * variance * variance));
}

double *generate_weights(int radius, double variance, int bits_nr) {
    double *weights = (double*)malloc(sizeof(double) * radius * radius * bits_nr);
    double sum = 0;
    for (int i = 0; i < radius; i++) {
        for (int j = 0; j < radius * bits_nr; j++) {
            weights[i * radius * bits_nr + j] = gaussianModel(i - radius / 2, j - radius / 2, variance);
            sum += weights[i * radius + j];
        }
    }

    // normalize
    for (int i = 0; i < radius * radius; i++)
        weights[i] /= sum;

    return weights;
}

double getWeightedColorValue(double *w, int len, unsigned int index, unsigned int bits_nr) {
    double sum = 0;
    for (int i = index; i < len; i+= bits_nr)
        sum += w[i];

    return sum;
}

BMP_Image* BMP_blur_collapsed(BMP_Image *img, unsigned int bits_nr, unsigned int radius, unsigned int th_number, FILE* f) {
    BMP_Image *bluredImg = malloc(sizeof(BMP_Image));
    bluredImg->header = img->header; 
    bluredImg->data_size = (img->header.size - sizeof(BMP_Header)) * bits_nr;
    bluredImg->width = img->header.width;
    bluredImg->height = img->header.height;
    bluredImg->data = malloc(sizeof(uint8_t) * bluredImg->data_size);

    for(uint64_t i = 0; i < bluredImg->data_size; ++i)
        bluredImg->data[i] = img->data[i];

    double variance = 1.94;
    double* weights = generate_weights(radius, variance, bits_nr);
    uint64_t i, j;
    double start, end;
    start = omp_get_wtime();

    #pragma omp parallel for private(i, j) collapse(2) schedule(static) num_threads(th_number)
    for (i = 0; i < img->height - radius; i++) {
        for (j = 0; j < img->width * bits_nr - radius - bits_nr * 2; j+= bits_nr ) {
            uint64_t ofs = i * img->width * bits_nr + j;

            double *distributedColorRed = (double*)malloc(sizeof(double) * radius * radius * bits_nr);
            double *distributedColorGreen = (double*)malloc(sizeof(double) * radius * radius * bits_nr);
            double *distributedColorBlue = (double*)malloc(sizeof(double) * radius * radius * bits_nr);
            
            for (int wx = 0; wx < radius; wx++) {
                for (int wy = 0; wy < radius * bits_nr; wy += bits_nr) {
                    uint64_t wofs = wx * img->width * bits_nr + wy;
                    // double currWeight = weights[wx * radius + wy];

                    distributedColorRed[wofs] = weights[wx * radius + wy] * img->data[ofs];
                    distributedColorGreen[wofs + 1] = weights[wx * radius + wy + 1] * img->data[ofs + 1];
                    distributedColorBlue[wofs + 2] = weights[wx * radius + wy + 2] * img->data[ofs + 2];
                }
            }

            bluredImg->data[ofs] = getWeightedColorValue(distributedColorRed, radius * radius * bits_nr, 0, bits_nr);
            bluredImg->data[ofs + 1] = getWeightedColorValue(distributedColorGreen, radius * radius * bits_nr, 1, bits_nr);
            bluredImg->data[ofs + 2] = getWeightedColorValue(distributedColorBlue, radius * radius * bits_nr, 2, bits_nr);

            free(distributedColorRed);
            free(distributedColorBlue);
            free(distributedColorGreen);
        }
    }

    end = omp_get_wtime();
    fprintf(f, "blur collapsed %f \n", end - start);

    free(weights);
    return bluredImg;
}

Assuming that img is a BMP_Image object, then img->data has stored all the pixels inside it:

img->data[i] is Red
img->data[i + 1] is Green
img->data[i + 2] is Blue

But the output is not the expected one after I use this function.

Solution

Your image convolution with filtering coefficients looks weird. One is because you use the variable bits_nr in the calculation which makes the indexing complicated.
It will be more comprehensible to handle the color components r, g, and b separately.
The variable name variance is not appropriate because variance = sigma ** 2.
You don't need to apply the multiplication with 1/sqrt(2*PI)/sigma in gaussianModel() because it is cancelled in the normalization.

Here is an example of fully compilable code:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

#define SIGMA 1.94
#define RADIUS 5
#define INFILE "0Jayx.bmp"
#define OUTFILE "blurred.bmp"

// header
typedef struct  __attribute__((__packed__)) {
    uint16_t type;  // Magic identifier
    uint32_t size;  // File size in bytes
    uint16_t reserved1;  // Not used
    uint16_t reserved2; // Not used
    uint32_t offset;
    uint32_t header_size; // Header size in bytes
    uint32_t width;  // Width of the image
    uint32_t height; // Height of image
    uint16_t planes; // Number of color planes
    uint16_t bits; // Bits per pixel
    uint32_t compression;// Compression type
    uint32_t imagesize; // image size in bytes
    uint32_t xresol; // pixels per meter
    uint32_t yresol; // pixels per meter
    uint32_t ncolours;  // nr of colours
    uint32_t importantcolours; // important colours
} BMP_Header;

// image details
typedef struct {
    BMP_Header header;
    uint64_t width;
    uint64_t height;
    uint8_t *b;
    uint8_t *g;
    uint8_t *r;
} BMP_Image;

double gaussianModel(double x, double y, double sigma) {
    return 1. / exp(-(x * x + y * y) / (2 * sigma * sigma));
}

double *generate_coeff(int radius, double sigma) {
    double *coeff = malloc(sizeof(double) * radius * radius);
    double sum = 0;
    for (int i = 0; i < radius; i++) {
        for (int j = 0; j < radius; j++) {
            coeff[i * radius + j] = gaussianModel(i - radius / 2, j - radius / 2, sigma);
            sum += coeff[i * radius + j];
        }
    }

    // normalize
    for (int i = 0; i < radius * radius; i++)
        coeff[i] /= sum;

    return coeff;
}

BMP_Image *BMP_blur_collapsed(BMP_Image *img, int radius, double sigma) {
    BMP_Image *bimg = malloc(sizeof(BMP_Image));
    bimg->header = img->header;
    bimg->width = img->header.width;
    bimg->height = img->header.height;
    bimg->b = malloc(bimg->width * bimg->height);
    bimg->g = malloc(bimg->width * bimg->height);
    bimg->r = malloc(bimg->width * bimg->height);

    int b, g, r;

    double *coeff = generate_coeff(radius, sigma);
    int i, j, m, n;

    for (i = 0; i < img->height - radius; i++) {
        for (j = 0; j < img->width - radius; j++) {
            b = g = r = 0;
            for (m = 0; m < radius; m++) {
                for (n = 0; n < radius; n++) {
                     b += coeff[m * radius + n] * img->b[(i + m) * img->width + (j + n)];
                     g += coeff[m * radius + n] * img->g[(i + m) * img->width + (j + n)];
                     r += coeff[m * radius + n] * img->r[(i + m) * img->width + (j + n)];
                }
            }
            bimg->b[i * bimg->width + j] = b;
            bimg->g[i * bimg->width + j] = g;
            bimg->r[i * bimg->width + j] = r;
        }
    }
    free(coeff);

    return bimg;
}

void free_img(BMP_Image *img)
{
    free(img->b);
    free(img->g);
    free(img->r);
    free(img);
}

BMP_Image *read_bmp_file(char *filename)
{
    FILE *fp;
    int i, j, bytesperline;
    BMP_Image *img;

    img = malloc(sizeof(BMP_Image));

    if (NULL == (fp = fopen(filename, "r"))) {
        perror(filename);
        exit(1);
    }
    fread(&img->header, sizeof(char), sizeof(BMP_Header), fp);
    img->width = img->header.width;
    img->height = img->header.height;

    bytesperline = ((img->width * 3 + 3) / 4) * 4;      // word alignment

    img->b = malloc(img->width * img->height);
    img->g = malloc(img->width * img->height);
    img->r = malloc(img->width * img->height);

    for (i = 0; i < img->height; i++) {
        for (j = 0; j < img->width; j++) {
            img->b[i * img->width + j] = getc(fp);
            img->g[i * img->width + j] = getc(fp);
            img->r[i * img->width + j] = getc(fp);
        }
        for (j = img->width * 3; j < bytesperline; j++) {
            getc(fp);
        }
    }
    return img;
}

void write_bmp_file(BMP_Image *img, char *filename)
{
    FILE *fp;
    int i, j, bytesperline;

    if (NULL == (fp = fopen(filename, "w"))) {
        perror(filename);
        exit(1);
    }
    bytesperline = ((img->width * 3 + 3) / 4) * 4;      // word alignment
    fwrite(&img->header, sizeof(char), sizeof(BMP_Header), fp);

    for (i = 0; i < img->height; i++) {
        for (j = 0; j < img->width; j++) {
            putc(img->b[i * img->width + j], fp);
            putc(img->g[i * img->width + j], fp);
            putc(img->r[i * img->width + j], fp);
        }
        for (j = img->width * 3; j < bytesperline; j++) {
            putc(0, fp);
        }
    }
}

int main()
{
    BMP_Image *img = read_bmp_file(INFILE);
    BMP_Image *bimg = BMP_blur_collapsed(img, RADIUS, SIGMA);
    write_bmp_file(bimg, OUTFILE);

    free_img(img);
    free_img(bimg);

    return 0;
}

The produced image with radius=5, sigma=1.94:

[Edit]
To modify the code to support 32bit bmp file:

remove the bytesperline relevant codes.
append uint8_t *a; to the struct BMP_Image.
find the b, g, r pixel processing codes and append the a processing thereafter (just by changing the variable name).

If you are familiar with the patch command, below is the patch file which can be applied with:

patch -u < gauss.diff

to convert my previous source code to the 32bit version. (You will need to modify the filenames in the 1st and the 2nd line according to the source filename you work with.)

gauss.diff:

-- gauss.c.O   2021-12-09 08:57:36.504685533 +0900
+++ gauss.c     2021-12-09 09:49:57.342895501 +0900
@@ -36,6 +36,7 @@
     uint8_t *b;
     uint8_t *g;
     uint8_t *r;
+    uint8_t *a;
 } BMP_Image;

 double gaussianModel(double x, double y, double sigma) {
@@ -67,25 +68,28 @@
     bimg->b = malloc(bimg->width * bimg->height);
     bimg->g = malloc(bimg->width * bimg->height);
     bimg->r = malloc(bimg->width * bimg->height);
+    bimg->a = malloc(bimg->width * bimg->height);

-    int b, g, r;
+    int b, g, r, a;

     double *coeff = generate_coeff(radius, sigma);
     int i, j, m, n;

     for (i = 0; i < img->height - radius; i++) {
         for (j = 0; j < img->width - radius; j++) {
-            b = g = r = 0;
+            b = g = r = a = 0;
             for (m = 0; m < radius; m++) {
                 for (n = 0; n < radius; n++) {
                      b += coeff[m * radius + n] * img->b[(i + m) * img->width + (j + n)];
                      g += coeff[m * radius + n] * img->g[(i + m) * img->width + (j + n)];
                      r += coeff[m * radius + n] * img->r[(i + m) * img->width + (j + n)];
+                     a += coeff[m * radius + n] * img->r[(i + m) * img->width + (j + n)];
                 }
             }
             bimg->b[i * bimg->width + j] = b;
             bimg->g[i * bimg->width + j] = g;
             bimg->r[i * bimg->width + j] = r;
+            bimg->a[i * bimg->width + j] = a;
         }
     }
     free(coeff);
@@ -98,13 +102,15 @@
     free(img->b);
     free(img->g);
     free(img->r);
+    free(img->a);
     free(img);
 }

 BMP_Image *read_bmp_file(char *filename)
 {
     FILE *fp;
-    int i, j, bytesperline;
+//  int i, j, bytesperline;
+    int i, j;
     BMP_Image *img;

     img = malloc(sizeof(BMP_Image));
@@ -117,21 +123,25 @@
     img->width = img->header.width;
     img->height = img->header.height;

-    bytesperline = ((img->width * 3 + 3) / 4) * 4;      // word alignment
+//  bytesperline = ((img->width * 3 + 3) / 4) * 4;      // word alignment

     img->b = malloc(img->width * img->height);
     img->g = malloc(img->width * img->height);
     img->r = malloc(img->width * img->height);
+    img->a = malloc(img->width * img->height);

     for (i = 0; i < img->height; i++) {
         for (j = 0; j < img->width; j++) {
             img->b[i * img->width + j] = getc(fp);
             img->g[i * img->width + j] = getc(fp);
             img->r[i * img->width + j] = getc(fp);
+            img->a[i * img->width + j] = getc(fp);
         }
+/*
         for (j = img->width * 3; j < bytesperline; j++) {
             getc(fp);
         }
+*/
     }
     return img;
 }
@@ -139,13 +149,14 @@
 void write_bmp_file(BMP_Image *img, char *filename)
 {
     FILE *fp;
-    int i, j, bytesperline;
+//  int i, j, bytesperline;
+    int i, j;

     if (NULL == (fp = fopen(filename, "w"))) {
         perror(filename);
         exit(1);
     }
-    bytesperline = ((img->width * 3 + 3) / 4) * 4;      // word alignment
+//  bytesperline = ((img->width * 3 + 3) / 4) * 4;      // word alignment
     fwrite(&img->header, sizeof(char), sizeof(BMP_Header), fp);

     for (i = 0; i < img->height; i++) {
@@ -153,10 +164,13 @@
             putc(img->b[i * img->width + j], fp);
             putc(img->g[i * img->width + j], fp);
             putc(img->r[i * img->width + j], fp);
+            putc(img->a[i * img->width + j], fp);
         }
+/*
         for (j = img->width * 3; j < bytesperline; j++) {
             putc(0, fp);
         }
+*/
     }
}