
Performance issue while using C/OpenMP


I wrote a small program in C with OpenMP to measure execution time, and I ran into an issue with the timing results. Here is the piece of code responsible for adding 2 vectors:

  float *x_f = (float *)malloc(sizeof(float) * DATA_SIZE);
  float *y_f = (float *)malloc(sizeof(float) * DATA_SIZE);

  for (int i = 0; i < DATA_SIZE; i++) {

    x_f[i] = 1.0f;
    y_f[i] = 2.0f;
  }

  start_time = omp_get_wtime();

#pragma omp parallel num_threads(N_THREADS)
  {
      const int thread_id = omp_get_thread_num();

      int begin = range * thread_id;
      int end = begin + range;
      if (thread_id + 1 == N_THREADS)
          end = DATA_SIZE;

      for (int i = begin; i < end; i++) {
          x_f[i] += y_f[i];
      }
  }
  end_time = omp_get_wtime();
  duration = end_time - start_time;
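
(Side note: as far as I understand, the same split could also be written with an OpenMP worksharing loop instead of computing begin/end by hand; the sketch below only shows what I mean, it is not the code I measured.)

#pragma omp parallel for num_threads(N_THREADS)
  for (int i = 0; i < DATA_SIZE; i++) {
      x_f[i] += y_f[i];
  }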

I use the same code to add vectors of double elements, in the same application. The program seems to work fine, but the results look strange to me, because the float computation time is several times greater than the double one.

Threads          1       2       3       4       5       6       7       8       9       10      11      12
float  (1024)  0.0036  0.4535  0.6875  0.9443  1.1653  1.5068  1.6951  2.0447  2.3546  2.6611  3.1319  3.1468
double (1024)  0.0004  0.0014  0.0016  0.0019  0.0018  0.0018  0.0020  0.0021  0.0024  0.0028  0.0045  0.0036

1-12 is the number of OpenMP threads and 1024 is the vector size in elements; times are in ms. Can someone explain why this happens, or what I am doing wrong? I am new to C and OpenMP and have no idea why the results come out like this.

EDIT: Full source code below:

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>


int main(int argc, char const *argv[]) {
  double start_time, end_time, duration;

  if (argv[1] == NULL || argv[2] == NULL) {
    printf("Error parsing data from input. Program will now close");
    return 2;
  }
  int DATA_SIZE = atoi(argv[1]);
  int N_THREADS = atoi(argv[2]);

  int range = DATA_SIZE / N_THREADS;

  // ===================== FLOAT ========================
  float *x_f = (float *)malloc(sizeof(float) * DATA_SIZE);
  float *y_f = (float *)malloc(sizeof(float) * DATA_SIZE);

  for (int i = 0; i < DATA_SIZE; i++) {

    x_f[i] = 1.0f;
    y_f[i] = 2.0f;
  }

  start_time = omp_get_wtime();
#pragma omp parallel num_threads(N_THREADS)
  {
    const int thread_id = omp_get_thread_num();

    int begin = range * thread_id;
    int end = begin + range;
    if (thread_id + 1 == N_THREADS)
      end = DATA_SIZE;

    for (int i = begin; i < end; i++) {
      x_f[i] += y_f[i];
      /*printf("x_f[%d]=%f\ty_f[%d]=%f\tThreadNum=%d\n", i, x_f[i], i,
      y_f[i],
       omp_get_thread_num());*/
    }
  }

  end_time = omp_get_wtime();
  duration = end_time - start_time;

  // Error checking
  for (int i = 0; i < DATA_SIZE; i++) {
    if (!(3.0 - x_f[i]) == 0) {
      printf("ERROR: %f\n", x_f[i]);
      break;
    }
  }
  free(x_f);
  free(y_f);

  printf("==========[FLOAT]==========\n");
  printf("Number of threads: %d\n", N_THREADS);
  printf("Data size: %d bytes\n", DATA_SIZE * sizeof(float));
  printf("ExecTime: %lf ms\n", duration * 1000);

  // ===================== DOUBLE ========================
  double *x_lf = (double *)malloc(sizeof(double) * DATA_SIZE);
  double *y_lf = (double *)malloc(sizeof(double) * DATA_SIZE);

  for (int i = 0; i < DATA_SIZE; i++) {

    x_lf[i] = 1.0;
    y_lf[i] = 2.0;
  }

  start_time = omp_get_wtime();

#pragma omp parallel num_threads(N_THREADS)
  {
    const int thread_id = omp_get_thread_num();

    int begin = range * thread_id;
    int end = begin + range;
    if (thread_id + 1 == N_THREADS)
      end = DATA_SIZE;

    for (int i = begin; i < end; i++) {
      x_lf[i] += y_lf[i];
      /*printf("x_f[%d]=%f\ty_f[%d]=%f\tThreadNum=%d\n", i, x_lf[i], i,
      y_lf[i], omp_get_thread_num());*/
    }
  }

  end_time = omp_get_wtime();
  duration = end_time - start_time;

  // Error checking
  for (int i = 0; i < DATA_SIZE; i++) {
    if (!(3.0 - x_lf[i]) == 0) {
      printf("ERROR: %f\n", x_lf[i]);
      break;
    }
  }
  free(x_lf);
  free(y_lf);

  printf("\n==========[DOUBLE]==========\n");
  printf("Number of threads: %d\n", N_THREADS);
  printf("Data size: %d bytes\n", DATA_SIZE * sizeof(double));
  printf("ExecTime: %lf ms\n", duration * 1000);
  
  return 0;
}

EDIT2: Full table of results: Results


Solution

  • Here is the code, updated Sep 2022. Adding or removing -O3 switches gcc optimizations on and off. Yes, Godbolt will refuse to run from time to time, since this is a large data set for Godbolt. For better benchmarking, run it locally.

    Here is the code, refactored and hopefully more meaningful, using sheredom/ubench.h. Note the compiler arguments: -fopenmp -s -lm -O3. Play with the -O level and spot the difference.

    (As a side note, please read: 32 OpenMP Traps for C++ Developers)

    Here is the code:

    
    #include "https://raw.githubusercontent.com/sheredom/ubench.h/master/ubench.h"
    
    #include <assert.h>
    #include <omp.h>
    #include <stdio.h>
    #include <stdlib.h>
    
    #define DATA_SIZE 100000
    #define N_THREADS 2
    #define range DATA_SIZE / N_THREADS
    
    static struct {
        float x_f[DATA_SIZE] ;
        float y_f[DATA_SIZE] ;
        double x_d[DATA_SIZE] ;
        double y_d[DATA_SIZE] ;
    } * app_data = 0 ;
    
    static void app_start (void) 
    {
        app_data = calloc(1, sizeof(*app_data) );
        assert(app_data) ;
    
      for (int i = 0; i < DATA_SIZE; ++i) {
        app_data->x_f[i] = 1.0f;
        app_data->y_f[i] = 2.0f;
        app_data->x_d[i] = 1.0;
        app_data->y_d[i] = 2.0;
      }
    }
    
    UBENCH( omp_measuring, adding_two_arrays_of_floats )
    {
        #pragma omp parallel num_threads(N_THREADS)
      {
          const int thread_id = omp_get_thread_num();
    
          const int begin = range * thread_id;
          int end = begin + range;
          if (thread_id + 1 == N_THREADS)
              end = DATA_SIZE;
    
          for (int i = begin; i < end; i++) {
              app_data->x_f[i] += app_data->y_f[i];
          }
      }
    }
    
    UBENCH( omp_measuring, adding_two_arrays_of_doubles )
    {
        #pragma omp parallel num_threads(N_THREADS)
      {
          const int thread_id = omp_get_thread_num();
    
          const int begin = range * thread_id;
          int end = begin + range;
          if (thread_id + 1 == N_THREADS)
              end = DATA_SIZE;
    
          for (int i = begin; i < end; i++) {
              app_data->x_d[i] += app_data->y_d[i];
          }
      }
    }
    
    static void app_end (void) { free(app_data); }
    
    
    UBENCH_STATE();
    
    int main(int argc, const char *const argv[])
    {
        app_start();
    
        ubench_main(argc, argv);
    
        app_end();
    
    }
    

    Caveat Emptor

    The whole point of OMP here is (very) moot, since OMP apparently pays off only when optimizations are not used. It is very hard to beat gcc -O3.
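
    To illustrate what that comparison is up against, here is a minimal sketch of the plain serial loop (reusing DATA_SIZE and app_data from the code above; the function name is only illustrative). At -O3 gcc will typically auto-vectorize it, so on arrays this small the OMP thread management costs more than the arithmetic it saves:

    // Plain serial version of the float update. gcc -O3 typically
    // auto-vectorizes this loop; for small arrays the OMP threading
    // overhead outweighs any gain from splitting the work.
    static void add_floats_serial(void)
    {
        for (int i = 0; i < DATA_SIZE; i++) {
            app_data->x_f[i] += app_data->y_f[i];
        }
    }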

    The problem with Godbolt is that it chokes on large data sets. On very large arrays, using OMP should start to look meaningful.

    In reality, it is rather rare to have to deal with megabyte or gigabyte data sets, and for those one would likely reach for a GPU rather than OMP.

    Bonus

    app_start is not prescribed by UBENCH. One can pass the application's command-line arguments to a different app_start:

    static void app_start (const int argc, char ** argv)
    {
       // use the application's command-line arguments here
    }

    int main (int argc, char ** argv)
    {
          app_start(argc, argv);
          ubench_main(argc, argv);
          app_end();
    }