How do I parallelize this triple loop in an efficient way?

I'm trying to parallelize a function which takes as input three arrays (x, y, and prb) and one scalar, and outputs three arrays (P1, Pt1, and Px).

The original c code is here (the outlier and E are inconsequential):

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#define max(A, B)   ((A) > (B) ? (A) : (B))
#define min(A, B)   ((A) < (B) ? (A) : (B))

void cpd_comp(
        double* x,
        double* y, 
        double* prb,
        double* sigma2,
        double* outlier,
        double* P1,
        double* Pt1,
        double* Px,
        double* E,
        int N,
        int M,
        int D
        )

{
  int       n, m, d;
  double    ksig, diff, razn, outlier_tmp, sp;
  double    *P, *temp_x;

  P = (double*) calloc(M, sizeof(double));
  temp_x = (double*) calloc(D, sizeof(double));

  ksig = -2.0 * *sigma2;


  for (n=0; n < N; n++) {

      sp=0;
      for (m=0; m < M; m++) {
          razn=0;
          for (d=0; d < D; d++) {
             diff=*(x+n+d*N)-*(y+m+d*M);  diff=diff*diff;
             razn+=diff;
          }

          *(P+m)=exp(razn/ksig) ;
          sp+=*(P+m);
      }


      *(Pt1+n)=*(prb+n);
      for (d=0; d < D; d++) {
       *(temp_x+d)=*(x+n+d*N)/ sp;
      }

      for (m=0; m < M; m++) {
          *(P1+m)+=((*(P+m)/ sp) **(prb+n));

          for (d=0; d < D; d++) {
          *(Px+m+d*M)+= (*(temp_x+d)**(P+m)**(prb+n));
          }

      }

   *E +=  -log(sp);     
  }
  *E +=D*N*log(*sigma2)/2;


  free((void*)P);
  free((void*)temp_x);

  return;
}

Here is my attempt at parallelizing it:

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>

/*headers*/
void cpd_comp(
    float * x,        //Points to register      [N*D]
    float * y,        //Points to be registered [M*D]
    float * prb,      //Vector of probabilities [N]
    float * sigma2,   //Square of sigma
    float ** P1,       //P1,  output, [M]
    float ** Pt1,      //Pt1, output, [N]
    float ** Px,       //Px,  output, [M*3]
    int N,            //Number of points, i.e. rows, in x
    int M             //Number of points, i.e. rows, in 
    );

__global__ void d_computeP(
    float * P,
    float * P1,
    float * Px,
    float * ProbabilityMatrix,
    float * x,
    float * y,
    float * prb,
    float ksig,
    const int N,
    const int M);

__global__ void d_sumP(
    float * sp,
    float * P1timessp,
    float * Pxtimessp,
    float * P1,
    float * Px,
    const int N,
    const int M);

/*implementations*/

void cpd_comp(
    float * x,        //Points to register      [N*D]
    float * y,        //Points to be registered [M*D]
    float * prb,      //Vector of probabilities [N]
    float * sigma2,   //Scalar
    float ** P1,       //P1,  output, [M]
    float ** Pt1,      //Pt1, output, [N]
    float ** Px,       //Px,  output, [M*3]
    int N,            //Number of points, i.e. rows, in x
    int M             //Number of points, i.e. rows, in y
    ){
    //X is generatedPointPos
    //Y is points

    float
        *P,
        *P1timessp,
        *Pxtimessp,
        ksig = -2.0 * (*sigma2),
        *h_sumofP = new float[N], //sum of P, on host
        *d_sumofP;                //sum of P, on device

    cudaMalloc((void**)&P,        sizeof(float)*M*N);
    cudaMalloc((void**)&P1timessp,sizeof(float)*M*N);
    cudaMalloc((void**)&Pxtimessp,sizeof(float)*M*N*3);
    cudaMalloc((void**)&d_sumofP, sizeof(float)*N);

    cudaMalloc((void**)P1,        sizeof(float)*M);
    cudaMalloc((void**)Px,        sizeof(float)*M*3);
    cudaMalloc((void**)Pt1,       sizeof(float)*N);

    d_computeP<<<dim3(N,M/1024+1),M>1024?1024:M>>>(P,P1timessp,Pxtimessp,NULL,x,y,prb,ksig,N,M);

    for(int n=0; n<N; n++){
        thrust::device_ptr<float>dev_ptr(P);
        h_sumofP[n] = thrust::reduce(dev_ptr+M*n,dev_ptr+M*(n+1),0.0f,thrust::plus<float>());
    }

    cudaMemcpy(d_sumofP,h_sumofP,sizeof(float)*N,cudaMemcpyHostToDevice);

    d_sumP<<<M/1024+1,M>1024?1024:M>>>(d_sumofP,P1timessp,Pxtimessp,*P1,*Px,N,M);

    cudaMemcpy(*Pt1,prb,sizeof(float)*N,cudaMemcpyDeviceToDevice);

    cudaFree(P);
    cudaFree(P1timessp);
    cudaFree(Pxtimessp);
    cudaFree(d_sumofP);
    delete[]h_sumofP;
}

/*kernels*/

__global__ void d_computeP(
    float * P,
    float * P1,
    float * Px,
    float * ProbabilityMatrix,
    float * x,
    float * y,
    float * prb,
    float ksig,
    const int N,
    const int M){
    //thread configuration: <<<dim3(N,M/1024+1),1024>>>
    int m = threadIdx.x+blockIdx.y*blockDim.x;
    int n = blockIdx.x;
    if(m>=M || n>=N) return;

    float 
        x1 = x[3*n],
        x2 = x[3*n+1],
        x3 = x[3*n+2],
        diff1 = x1 - y[3*m],
        diff2 = x2 - y[3*m+1],
        diff3 = x3 - y[3*m+2],
        razn = diff1*diff1+diff2*diff2+diff3*diff3,

        Pm = __expf(razn/ksig), //fast exponentiation
        prbn = prb[n];

    P[M*n+m] = Pm; 

    __syncthreads();

    P1[N*m+n] = Pm*prbn;
    Px[3*(N*m+n)+0] = x1*Pm*prbn;
    Px[3*(N*m+n)+1] = x2*Pm*prbn;
    Px[3*(N*m+n)+2] = x3*Pm*prbn;
}

__global__ void d_sumP(
    float * sp,
    float * P1timessp,
    float * Pxtimessp,
    float * P1,
    float * Px,
    const int N,
    const int M){
    //computes P1 and Px
    //thread configuration: <<<M/1024+1,1024>>>
    int m = threadIdx.x+blockIdx.x*blockDim.x;
    if(m>=M) return;
    float 
        P1m = 0,
        Pxm1 = 0,
        Pxm2 = 0,
        Pxm3 = 0;
    for(int n=0; n<N; n++){
        float spn = 1/sp[n];
        P1m += P1timessp[N*m+n]*spn;
        Pxm1 += Pxtimessp[3*(N*m+n)+0]*spn;
        Pxm2 += Pxtimessp[3*(N*m+n)+1]*spn;
        Pxm3 += Pxtimessp[3*(N*m+n)+2]*spn;
    }

    P1[m] = P1m;
    Px[3*m+0] = Pxm1;
    Px[3*m+1] = Pxm2;
    Px[3*m+2] = Pxm3;

}

However, to my horror, it runs much, much slower than the original version. How do I make it run faster? Please explain things thoroughly since I am very new to CUDA and parallel programming and have no experience in algorithms.

Do note that the c version has column-major ordering and the CUDA version has row-major. I have done several tests to make sure that the result is correct. It's just extremely slow and takes up a LOT of memory.

Any help is greatly appreciated!

EDIT: More information: N and M are on the order of a few thousand (say, 300-3000) and D is always 3. The CUDA version expects arrays to be device memory, except for variables prefixed with h_.

Solution

Before trying any CUDA-specific optimizations, profile your code to see where time is being spent.

Try and arrange your array reads/writes so that each CUDA thread uses a strided access pattern. For example, currently you have

int m = threadIdx.x+blockIdx.y*blockDim.x;
int n = blockIdx.x;
if(m>=M || n>=N) return;

diff1 = x1 - y[3*m],
diff2 = x2 - y[3*m+1],
diff3 = x3 - y[3*m+2],

So thread 1 will read from y[0],y[1],y[2] etc. Instead, rearrange your data so that thread 1 reads from y[0],y[M],y[2*M] and thread 2 reads from y[1],y[M+1],y[2*M+1] etc. You should follow this access pattern for other arrays.

Also, you may want to consider whether you can avoid the use of __syncthreads(). I don't quite follow why it's necessary in this algorithm, it might be worth removing it to see if it improves performance ( even if it produces incorrect results ).