Why does this MEXed C/magma code seg-fault while the stand alone C code works?

The following MEX C code simply calls MAGMA to invert a matrix. The standalone C code (also posted below) works, but the MEX code crashes.

I've triple checked the documentation, verified that other magma functions work as expected, and posted on the Magma forum and was told my code is fine (this post is a cross post from Magma forum). This means that the problem is with mex. I would like to know what is causing the mex code to seg-fault and how to get it to run as expected.

Mexed code:

#include <limits.h>
#include <math.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <mex.h>
#include <magma_v2.h>
#include <cuda_runtime.h>

/*
 * Pack MATLAB split-complex storage (separate real and imaginary arrays)
 * into MAGMA's interleaved magmaDoubleComplex layout.
 *
 * p           - destination; must hold at least numElements entries
 * pr          - source real parts (numElements doubles)
 * pi          - source imaginary parts (numElements doubles), or NULL.
 *               mxGetPi returns NULL for a purely real mxArray; in that case
 *               every imaginary part is written as 0 instead of
 *               dereferencing the NULL pointer.
 * numElements - number of elements to convert; nothing is written when <= 0
 */
void mat2magma(magmaDoubleComplex* p, double* pr, double* pi,int numElements)
{
    int j;
    for(j=0;j<numElements;j++){
        p[j].x=pr[j];
        /* real-only input carries no imaginary array */
        p[j].y=pi ? pi[j] : 0.0;
    }
}

/*
 * Unpack MAGMA interleaved complex values into MATLAB-style split storage.
 *
 * p           - source array of numElements magmaDoubleComplex values
 * pr          - destination for the real parts (numElements doubles)
 * pi          - destination for the imaginary parts (numElements doubles)
 * numElements - number of elements to convert; nothing is written when <= 0
 */
void magma2mat(magmaDoubleComplex* p, double* pr, double* pi,int numElements)
{
    int remaining;

    /* Walk all three arrays in lock-step, one element per iteration. */
    for(remaining=numElements; remaining>0; --remaining){
        *pr++ = p->x;
        *pi++ = p->y;
        ++p;
    }
}

/*gateway function*/
/*
 * MATLAB usage:  B = magmaZinv(A)
 *
 * Inverts the square matrix A on the GPU with MAGMA: LU factorisation with
 * magma_zgetrf_gpu followed by magma_zgetri_gpu. The result is always
 * returned as a complex double matrix of the same size.
 *
 * nlhs, plhs - number of requested outputs and the output array; at most
 *              one output is produced (plhs[0])
 * nrhs, prhs - number of inputs and the input array; exactly one full,
 *              two-dimensional, square double matrix (real or complex)
 *
 * All failures are reported with mexErrMsgIdAndTxt. Because that call does
 * not return, argument validation happens before any resource is acquired
 * and later failures release everything before the error is raised.
 *
 * NOTE(review): magma_int_t must have the same width here as in the linked
 * MAGMA library (e.g. an ILP64 build) - confirm the compile flags match.
 */
void mexFunction( int nlhs, mxArray *plhs[],
        int nrhs, const mxArray *prhs[]) {

    const char *errMsg = NULL;
    magma_queue_t queue = NULL;
    magma_device_t dev;
    magma_int_t m,ldwork,info = 0;
    magma_int_t *piv = NULL;
    magmaDoubleComplex *a = NULL,*da = NULL,*dwork = NULL;
    double *pr,*pi;
    size_t rows,numel,j;

    /* ---- validate arguments (nothing to clean up yet) ---- */
    if (nrhs != 1)
        mexErrMsgIdAndTxt("magmaZinv:nrhs","Exactly one input matrix is required.");
    if (nlhs > 1)
        mexErrMsgIdAndTxt("magmaZinv:nlhs","At most one output is returned.");
    if (!mxIsDouble(prhs[0]) || mxIsSparse(prhs[0])
            || mxGetNumberOfDimensions(prhs[0]) != 2
            || mxGetM(prhs[0]) != mxGetN(prhs[0]))
        mexErrMsgIdAndTxt("magmaZinv:input","Input must be a full, square, double matrix.");

    rows = mxGetM(prhs[0]);
    if (rows == 0) {
        /* nothing to invert; skip the GPU entirely */
        plhs[0] = mxCreateDoubleMatrix(0,0,mxCOMPLEX);
        return;
    }
    /* the conversion helpers take the element count as int */
    if (rows > (size_t)INT_MAX / rows)
        mexErrMsgIdAndTxt("magmaZinv:input","Matrix is too large.");
    numel = rows*rows;
    m = (magma_int_t) rows;

    /* Create the output first: if this fails MATLAB aborts the MEX call,
       and at this point no MAGMA/GPU resource would be leaked. */
    plhs[0] = mxCreateDoubleMatrix(rows,rows,mxCOMPLEX);

    /*initialize magma*/
    if (magma_init() != MAGMA_SUCCESS)
        mexErrMsgIdAndTxt("magmaZinv:init","magma_init failed.");
    magma_getdevice(&dev);
    magma_queue_create(dev,&queue );

    ldwork = m*magma_get_zgetri_nb(m);

    /* ---- allocate host and device buffers ---- */
    piv=(magma_int_t*) malloc(rows*sizeof(magma_int_t));
    if (piv == NULL) {
        errMsg = "Out of host memory (pivot array).";
        goto cleanup;
    }
    if (magma_zmalloc_cpu(&a,numel) != MAGMA_SUCCESS) {
        a = NULL;
        errMsg = "Out of host memory (matrix copy).";
        goto cleanup;
    }
    if (magma_zmalloc(&dwork,ldwork) != MAGMA_SUCCESS) {
        dwork = NULL;
        errMsg = "Out of GPU memory (workspace).";
        goto cleanup;
    }
    if (magma_zmalloc(&da,numel) != MAGMA_SUCCESS) {
        da = NULL;
        errMsg = "Out of GPU memory (matrix).";
        goto cleanup;
    }

    /* Matlab -> Host */
    pr = mxGetPr(prhs[0]);
    pi = mxGetPi(prhs[0]);
    if (pi != NULL) {
        mat2magma(a,pr,pi,(int)numel);
    } else {
        /* real input: mxGetPi returned NULL, so the imaginary part is zero */
        for (j=0;j<numel;j++){
            a[j].x=pr[j];
            a[j].y=0.0;
        }
    }

    /* Host -> GPU */
    magma_zsetmatrix(m,m,a,m,da,m,queue);

    /*LU and Inverse */
    magma_zgetrf_gpu(m,m,da,m,piv,&info);
    if (info != 0) {
        errMsg = (info > 0)
            ? "Matrix is singular: LU factorisation produced a zero pivot."
            : "magma_zgetrf_gpu failed (invalid argument or allocation failure).";
        goto cleanup;
    }
    magma_zgetri_gpu(m,da,m,piv,dwork,ldwork,&info);
    if (info != 0) {
        errMsg = (info > 0)
            ? "Matrix is singular and cannot be inverted."
            : "magma_zgetri_gpu failed (invalid argument or allocation failure).";
        goto cleanup;
    }

    /*GPU -> Host */
    magma_zgetmatrix(m,m,da,m,a,m,queue);

    /*Host -> Matlab*/
    magma2mat(a,mxGetPr(plhs[0]),mxGetPi(plhs[0]),(int)numel);

cleanup:
    if (dwork != NULL) magma_free(dwork);
    if (da != NULL) magma_free(da);
    /* memory from magma_zmalloc_cpu must go back through magma_free_cpu */
    if (a != NULL) magma_free_cpu(a);
    free(piv);
    if (queue != NULL) magma_queue_destroy(queue);
    magma_finalize();

    /* raise the error only after everything has been released */
    if (errMsg != NULL)
        mexErrMsgIdAndTxt("magmaZinv:failed","%s",errMsg);
}

I compiled it with mex CC=gcc LDFLAGS="-lmagma -lcudart -lcublas" magmaZinv.c and then, from MATLAB, ran:

a=magic(3)+magic(3)*1i;
magmaZinv(a)

Standalone C code:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stddef.h>
#include <magma_v2.h>
#include <cuda_runtime.h>
#include <sys/time.h>
#include <time.h>

/*gateway function*/
int main() {

    /*initialize magma*/
    magma_init();
    magma_queue_t queue = NULL;
    magma_device_t dev;
    magma_getdevice(&dev);
    magma_queue_create(dev,&queue );

    int m,ldwork,info;
    int *piv;
    magmaDoubleComplex *a,*da,*dwork;

    /* allocate and initialize a = magic(3)+magic(3)*1i; */
    m=3;
    piv=(int*) malloc(m*sizeof(int));
    ldwork = m*magma_get_zgetri_nb(m);
    magma_zmalloc_cpu(&a,m*m);
    a[0].x=8;a[0].y=8;
    a[1].x=3;a[1].y=3;
    a[2].x=4;a[2].y=4;
    a[3].x=1;a[3].y=1;
    a[4].x=5;a[4].y=5;
    a[5].x=9;a[5].y=9;
    a[6].x=6;a[6].y=6;
    a[7].x=7;a[7].y=7;
    a[8].x=2;a[8].y=2;

    /* Host -> GPU */
    magma_zmalloc(&dwork,ldwork);
    magma_zmalloc(&da,m*m);
    magma_zsetmatrix(m,m,a,m,da,m,queue);

    /*LU and Inverse */
    magma_zgetrf_gpu(m,m,da,m,piv,&info);
    magma_zgetri_gpu(m,da,m,piv,dwork,ldwork,&info);

    /*GPU -> Host */
    magma_zgetmatrix(m,m,da,m,a,m,queue);

    /* display inv(a) */
    for (int i=0;i<(m*m);i++){
        printf("%f +%fi\n",a[i].x,a[i].y);
    }

    /* free memory */
    free(a);
    free(piv);
    magma_free(dwork);
    magma_free(da);
    magma_queue_destroy(queue);
    magma_finalize();

    return 0;
}

I compiled with: gcc -lmagma -lcudart Ccode.c -o Ccode.o

Solution

My sys admin has figured out why the standalone C code works while the MEX C code does not. I'll post the reason here in case it is helpful to anyone facing the same issue when using MAGMA from within MATLAB.

1. The version of MATLAB I was using was R2014a, for which the supported compiler is GCC 4.7.x. I was using a newer version of GCC to compile the code. I've never had a problem using different versions of GCC with MATLAB, despite the warning it gives, but for the above code it does matter.
2. Compile with the MKL_ilp64 flag when using MAGMA with MATLAB to ensure that magma_int_t is a 64-bit integer (int64).

With these two suggestions, Magma can be mexed into matlab with no problems.