Search code examples
python c scipy intel-mkl

Proper way to calculate `trans(a)*inv(b)*a` with Intel MKL (follow-up question)


This is a follow-up to another question with the same title (I made a major edit to that one, but was told it should be a separate question - and I can't think of another title).

I am using Intel's MKL LAPACKE and CBLAS to calculate

yn = trans(a)*inv(zt)*a + trans(b)*inv(zl)*b

Where a and b are m-by-n real matrices, zt and zl are m-by-m complex matrices. The resulting complex matrix yn is n-by-n.

Here is how I am doing it:

zt <- inv(zt)
zl <- inv(zl)
c <- zt*a
yn <- trans(a)*c
c <- zl*b
yn <- trans(b)*c + yn

The C code:

#include <math.h>
#include <complex.h>
#include <stdlib.h>
#include <stdio.h>
#include <mkl_types.h>
#define MKL_Complex16 _Complex double //overwrite type
#include <mkl.h>
#include <mkl_lapacke.h>

/*
 * Write an m-by-n complex matrix to fp as comma-separated text, one matrix
 * row per output line. Every entry is printed as "(re+imj)" with six decimal
 * places and an explicit sign on the imaginary part.
 *
 * Element (i, j) is read from a[i*lda + j], so lda is the distance (in
 * elements) between the starts of consecutive rows.
 *
 * Always returns 0; the results of the individual writes are not inspected.
 */
int print_zmatrix_file(int m, int n, _Complex double* a, int lda, FILE* fp)
{
    for (int row = 0; row < m; ++row) {
        for (int col = 0; col < n; ++col) {
            const _Complex double z = a[row * lda + col];

            /* Separator goes between entries, never after the last one. */
            if (col > 0) {
                fputs(",", fp);
            }
            fprintf(fp, "(%.6f%+.6fj)", creal(z), cimag(z));
        }
        fputs("\n", fp);
    }
    return 0;
}

/*
 * Question's attempt at computing
 *     yn = trans(a)*inv(zt)*a + trans(b)*inv(zl)*b
 * for row-major inputs, where a and b are m-by-n and zt and zl are m-by-m.
 *
 * zt and zl are factorized and inverted in place, so the caller's arrays are
 * overwritten. The intermediate products are dumped to "c1.csv" and "c2.csv"
 * in the current directory. Always returns 0.
 *
 * NOTE(review): a and b are declared double* here but are handed to
 * cblas_zgemm as matrix operands; the answer further down identifies this
 * mismatch as the cause of the wrong results.
 * NOTE(review): alpha and beta are plain double as well; cblas_zgemm
 * presumably expects pointers to complex scalars - confirm against the MKL
 * reference before reusing this code.
 */
int calc_yn(
    _Complex double* yn, double* a, double *b, _Complex double* zl,
    _Complex double* zt, int m, int n)
{
    /* Pivot storage shared by both inversions. */
    lapack_int* ipiv = (MKL_INT*) malloc(sizeof(lapack_int)*m);
    /* zt <- inv(zt): LU factorization followed by inversion, in place. */
    LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zt, m, ipiv);
    LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zt, m, ipiv);
    /* zl <- inv(zl), same two steps. */
    LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zl, m, ipiv);
    LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zl, m, ipiv);
    free(ipiv);
    const double alpha = 1.0;
    const double beta = 0.0;
    /* m-by-n scratch buffer reused for both intermediate products. */
    lapack_complex_double* c = (lapack_complex_double*) malloc(
        sizeof(lapack_complex_double)*(m*n));
    // c <- zt*a
    cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, m,
                &alpha, zt, m, a, n,
                &beta, c, n);
    /* Debug dump of the first intermediate product. */
    FILE* fp = fopen("c1.csv", "w");
    print_zmatrix_file(m, n, c, n, fp);
    fclose(fp);
    // yn <- aT*c
    cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                n, n, m,
                &alpha, a, n, c, n,
                &beta, yn, n);
    // c <- zl*b
    cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, m,
                &alpha, zl, m, b, n,
                &beta, c, n);
    /* Debug dump of the second intermediate product. */
    FILE* fp2 = fopen("c2.csv", "w");
    print_zmatrix_file(m, n, c, n, fp2);
    fclose(fp2);
    // yn <- bT*c + yn
    /* &alpha is passed in the beta slot so the existing yn is accumulated. */
    cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                n, n, m,
                &alpha, b, n, c, n,
                &alpha, yn, n);
    free(c);
    return 0;
}

/*
 * Test driver: builds the 2-by-3 sample inputs, runs calc_yn and writes the
 * resulting 3-by-3 matrix to "yn.csv" in the current directory.
 *
 * Returns EXIT_SUCCESS when the file was written, EXIT_FAILURE if the
 * allocation, the computation or the file output failed.
 */
int main(void)
{
    int m = 2;
    int n = 3;
    int status = EXIT_FAILURE;
    FILE* fp;
    _Complex double* yn = malloc(sizeof *yn * (size_t)(n*n));
    double a[] = {
        0.5, 0.0, 0.5,
        0.5, 0.5, 0.0
    };
    double b[] = {
        1.0, 0.0, -1.0,
        1.0, -1.0, 0.0
    };
    _Complex double zt[] = {
        (0.004 + 0.09*I), (-0.004 - 0.12*I),
        (-0.004 - 0.12*I), (0.005 + 0.11*I)
    };
    _Complex double zl[] = {
        (0.1 + 2.13*I), (-124.004 - 800.12*I),
        (-124.004 - 800.12*I), (0.4 + 4.08*I)
    };

    if (yn == NULL) {
        fprintf(stderr, "out of memory allocating yn\n");
        goto done;
    }
    if (calc_yn(yn, a, b, zl, zt, m, n) != 0) {
        fprintf(stderr, "calc_yn failed\n");
        goto done;
    }
    fp = fopen("yn.csv", "w");
    if (fp == NULL) {
        perror("yn.csv");
        goto done;
    }
    print_zmatrix_file(n, n, yn, n, fp);
    /* fclose flushes buffered output, so a failure here means a bad file. */
    if (fclose(fp) != 0) {
        perror("yn.csv");
        goto done;
    }
    status = EXIT_SUCCESS;
done:
    free(yn);
    return status;
}
// compile command (MKLROOT is defined by a bash script that is shipped together with intel's MKL):
//gcc -std=c11 -DMKL_ILP64 -m64 -g -o test.a test.c -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl

The code in the previous question had an error in the malloc for yn (it used sizeof(_Complex double*) instead of sizeof(_Complex double)). With that error corrected, the code compiles and runs successfully. After running it, I compared the results with the ones I get from SciPy. They do not agree.

# Reference computation of yn = a.T @ inv(zt) @ a + b.T @ inv(zl) @ b with
# NumPy/SciPy, followed by a comparison against the CSV files written by the
# C program (yn.csv, c1.csv, c2.csv must exist in the working directory).
import numpy
from scipy import linalg

a = numpy.array([[0.5, 0.0, 0.5],
                 [0.5, 0.5, 0.0]])
b = numpy.array([[1.0, 0.0, -1.0],
                 [1.0, -1.0, 0.0]])
zt = numpy.array([[0.004 + 0.09j, -0.004 - 0.12j],
                  [-0.004 - 0.12j, 0.005 + 0.11j]])
# NOTE(review): the off-diagonal entries below have real part +124.004, while
# the C program above uses -124.004; the question's later edit acknowledges
# this as a copy-paste error.
zl = numpy.array([[0.1 + 2.13j, 124.004 - 800.12j],
                  [124.004 - 800.12j, 0.4 + 4.08j]])

# Same sequence of products as the C code: c = inv(z) @ x, then x.T @ c.
c1 = numpy.matmul(linalg.inv(zt), a)
m1 = numpy.matmul(a.T, c1)
c2 = numpy.matmul(linalg.inv(zl), b)
m2 = numpy.matmul(b.T, c2)
yn = m1 + m2

# Load the matrices dumped by the C program.
yn_file = numpy.genfromtxt('yn.csv', delimiter=',', dtype=numpy.complex128)
c1_file = numpy.genfromtxt('c1.csv', delimiter=',', dtype=numpy.complex128)
c2_file = numpy.genfromtxt('c2.csv', delimiter=',', dtype=numpy.complex128)

# Largest absolute entry of each matrix; the trailing comments are the values
# the asker recorded.
numpy.max(numpy.abs(yn)) #0.004958820819049211
numpy.max(numpy.abs(yn_file)) #60.4590237745794

numpy.max(numpy.abs(c1)) #25.549314567403204
numpy.max(numpy.abs(c1_file)) #41.278805716697306

numpy.max(numpy.abs(c2)) #0.0012411403762584482
numpy.max(numpy.abs(c2_file)) #0.03292682468747935
There is something wrong in either my C code or the Python code. Why am I getting different results?


Edit: further testing as per @Bwebb's suggestion. He noticed a copy-paste error where -124.004 - 800.12i appears as +124.004 - 800.12i in the Python code. Correcting that does not make the results agree.

To make it easier to test, I used the matrices:

# Simplified 2-by-2 test case: a is the identity and b swaps and negates.
a = numpy.array([[1.0, 0.0],
                 [0.0, 1.0]])
b = numpy.array([[0.0, -1.0],
                 [-1.0, 0.0]])
# Plain assignment: zt and zl are the same array objects as a and b.
zt = a
zl = b

Which results in

yn = [[1.0, -1.0]
      [-1.0, 1.0]]

The Python code gives that result, but the C one gives

yn = [[0.0 + 2.0j, 1.0 + 2.0j]
      [-1.0 + 2.0j, 0.0 + 0.0j]]

That makes me conclude that the C code is wrong, but I do not know where.


Solution

  • That behavior (most likely undefined) is caused by passing double arrays to zgemm instead of _Complex double arrays. When I change matrices a and b to be complex, I get the expected result.

    Here is the fixed C code for testing:

    #include <math.h>
    #include <complex.h>
    #include <stdlib.h>
    #include <stdio.h>
    #include <mkl_types.h>
    #define MKL_Complex16 _Complex double //overwrite type
    #include <mkl.h>
    #include <mkl_lapacke.h>
    
    /*
     * Write an m-by-n complex matrix to fp as comma-separated text, one matrix
     * row per output line. Every entry is printed as "(re+imj)" with six
     * decimal places and an explicit sign on the imaginary part.
     *
     * Element (i, j) is read from a[i*lda + j], so lda is the distance (in
     * elements) between the starts of consecutive rows.
     *
     * Always returns 0; the results of the individual writes are not inspected.
     */
    int print_zmatrix_file(int m, int n, _Complex double* a, int lda, FILE* fp)
    {
        for (int row = 0; row < m; ++row) {
            for (int col = 0; col < n; ++col) {
                const _Complex double z = a[row * lda + col];

                /* Separator goes between entries, never after the last one. */
                if (col > 0) {
                    fputs(",", fp);
                }
                fprintf(fp, "(%.6f%+.6fj)", creal(z), cimag(z));
            }
            fputs("\n", fp);
        }
        return 0;
    }
    
    /*
     * Invert the m-by-m row-major complex matrix z in place: LU factorization
     * followed by inversion from the factors. ipiv is caller-provided pivot
     * storage with room for m entries.
     *
     * Returns 0 on success, 1 if either LAPACKE call reports a non-zero status
     * (invalid argument or a singular matrix).
     */
    static int invert_in_place(_Complex double* z, int m, lapack_int* ipiv)
    {
        if (LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, z, m, ipiv) != 0) {
            return 1;
        }
        if (LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, z, m, ipiv) != 0) {
            return 1;
        }
        return 0;
    }

    /*
     * Compute yn = trans(a)*inv(zt)*a + trans(b)*inv(zl)*b.
     *
     * All matrices are row-major and densely packed:
     *   yn      n-by-n output, fully overwritten on success
     *   a, b    m-by-n inputs, not modified
     *   zl, zt  m-by-m inputs, OVERWRITTEN with their inverses
     *   m, n    dimensions; both must be at least 1 because they double as
     *           the leading dimensions passed to MKL
     *
     * Returns 0 on success, -1 on invalid dimensions or allocation failure,
     * and 1 if zt or zl could not be inverted. On failure yn is left
     * untouched; zt and zl may already have been overwritten.
     */
    int calc_yn(
        _Complex double* yn, _Complex double* a, _Complex double *b,
        _Complex double* zl, _Complex double* zt, int m, int n)
    {
        if (m < 1 || n < 1) {
            return -1;
        }

        lapack_int* ipiv = malloc(sizeof *ipiv * (size_t)m);
        if (ipiv == NULL) {
            return -1;
        }
        /* zt <- inv(zt), zl <- inv(zl); stops at the first failure. */
        const int inversion_failed =
            invert_in_place(zt, m, ipiv) || invert_in_place(zl, m, ipiv);
        free(ipiv);
        if (inversion_failed) {
            return 1;
        }

        /*
         * zgemm reads alpha and beta through pointers to complex scalars, so
         * they must be complex objects. Pointing it at a plain double makes it
         * read past the end of the object and use garbage as the imaginary part.
         */
        const _Complex double one = 1.0;
        const _Complex double zero = 0.0;

        /* m-by-n scratch buffer; size computed in size_t to avoid int overflow. */
        lapack_complex_double* c = malloc(sizeof *c * (size_t)m * (size_t)n);
        if (c == NULL) {
            return -1;
        }

        // c <- inv(zt)*a
        cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, m,
                    &one, zt, m, a, n,
                    &zero, c, n);
        // yn <- aT*c
        cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                    n, n, m,
                    &one, a, n, c, n,
                    &zero, yn, n);
        // c <- inv(zl)*b
        cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, m,
                    &one, zl, m, b, n,
                    &zero, c, n);
        // yn <- bT*c + yn  (beta = 1 accumulates onto the first term)
        cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                    n, n, m,
                    &one, b, n, c, n,
                    &one, yn, n);
        free(c);
        return 0;
    }
    
    /*
     * Test driver: builds the 2-by-2 sample inputs, runs calc_yn and writes
     * the resulting matrix to "yn.csv" in the current directory.
     *
     * Returns EXIT_SUCCESS when the file was written, EXIT_FAILURE if the
     * allocation, the computation or the file output failed.
     */
    int main(void)
    {
        int m = 2;
        int n = 2;
        int status = EXIT_FAILURE;
        FILE* fp;
        _Complex double a[] = {
            1.0, 0.0,
            0.0, 1.0
        };
        _Complex double b[] = {
            0.0, -1.0,
            -1.0, 0.0
        };
        _Complex double zt[] = {
            1.0, 0.0,
            0.0, 1.0
        };
        _Complex double zl[] = {
            0.0, -1.0,
            -1.0, 0.0
        };
        _Complex double* yn = malloc(sizeof *yn * (size_t)(n*n));

        if (yn == NULL) {
            fprintf(stderr, "out of memory allocating yn\n");
            goto done;
        }
        if (calc_yn(yn, a, b, zl, zt, m, n) != 0) {
            fprintf(stderr, "calc_yn failed\n");
            goto done;
        }
        fp = fopen("yn.csv", "w");
        if (fp == NULL) {
            perror("yn.csv");
            goto done;
        }
        print_zmatrix_file(n, n, yn, n, fp);
        /* fclose flushes buffered output, so a failure here means a bad file. */
        if (fclose(fp) != 0) {
            perror("yn.csv");
            goto done;
        }
        status = EXIT_SUCCESS;
    done:
        free(yn);
        return status;
    }
    // compile command (MKLROOT is defined by a bash script that is shipped together with intel's MKL):
    //gcc -std=c11 -DMKL_ILP64 -m64 -g -o test.a test.c -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl