Search code examples
c++visual-studiointel-mkl

Run time issues using the Intel's Math Kernel Library for eigendecomposition


to do the the eigendecomposition in C++, I use the routine "zhpev". This routine is embedded in a dll file of a larger piece of software and is exhaustively used during the run time. After ~5000 calls of "zhpev" I measure the run time. Everything is fine for the first 900 run time evaluations. The run time is around 0.7 seconds with little variation. However, 900 run time evaluations, the run time suddenly increases from 0.7 seconds up to 2.7 seconds with heavy variation.

I made the following observations:

  • The run time issues occur independently to the input data of "zhpev".
  • The routine "zhpev" runs fine and stable in a small program. It seems that the interaction with other parts causes the trouble.
  • After replacing "zhpev" with another routine for eigendecomposition, everything runs smoothly with little variations in run time.
  • The run time issues occur with and without multi-threading.
  • I do not use dynamic memory allocation. All variables are allocated as static variables.
  • The problem is similar to Visual C++ function suddenly 170 ms slower (4x longer) , however, I could not detect any memory leaks in my code.

Sorry that I could not post any code since the project I'm working with is too large.

I would appreciate any hint which helps me to stop this strange behaviour!

Edit The routine "zhpev" works on complex Hermitian matrices with size 32x32 in double precision. So, the chunks of data processed at a time are rather small.

Update 1) Paging is not the problem here. I disabled the paging file in the system options. The run time issue is still not fixed. 2) Running the application on a different Windows computer also leads to the same run time issues. However, the starting of the run time increase occurs now later, after 1400 run time evaluations.

Update I found out that the run time problems occur only if I call "zhpev" inside a thread. With this I can create a small code example where I run into the same problems.

Let me explain my code

This is my code

#include <windows.h>
#include <tchar.h>
#include <strsafe.h>
#include "stdafx.h"
#include "mkl_lapack.h"
#include "mkl_service.h"
#include <time.h>
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <iostream> 

using namespace std;
#define CACHE_LINE  32
#define CACHE_ALIGN __declspec(align(CACHE_LINE))

#define MAX_THREADS 2
#define BUF_SIZE 255

DWORD WINAPI MyThreadFunction( LPVOID lpParam );
void ErrorHandler(LPTSTR lpszFunction);

// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// This is the critical function.
void Eigendecomposition();
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

typedef struct MyData {
    int val1;
    int val2;
} MYDATA, *PMYDATA;



int _tmain()
{
    PMYDATA pDataArray[MAX_THREADS];
    DWORD   dwThreadIdArray[MAX_THREADS];
    HANDLE  hThreadArray[MAX_THREADS]; 

    std::ofstream ofs;

    double tstart;
    double tend;

    double proc_time_pure;

    for(int j=0;j<10000;j++){

    // Start one iteration
    tstart = clock(); 



    // Create MAX_THREADS worker threads.

    for( int i=0; i<MAX_THREADS; i++ )
    {


        pDataArray[i] = (PMYDATA) HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY,
                sizeof(MYDATA));

        if( pDataArray[i] == NULL )
        {

            ExitProcess(2);
        }



        pDataArray[i]->val1 = i;
        pDataArray[i]->val2 = i+100;

        // Create the thread to begin execution on its own.

        hThreadArray[i] = CreateThread( 
            NULL,                   // default security attributes
            0,                      // use default stack size  
            MyThreadFunction,       // thread function name
            pDataArray[i],          // argument to thread function 
            0,                      // use default creation flags 
            &dwThreadIdArray[i]);   // returns the thread identifier 




        if (hThreadArray[i] == NULL) 
        {
           ErrorHandler(TEXT("CreateThread"));
           ExitProcess(3);
        }
    } // End of main thread creation loop.

    // Wait until all threads have terminated.

    WaitForMultipleObjects(MAX_THREADS, hThreadArray, TRUE, INFINITE);



    for(int i=0; i<MAX_THREADS; i++)
    {
        CloseHandle(hThreadArray[i]);
        if(pDataArray[i] != NULL)
        {
            HeapFree(GetProcessHeap(), 0, pDataArray[i]);
            pDataArray[i] = NULL;    // Ensure address is not reused.
        }
    }

    tend = clock();
    proc_time_pure = tend-tstart;

    // Print processing time into console and write it into a file
    printf("   Processing time: %4.3f \n", proc_time_pure/1000.0);
    ofs.open ("Processing_time.txt", std::ofstream::out | std::ofstream::app);

      ofs << proc_time_pure/1000.0 << " ";

      ofs.close();
    }
    return 0;
}


DWORD WINAPI MyThreadFunction( LPVOID lpParam ) 
{ 
    HANDLE hStdout;
    PMYDATA pDataArray;

    TCHAR msgBuf[BUF_SIZE];
    size_t cchStringSize;
    DWORD dwChars;



    hStdout = GetStdHandle(STD_OUTPUT_HANDLE);
    if( hStdout == INVALID_HANDLE_VALUE )
        return 1;



    pDataArray = (PMYDATA)lpParam;
    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   // Critical function
    Eigendecomposition();
    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    return 0; 
} 



void ErrorHandler(LPTSTR lpszFunction) 
{ 
    // Retrieve the system error message for the last-error code.

    LPVOID lpMsgBuf;
    LPVOID lpDisplayBuf;
    DWORD dw = GetLastError(); 

    FormatMessage(
        FORMAT_MESSAGE_ALLOCATE_BUFFER | 
        FORMAT_MESSAGE_FROM_SYSTEM |
        FORMAT_MESSAGE_IGNORE_INSERTS,
        NULL,
        dw,
        MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
        (LPTSTR) &lpMsgBuf,
        0, NULL );

    // Display the error message.

    lpDisplayBuf = (LPVOID)LocalAlloc(LMEM_ZEROINIT, 
        (lstrlen((LPCTSTR) lpMsgBuf) + lstrlen((LPCTSTR) lpszFunction) + 40) * sizeof(TCHAR)); 
    StringCchPrintf((LPTSTR)lpDisplayBuf, 
        LocalSize(lpDisplayBuf) / sizeof(TCHAR),
        TEXT("%s failed with error %d: %s"), 
        lpszFunction, dw, lpMsgBuf); 
    MessageBox(NULL, (LPCTSTR) lpDisplayBuf, TEXT("Error"), MB_OK); 

    // Free error-handling buffer allocations.

    LocalFree(lpMsgBuf);
    LocalFree(lpDisplayBuf);
}

void Eigendecomposition(){
    const int M = 32;
    typedef MKL_Complex16  double_complex;
    const char    jobz = 'V';
    const char    uplo = 'L'; // lower triangular part of input matrix is used
    const MKL_INT dim = M;
    const MKL_INT ldz = M;
    const MKL_INT LWORK = (2*M-1);
    const MKL_INT LRWORK = (3*M-2);
    MKL_INT       info = 0;


    double_complex A_H_MKL[(M*M+M)/2];

    CACHE_ALIGN double_complex       work[LWORK]; 
    CACHE_ALIGN double               rwork[LRWORK];

    double D[M];
    double_complex U[M][M];
    for(int i=0;i<500;i++ ){
    // Create the input matrix
    for (int tmp=0; tmp < (M*M+M)/2; tmp++){
        A_H_MKL[tmp].real = 1  ;
        A_H_MKL[tmp].imag = 0;}

    // This is the mkl function
        zhpev(&jobz,                                // const char* jobz,
          &uplo,                                // const char* uplo,
          &dim,                                 // const MKL_INT* n,
          (double_complex *)&A_H_MKL[0],        // double_complex* ap,
          (double *)&D[0],                      // double* w,
          (double_complex *)&U[0][0],           // double_complex* z,
          &ldz,                                 // const MKL_INT* ldz,
          work,                                 // double_complex* work,
          rwork,                                // double* rwork,
          &info);                               // MKL_INT* info


}
}

Solution

  • I found the bug in my code. I run the routine for eigendecomposition inside a thread that was created with the windows function CreatThread. However, there was no function to end the thread as for example the WaitForMultipleObjects-routine. For all the other parts of my application this was not a problem but the eigendecomposition run into difficulties.