I have an issue with my parallel Jacobi solver written in CUDA: it does not converge. I have a working serial version, and unfortunately I cannot spot any obvious mistake in the parallel implementation. Any hint would be appreciated! You can try the code yourself; it shows that the parallel version does not converge.
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>
using std::cout;
using std::endl;

constexpr int N{ 1024 };
constexpr int block_size = 256;
constexpr float epsilon = 1e-4;

// CUDA error-checking macro (standard pattern)
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)
void fillMatrix(float* M, const int size) {
    for (int i = 0; i < size; ++i) {
        M[i * size + i] = size + 1;
        for (int j = 0; j < size; ++j) {
            if (j != i) {
                M[j + size * i] = 1;
            }
        }
    }
}

void fillVector(float* M, const int size) {
    for (int i = 0; i < size; ++i) {
        M[i] = i;
    }
}
void solveJacobi(const float* A, const float* b, float* xNew, float* xOld, const int size) {
    for (int i = 0; i < size; ++i) {
        xOld[i] = xNew[i];
        xNew[i] = 0.0;
        for (int j = 0; j < size; ++j) {
            if (j != i) {
                xNew[i] -= A[i * size + j] * xOld[j];
            }
        }
        xNew[i] += b[i];
        xNew[i] /= A[i * size + i];
    }
}

void computeNorm(const float* xNew, const float* xOld, const int size, float* norm) {
    *norm = 0;
    for (int i = 0; i < size; ++i) {
        *norm += (xOld[i] - xNew[i]) * (xOld[i] - xNew[i]);
    }
    *norm = sqrt(*norm);
}
__global__ void solveJacobiCuda(const float* A, const float* b, float* xNew, float* xOld, const int size) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x; // create thread x index
    if (idx < size) {
        xOld[idx] = xNew[idx];
        xNew[idx] = 0;
        for (int i = 0; i < size; ++i) {
            if (i != idx) {
                xNew[idx] -= A[idx * size + i] * xOld[i];
            }
        }
        xNew[idx] += b[idx];
        xNew[idx] /= A[idx * size + idx];
    }
}
int main() {
    float* h_A = new float[N * N];
    float* h_b = new float[N];
    float* h_bStar = new float[N];
    float* h_xOld = new float[N];
    float* h_xNew = new float[N];
    float* h_norm = new float{std::numeric_limits<float>::max()};

    fillMatrix(h_A, N);
    fillVector(h_b, N);
    fillVector(h_xOld, N);
    fillVector(h_xNew, N);

    float* d_A;
    float* d_b;
    float* d_xOld;
    float* d_xNew;
    cudaMalloc(&d_A, N * N * sizeof(float));
    cudaMalloc(&d_b, N * sizeof(float));
    cudaMalloc(&d_xOld, N * sizeof(float));
    cudaMalloc(&d_xNew, N * sizeof(float));
    cudaMemcpy(d_A, h_A, N * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice);

    int iterationCounter{ 0 };
    while (*h_norm > epsilon && iterationCounter < 10000) {
        ++iterationCounter;
        //solveJacobi(h_A, h_b, h_xNew, h_xOld, N);
        cudaMemcpy(d_xOld, h_xOld, N * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_xNew, h_xNew, N * sizeof(float), cudaMemcpyHostToDevice);
        // Launch kernel
        solveJacobiCuda<<<(N + block_size - 1) / block_size, block_size>>>(d_A, d_b, d_xNew, d_xOld, N);
        cudaCheckErrors("kernel launch failure");
        cudaMemcpy(h_xNew, d_xNew, N * sizeof(float), cudaMemcpyDeviceToHost);
        cudaMemcpy(h_xOld, d_xOld, N * sizeof(float), cudaMemcpyDeviceToHost);
        computeNorm(h_xNew, h_xOld, N, h_norm);
    }

    // compute b* = A * x to check the solution
    for (int i = 0; i < N; ++i) {
        h_bStar[i] = 0;
        for (int j = 0; j < N; ++j) {
            h_bStar[i] += h_A[i * N + j] * h_xNew[j];
        }
    }

    cout << "Jacobi finished" << endl;
    cout << "N_iterations = " << iterationCounter << endl;
    cout << "Final norm = " << *h_norm << endl;
    cout << "b*[0] = " << h_bStar[0] << endl;
    cout << "b*[N-1] = " << h_bStar[N - 1] << endl;
    return 0;
}
Both the sequential solveJacobi() and the parallel solveJacobiCuda() are incorrect, because the copy from xNew to xOld was wrongly "loop-fused" with the matrix-vector product. In the sequential case this actually causes faster convergence: the use of the partially outdated xOld effectively turns the method into a Gauss-Seidel-type algorithm, which converges faster than plain Jacobi. In the parallel case the problem is more serious, because the kernel fusion now causes a race condition on xOld. This makes the result non-deterministic while not even providing the faster Gauss-Seidel-style convergence.
Such a race condition can be avoided by grid synchronization, which can be achieved within a single kernel using CUDA's cooperative-groups API, or more easily by splitting the work into two kernels launched on the same CUDA stream, as sketched below.
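To illustrate the kernel-fission variant, here is a minimal sketch; copyKernel is a hypothetical helper, and solveJacobiCuda is assumed to be modified so that it no longer performs the copy itself. Kernels launched on the same stream execute in order, so the copy has completed across the whole grid before the update kernel starts.

// Hypothetical sketch: split the fused kernel into a copy kernel and an
// update kernel. Back-to-back kernels on the same stream are serialized,
// which acts as a grid-wide synchronization between the two steps.
__global__ void copyKernel(float* xOld, const float* xNew, const int size) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < size) {
        xOld[idx] = xNew[idx];
    }
}

// Per iteration (default stream), assuming solveJacobiCuda no longer copies:
// copyKernel<<<(N + block_size - 1) / block_size, block_size>>>(d_xOld, d_xNew, N);
// solveJacobiCuda<<<(N + block_size - 1) / block_size, block_size>>>(d_A, d_b, d_xNew, d_xOld, N);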
While the problems can be fixed, in terms of correctness, by loop-/kernel-fission, this still leaves a lot of performance on the table, because copying the elements from xNew to xOld is inefficient in the first place. In both the sequential and the parallel case it is much more efficient to swap the pointers xNew and xOld between iterations instead of copying the actual values in memory. Additionally, the initialization of xNew to zero can be avoided by accumulating the matrix-vector product in a local variable and writing the result to xNew only at the end of the kernel function. Both optimizations are applied in the implementation below.
Correct implementation:
#include <cmath>
#include <cstdio>
#include <iostream>
#include <limits>
#include <utility>
using std::cout;
using std::endl;

constexpr int N{ 1024 };
constexpr int block_size = 256;
constexpr float epsilon = 1e-4;
// proper CUDA runtime error checking
// from https://stackoverflow.com/a/14038590/10107454
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
void fillMatrix(float* M, const int size) {
    for (int i = 0; i < size; ++i) {
        M[i * size + i] = size + 1;
        for (int j = 0; j < size; ++j) {
            if (j != i) {
                M[j + size * i] = 1;
            }
        }
    }
}

void fillVector(float* M, const int size) {
    for (int i = 0; i < size; ++i) {
        M[i] = i;
    }
}
void solveJacobi(const float* A, const float* b, float* xNew, const float* xOld, const int size) {
    for (int i = 0; i < size; ++i) {
        float accumulator = 0.0f;
        for (int j = 0; j < size; ++j) {
            if (j != i) {
                accumulator -= A[i * size + j] * xOld[j];
            }
        }
        accumulator += b[i];
        xNew[i] = accumulator / A[i * size + i];
    }
}

void computeNorm(const float* xNew, const float* xOld, const int size, float* norm) {
    *norm = 0;
    for (int i = 0; i < size; ++i) {
        const float diff = xOld[i] - xNew[i];
        *norm += diff * diff;
    }
    *norm = sqrt(*norm);
}
__global__ void solveJacobiCuda(const float* A, const float* b, float* xNew, const float* xOld, const int size) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x; // create thread x index
    if (idx < size) {
        float accumulator = 0;
        for (int i = 0; i < size; ++i) {
            if (i != idx) {
                accumulator -= A[idx * size + i] * xOld[i];
            }
        }
        accumulator += b[idx];
        xNew[idx] = accumulator / A[idx * size + idx];
    }
}
int main() {
    float* h_A = new float[N * N];
    float* h_b = new float[N];
    float* h_bStar = new float[N];
    float* h_xOld = new float[N];
    float* h_xNew = new float[N];
    float* h_norm = new float{std::numeric_limits<float>::max()};

    fillMatrix(h_A, N);
    fillVector(h_b, N);
    // fillVector(h_xNew, N); not needed: xNew is fully overwritten by the kernel
    fillVector(h_xOld, N);

    float* d_A;
    float* d_b;
    float* d_xOld;
    float* d_xNew;
    gpuErrchk(cudaMalloc(&d_A, N * N * sizeof(float)));
    gpuErrchk(cudaMalloc(&d_b, N * sizeof(float)));
    gpuErrchk(cudaMalloc(&d_xOld, N * sizeof(float)));
    gpuErrchk(cudaMalloc(&d_xNew, N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, N * N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_xOld, h_xOld, N * sizeof(float), cudaMemcpyHostToDevice));

    int iterationCounter{ 0 };
    while (*h_norm > epsilon && iterationCounter < 10000) {
        ++iterationCounter;
        // solveJacobi(h_A, h_b, h_xNew, h_xOld, N);
        // Launch kernel
        solveJacobiCuda<<<(N + block_size - 1) / block_size, block_size>>>(d_A, d_b, d_xNew, d_xOld, N);
        gpuErrchk(cudaGetLastError());
        gpuErrchk(cudaMemcpy(h_xNew, d_xNew, N * sizeof(float), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_xOld, d_xOld, N * sizeof(float), cudaMemcpyDeviceToHost));
        std::swap(d_xOld, d_xNew);
        // TODO this needs to be done on the GPU as well to achieve good performance.
        computeNorm(h_xNew, h_xOld, N, h_norm);
        std::swap(h_xNew, h_xOld);
    }
    std::swap(h_xNew, h_xOld); // unswap for b* calculation

    // compute b* = A * x to check the solution
    for (int i = 0; i < N; ++i) {
        h_bStar[i] = 0;
        for (int j = 0; j < N; ++j) {
            h_bStar[i] += h_A[i * N + j] * h_xNew[j];
        }
    }

    cout << "Jacobi finished" << endl;
    cout << "N_iterations = " << iterationCounter << endl;
    cout << "Final norm = " << *h_norm << endl;
    cout << "b*[0] = " << h_bStar[0] << endl;
    cout << "b*[N-1] = " << h_bStar[N - 1] << endl;

    // release device and host memory
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_xOld));
    gpuErrchk(cudaFree(d_xNew));
    delete[] h_A;
    delete[] h_b;
    delete[] h_bStar;
    delete[] h_xOld;
    delete[] h_xNew;
    delete h_norm;
    return 0;
}
To parallelize computeNorm as the next step, I recommend cub::DeviceReduce::TransformReduce(), which is available in newer versions of CUB.
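A rough sketch of what that could look like, assuming a CUB version that ships DeviceReduce::TransformReduce; SquaredDiff and d_normSq are hypothetical names introduced here:

#include <cub/cub.cuh>

// Maps an index to the squared difference of the two vectors.
struct SquaredDiff {
    const float* xNew;
    const float* xOld;
    __device__ float operator()(int i) const {
        const float d = xOld[i] - xNew[i];
        return d * d;
    }
};

// One-time setup:
float* d_normSq = nullptr; // device scalar holding the squared norm
gpuErrchk(cudaMalloc(&d_normSq, sizeof(float)));
cub::CountingInputIterator<int> indices(0);
void* d_temp = nullptr;
size_t tempBytes = 0;
// First call only queries the required temporary storage size.
cub::DeviceReduce::TransformReduce(d_temp, tempBytes, indices, d_normSq, N,
                                   cub::Sum{}, SquaredDiff{d_xNew, d_xOld}, 0.0f);
gpuErrchk(cudaMalloc(&d_temp, tempBytes));

// Inside the iteration loop, replacing the host-side computeNorm():
cub::DeviceReduce::TransformReduce(d_temp, tempBytes, indices, d_normSq, N,
                                   cub::Sum{}, SquaredDiff{d_xNew, d_xOld}, 0.0f);
// Copy the single float back to the host and take the square root there.

The temporary storage can be allocated once and reused across iterations, so only a single-float copy per iteration remains on the host side.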
Depending on the size of the matrices and on the GPU, I would also try to parallelize solveJacobiCuda further, e.g. by using a whole block per row and cooperative_groups::reduce to parallelize the loop over the columns, which is sequential in this version. Properly implemented, such an algorithm will also achieve coalesced global memory accesses to A, which is a potential bottleneck here. A sketch of that idea follows.
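A minimal sketch, assuming CUDA 11+ (for cooperative_groups::reduce) and a block size that is a multiple of 32; solveJacobiRowPerBlock is a hypothetical name:

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

// One block per row; the threads of the block cooperate on the column loop.
__global__ void solveJacobiRowPerBlock(const float* A, const float* b,
                                       float* xNew, const float* xOld, const int size) {
    const int row = blockIdx.x;
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);

    // Strided partial sums: consecutive threads read consecutive elements
    // of the row, so the accesses to A are coalesced.
    float partial = 0.0f;
    for (int j = threadIdx.x; j < size; j += blockDim.x) {
        if (j != row) {
            partial -= A[row * size + j] * xOld[j];
        }
    }

    // Reduce within each warp, then combine the warp results in shared memory.
    partial = cg::reduce(warp, partial, cg::plus<float>());
    __shared__ float warpSums[32]; // enough for up to 1024 threads per block
    if (warp.thread_rank() == 0) {
        warpSums[threadIdx.x / 32] = partial;
    }
    block.sync();

    if (threadIdx.x == 0) {
        float sum = 0.0f;
        for (int w = 0; w < blockDim.x / 32; ++w) {
            sum += warpSums[w];
        }
        xNew[row] = (sum + b[row]) / A[row * size + row];
    }
}

// Launch with one block per row:
// solveJacobiRowPerBlock<<<N, block_size>>>(d_A, d_b, d_xNew, d_xOld, N);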