I have an issue with my parallel Jacobi solver written in CUDA: it does not converge. I have a working serial version, and unfortunately I cannot spot any obvious mistake in the parallel implementation. Any hint would be appreciated! You can try the code yourself; it shows that the parallel version does not converge.
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>
using std::cout;
using std::endl;

constexpr int N{ 1024 };
constexpr int block_size = 256;
constexpr float epsilon = 1e-4;

// CUDA error-checking macro (standard pattern)
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)
void fillMatrix(float* M, const int size) {
    for (int i = 0; i < size; ++i) {
        M[i * size + i] = size + 1;
        for (int j = 0; j < size; ++j) {
            if (j != i) {
                M[j + size * i] = 1;
            }
        }
    }
}

void fillVector(float* M, const int size) {
    for (int i = 0; i < size; ++i) {
        M[i] = i;
    }
}
void solveJacobi(const float* A, const float* b, float* xNew, float* xOld, const int size) {
    for (int i = 0; i < size; ++i) {
        xOld[i] = xNew[i];
        xNew[i] = 0.0;
        for (int j = 0; j < size; ++j) {
            if (j != i) {
                xNew[i] -= A[i * size + j] * xOld[j];
            }
        }
        xNew[i] += b[i];
        xNew[i] /= A[i * size + i];
    }
}

void computeNorm(const float* xNew, const float* xOld, const int size, float* norm) {
    *norm = 0;
    for (int i = 0; i < size; ++i) {
        *norm += (xOld[i] - xNew[i]) * (xOld[i] - xNew[i]);
    }
    *norm = sqrt(*norm);
}
__global__ void solveJacobiCuda(const float* A, const float* b, float* xNew, float* xOld, const int size) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x; // create thread x index
    if (idx < size) {
        xOld[idx] = xNew[idx];
        xNew[idx] = 0;
        for (int i = 0; i < size; ++i) {
            if (i != idx) {
                xNew[idx] -= A[idx * size + i] * xOld[i];
            }
        }
        xNew[idx] += b[idx];
        xNew[idx] /= A[idx * size + idx];
    }
}
int main() {
    float* h_A = new float[N * N];
    float* h_b = new float[N];
    float* h_bStar = new float[N];
    float* h_xOld = new float[N];
    float* h_xNew = new float[N];
    float* h_norm = new float{std::numeric_limits<float>::max()};

    fillMatrix(h_A, N);
    fillVector(h_b, N);
    fillVector(h_xOld, N);
    fillVector(h_xNew, N);

    float* d_A;
    float* d_b;
    float* d_xOld;
    float* d_xNew;
    cudaMalloc(&d_A, N * N * sizeof(float));
    cudaMalloc(&d_b, N * sizeof(float));
    cudaMalloc(&d_xOld, N * sizeof(float));
    cudaMalloc(&d_xNew, N * sizeof(float));
    cudaMemcpy(d_A, h_A, N * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice);

    int iterationCounter{ 0 };
    while (*h_norm > epsilon && iterationCounter < 10000) {
        ++iterationCounter;
        //solveJacobi(h_A, h_b, h_xNew, h_xOld, N);
        cudaMemcpy(d_xOld, h_xOld, N * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_xNew, h_xNew, N * sizeof(float), cudaMemcpyHostToDevice);
        // Launch kernel
        solveJacobiCuda<<<(N + block_size - 1) / block_size, block_size>>>(d_A, d_b, d_xNew, d_xOld, N);
        cudaCheckErrors("kernel launch failure");
        cudaMemcpy(h_xNew, d_xNew, N * sizeof(float), cudaMemcpyDeviceToHost);
        cudaMemcpy(h_xOld, d_xOld, N * sizeof(float), cudaMemcpyDeviceToHost);
        computeNorm(h_xNew, h_xOld, N, h_norm);
    }

    // compute b* = A * x to check the solution
    for (int i = 0; i < N; ++i) {
        h_bStar[i] = 0;
        for (int j = 0; j < N; ++j) {
            h_bStar[i] += h_A[i * N + j] * h_xNew[j];
        }
    }

    cout << "Jacobi finished" << endl;
    cout << "N_iterations = " << iterationCounter << endl;
    cout << "Final norm = " << *h_norm << endl;
    cout << "b*[0] = " << h_bStar[0] << endl;
    cout << "b*[N-1] = " << h_bStar[N - 1] << endl;
    return 0;
}
Both the sequential solveJacobi() and the parallel solveJacobiCuda() are incorrect, because the copy from xNew to xOld was wrongly "loop-fused" with the matrix-vector product. In the sequential case this actually causes faster convergence: the use of the partially outdated xOld effectively turns the method into a Gauss-Seidel-type algorithm, which converges faster than plain Jacobi. In the parallel case the problem is more serious, because the kernel fusion now causes a race condition on xOld. This makes the result non-deterministic while not even providing the faster Gauss-Seidel-style convergence.
Such a race condition can be avoided by grid synchronization, which can be achieved within a single kernel using CUDA's cooperative-groups API, or more easily by splitting the work into two kernels launched on the same CUDA stream, as sketched below.
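To illustrate the kernel-fission variant, here is a minimal sketch; copyKernel is a hypothetical helper, and solveJacobiCuda is assumed to be modified so that it no longer performs the copy itself. Kernels launched on the same stream execute in order, so the copy has completed across the whole grid before the update kernel starts.

// Hypothetical sketch: split the fused kernel into a copy kernel and an
// update kernel. Back-to-back kernels on the same stream are serialized,
// which acts as a grid-wide synchronization between the two steps.
__global__ void copyKernel(float* xOld, const float* xNew, const int size) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < size) {
        xOld[idx] = xNew[idx];
    }
}

// Per iteration (default stream), assuming solveJacobiCuda no longer copies:
// copyKernel<<<(N + block_size - 1) / block_size, block_size>>>(d_xOld, d_xNew, N);
// solveJacobiCuda<<<(N + block_size - 1) / block_size, block_size>>>(d_A, d_b, d_xNew, d_xOld, N);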
While the problems can be fixed, in terms of correctness, by loop-/kernel-fission, this still leaves a lot of performance on the table, because copying the elements from xNew to xOld is inefficient in the first place. In both the sequential and the parallel case it is much more efficient to swap the pointers xNew and xOld between iterations instead of copying the actual values in memory. Additionally, the initialization of xNew to zero can be avoided by accumulating the matrix-vector product in a local variable and writing the result to xNew only at the end of the kernel function. Both optimizations are applied in the implementation below.
Correct implementation:
#include <cmath>
#include <cstdio>
#include <iostream>
#include <limits>
#include <utility>
using std::cout;
using std::endl;

constexpr int N{ 1024 };
constexpr int block_size = 256;
constexpr float epsilon = 1e-4;
// proper CUDA runtime error checking
// from https://stackoverflow.com/a/14038590/10107454
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
void fillMatrix(float* M, const int size) {
    for (int i = 0; i < size; ++i) {
        M[i * size + i] = size + 1;
        for (int j = 0; j < size; ++j) {
            if (j != i) {
                M[j + size * i] = 1;
            }
        }
    }
}

void fillVector(float* M, const int size) {
    for (int i = 0; i < size; ++i) {
        M[i] = i;
    }
}
void solveJacobi(const float* A, const float* b, float* xNew, const float* xOld, const int size) {
    for (int i = 0; i < size; ++i) {
        float accumulator = 0.0f;
        for (int j = 0; j < size; ++j) {
            if (j != i) {
                accumulator -= A[i * size + j] * xOld[j];
            }
        }
        accumulator += b[i];
        xNew[i] = accumulator / A[i * size + i];
    }
}

void computeNorm(const float* xNew, const float* xOld, const int size, float* norm) {
    *norm = 0;
    for (int i = 0; i < size; ++i) {
        const float diff = xOld[i] - xNew[i];
        *norm += diff * diff;
    }
    *norm = sqrt(*norm);
}
__global__ void solveJacobiCuda(const float* A, const float* b, float* xNew, const float* xOld, const int size) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x; // create thread x index
    if (idx < size) {
        float accumulator = 0;
        for (int i = 0; i < size; ++i) {
            if (i != idx) {
                accumulator -= A[idx * size + i] * xOld[i];
            }
        }
        accumulator += b[idx];
        xNew[idx] = accumulator / A[idx * size + idx];
    }
}
int main() {
    float* h_A = new float[N * N];
    float* h_b = new float[N];
    float* h_bStar = new float[N];
    float* h_xOld = new float[N];
    float* h_xNew = new float[N];
    float* h_norm = new float{std::numeric_limits<float>::max()};

    fillMatrix(h_A, N);
    fillVector(h_b, N);
    // fillVector(h_xNew, N); not needed: xNew is fully overwritten by the kernel
    fillVector(h_xOld, N);

    float* d_A;
    float* d_b;
    float* d_xOld;
    float* d_xNew;
    gpuErrchk(cudaMalloc(&d_A, N * N * sizeof(float)));
    gpuErrchk(cudaMalloc(&d_b, N * sizeof(float)));
    gpuErrchk(cudaMalloc(&d_xOld, N * sizeof(float)));
    gpuErrchk(cudaMalloc(&d_xNew, N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, N * N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_xOld, h_xOld, N * sizeof(float), cudaMemcpyHostToDevice));

    int iterationCounter{ 0 };
    while (*h_norm > epsilon && iterationCounter < 10000) {
        ++iterationCounter;
        // solveJacobi(h_A, h_b, h_xNew, h_xOld, N);
        // Launch kernel
        solveJacobiCuda<<<(N + block_size - 1) / block_size, block_size>>>(d_A, d_b, d_xNew, d_xOld, N);
        gpuErrchk(cudaGetLastError());
        gpuErrchk(cudaMemcpy(h_xNew, d_xNew, N * sizeof(float), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_xOld, d_xOld, N * sizeof(float), cudaMemcpyDeviceToHost));
        std::swap(d_xOld, d_xNew);
        // TODO this needs to be done on the GPU as well to achieve good performance.
        computeNorm(h_xNew, h_xOld, N, h_norm);
        std::swap(h_xNew, h_xOld);
    }
    std::swap(h_xNew, h_xOld); // unswap for b* calculation

    // compute b* = A * x to check the solution
    for (int i = 0; i < N; ++i) {
        h_bStar[i] = 0;
        for (int j = 0; j < N; ++j) {
            h_bStar[i] += h_A[i * N + j] * h_xNew[j];
        }
    }

    cout << "Jacobi finished" << endl;
    cout << "N_iterations = " << iterationCounter << endl;
    cout << "Final norm = " << *h_norm << endl;
    cout << "b*[0] = " << h_bStar[0] << endl;
    cout << "b*[N-1] = " << h_bStar[N - 1] << endl;

    // release device and host memory
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_xOld));
    gpuErrchk(cudaFree(d_xNew));
    delete[] h_A;
    delete[] h_b;
    delete[] h_bStar;
    delete[] h_xOld;
    delete[] h_xNew;
    delete h_norm;
    return 0;
}
To parallelize computeNorm as the next step, I recommend cub::DeviceReduce::TransformReduce(), which is available in newer versions of CUB.
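A rough sketch of what that could look like, assuming a CUB version that ships DeviceReduce::TransformReduce; SquaredDiff and d_normSq are hypothetical names introduced here:

#include <cub/cub.cuh>

// Maps an index to the squared difference of the two vectors.
struct SquaredDiff {
    const float* xNew;
    const float* xOld;
    __device__ float operator()(int i) const {
        const float d = xOld[i] - xNew[i];
        return d * d;
    }
};

// One-time setup:
float* d_normSq = nullptr; // device scalar holding the squared norm
gpuErrchk(cudaMalloc(&d_normSq, sizeof(float)));
cub::CountingInputIterator<int> indices(0);
void* d_temp = nullptr;
size_t tempBytes = 0;
// First call only queries the required temporary storage size.
cub::DeviceReduce::TransformReduce(d_temp, tempBytes, indices, d_normSq, N,
                                   cub::Sum{}, SquaredDiff{d_xNew, d_xOld}, 0.0f);
gpuErrchk(cudaMalloc(&d_temp, tempBytes));

// Inside the iteration loop, replacing the host-side computeNorm():
cub::DeviceReduce::TransformReduce(d_temp, tempBytes, indices, d_normSq, N,
                                   cub::Sum{}, SquaredDiff{d_xNew, d_xOld}, 0.0f);
// Copy the single float back to the host and take the square root there.

The temporary storage can be allocated once and reused across iterations, so only a single-float copy per iteration remains on the host side.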
Depending on the size of the matrices and on the GPU, I would also try to parallelize solveJacobiCuda further, e.g. by using a whole block per row and cooperative_groups::reduce to parallelize the loop over the columns, which is sequential in this version. Properly implemented, such an algorithm will also achieve coalesced global memory accesses to A, which is a potential bottleneck here. A sketch of that idea follows.
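A minimal sketch, assuming CUDA 11+ (for cooperative_groups::reduce) and a block size that is a multiple of 32; solveJacobiRowPerBlock is a hypothetical name:

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

// One block per row; the threads of the block cooperate on the column loop.
__global__ void solveJacobiRowPerBlock(const float* A, const float* b,
                                       float* xNew, const float* xOld, const int size) {
    const int row = blockIdx.x;
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);

    // Strided partial sums: consecutive threads read consecutive elements
    // of the row, so the accesses to A are coalesced.
    float partial = 0.0f;
    for (int j = threadIdx.x; j < size; j += blockDim.x) {
        if (j != row) {
            partial -= A[row * size + j] * xOld[j];
        }
    }

    // Reduce within each warp, then combine the warp results in shared memory.
    partial = cg::reduce(warp, partial, cg::plus<float>());
    __shared__ float warpSums[32]; // enough for up to 1024 threads per block
    if (warp.thread_rank() == 0) {
        warpSums[threadIdx.x / 32] = partial;
    }
    block.sync();

    if (threadIdx.x == 0) {
        float sum = 0.0f;
        for (int w = 0; w < blockDim.x / 32; ++w) {
            sum += warpSums[w];
        }
        xNew[row] = (sum + b[row]) / A[row * size + row];
    }
}

// Launch with one block per row:
// solveJacobiRowPerBlock<<<N, block_size>>>(d_A, d_b, d_xNew, d_xOld, N);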