Recently I work with CUDA programming, I met an incredible problem when the blockNum Exceeding 500. In order to simplify the mode,I wrote the following test code:
#include <assert.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
/* Example from "Introduction to CUDA C" from NVIDIA website:
https://developer.nvidia.com/cuda-education
Compile with:
$ nvcc example_intro.cu */
#define num 1000
const int N = num*32*12;
__global__ void add_blocks (int *a, int *c) {
int threadId = blockIdx.x * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
int block_id = threadIdx.y;
if(threadId % 2 == 0){
c[threadId] = 1;
}
}
int main(void) {
int *a, *c;
int *d_a, *d_c; /* Device (GPU) copies of a, b, c */
size_t size = N * sizeof(int);
/* Allocate memory in device */
cudaMalloc((void **) &d_a, size);
cudaMalloc((void **) &d_c, size);
/* Allocate memory in host */
a = (int *) malloc(size);
c = (int *) malloc(size);
/* Allocate random data in vectors a and b (inside host) */
for (int i = 0; i < N; ++i) {
a[i] = 0;
c[i] = 0;
}
/* Copy data to device */
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
dim3 threads_per_block(32, 12);
add_blocks<<<num, threads_per_block>>>(d_a,d_c);
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
cudaError_t errSync = cudaGetLastError();
if (errSync != cudaSuccess)
printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
int counter = 0;
for (int i = 0; i < N; ++i) {
if(c[i] == 1){
counter ++;
}
}
printf("%d\n",counter);
/* Clean-up */
free(a);
free(c);
cudaFree(d_a);
cudaFree(d_c);
return 0;
}
when the thread num is multiples of 2, I set the c array with 1, and in the end I count the num of 1, which I think is N/2. It works well when block num is below 500, for example is num*32*12/2 = 500 * 32 * 12 / 2 = 96 000. But when num is 1000 the result is 312846 which should be 192000. Anyone can help me? thanks all.
The problem is in this code:
int counter = 0;
for (int i = 0; i < N; ++i) {
if(c[i] == 1){
counter ++;
}
}
printf("%d\n",counter);
You are implicitly assuming that every value of 1 in c
must have been set by the prior GPU kernel. However, you never set the value of half of the elements in d_c
(and thus c
at this point in the program) at all, so there is no guarantee that some of those also won't have a value of 1. Reading and using the values of unitialized memory isn't amazing, it is just bad programming practice.