CUDA tex1Dfetch() wrong behaviour

I'm very new to CUDA programming and I'm facing a problem which is driving me crazy. What's going on: I have very simple program (just for study purpose) where one input image and one output image 16x16 is created. The input image is initialized to values from 0..255 and then it is bound to texture. The CUDA kernel just copies the input image to the output image. The input image values are obtained by calling the tex1Dfetch() which returns very strange values in some cases. Please see the code below, the comments inside the kernel and the output of the program. The code is complete and compilable so that you can create a CUDA project in VC and paste the code to the main ".cu" file.

Please help me! What I'm doing wrong?

I'm using VS 2013 Community and CUDA SDK 6.5 + CUDA integration for VS 2013.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

texture<unsigned char> tex;

cudaError_t testMyKernel(unsigned char * inputImg, unsigned char * outputImg, int width, int height);

__global__ void myKernel(unsigned char *outImg, int width)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = row*width + col;
    __shared__ unsigned char input;
    __shared__ unsigned char input2;
    unsigned char *outPix = outImg + idx;

    //It fetches strange value, for example, when the idx==0 then the input is 51. 
    //But I expect that input==idx (according to the input image initialization).   
    input = tex1Dfetch(tex, idx);
    printf("Fetched for idx=%d: %d\n", idx, input);
    *outPix = input;

    //Very strange is that when I test the following code then the tex1Dfetch() returns correct values.
    if (idx == 0)
    {   
        printf("\nKernel test print:\n");
        for (int i = 0; i < 256; i++)
        {
            input2 = tex1Dfetch(tex, i);
            printf("%d,", input2);
        }
    }
}

int main()
{
    const int width = 16;
    const int height = 16;
    const int count = width * height;
    unsigned char imgIn[count];
    unsigned char imgOut[count];

    for (int i = 0; i < count; i++)
    {
        imgIn[i] = i;
    }

    cudaError_t cudaStatus = testMyKernel(imgIn, imgOut, width, height);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "testMyKernel failed!");
        return 1;
    }

    printf("\n\nOutput values:\n");
    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            printf("%d,", imgOut[i * width + j]);
        }
    }
    printf("\n");

    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    getchar();
    return 0;
}


cudaError_t testMyKernel(unsigned char * inputImg, unsigned char * outputImg, int width, int height)
{
    unsigned char * dev_in;
    unsigned char * dev_out;

    size_t size = width * height * sizeof(unsigned char);
    cudaError_t cudaStatus;

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // input data
    cudaStatus = cudaMalloc((void**)&dev_in, size);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_in, inputImg, size, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaBindTexture(NULL, tex, dev_in, size);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaBindTexture failed!");
        goto Error;
    }

    // output data
    cudaStatus = cudaMalloc((void**)&dev_out, size);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    dim3 threadsPerBlock(4, 4);
    int blk_x = width / threadsPerBlock.x;  
    int blk_y = height / threadsPerBlock.y;
    dim3 numBlocks(blk_x, blk_y);

    // Launch a kernel on the GPU with one thread for each element.
    myKernel<<<numBlocks, threadsPerBlock>>>(dev_out, width);

    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "myKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching myKernel!\n", cudaStatus);
        goto Error;
    }

    //copy output image to host
    cudaStatus = cudaMemcpy(outputImg, dev_out, size, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaUnbindTexture(tex);
    cudaFree(dev_in);
    cudaFree(dev_out);

    return cudaStatus;
}

And here is the output of the program (truncated little bit):

Fetched for idx=0: 51
Fetched for idx=1: 51
Fetched for idx=2: 51
Fetched for idx=3: 51
Fetched for idx=16: 51
Fetched for idx=17: 51
Fetched for idx=18: 51
Fetched for idx=19: 51
Fetched for idx=32: 51
Fetched for idx=33: 51
Fetched for idx=34: 51
Fetched for idx=35: 51
Fetched for idx=48: 51
Fetched for idx=49: 51
Fetched for idx=50: 51
Fetched for idx=51: 51
Fetched for idx=192: 243
Fetched for idx=193: 243
Fetched for idx=194: 243
Fetched for idx=195: 243
Fetched for idx=208: 243
Fetched for idx=209: 243
Fetched for idx=210: 243
Fetched for idx=211: 243
Fetched for idx=224: 243
etc... (output truncated.. see the Output values)

Kernel test print:
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,
30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56
etc...(correct values)

Output values:
51,51,51,51,55,55,55,55,59,59,59,59,63,63,63,63,51,51,51,51,55,55,55,55,59,59,59
,59,63,63,63,63,51,51,51,51,55,55,55,55,59,59,59,59,63,63,63,63,51,51,51,51,55,55,
etc.. (wrong values)

Solution

This line of the kernel

input = tex1Dfetch(tex, idx);

is causing race condition among the threads of a block. All threads in a block are trying to fetch value from texture into the __shared__ variable input simultaneously causing undefined behavior. You should allocate separate shared memory space for each thread of the block in the form of a __shared__ array.

For you current case, it may be something like

__shared__ unsigned char input[16]; //4 x 4 block size

The rest of the kernel should look something like:

int idx_local = threadIdx.y * blockDim.x + threadIdx.x; //local id of thread in a block
input[idx_local] = tex1Dfetch(tex, idx);
printf("Fetched for idx=%d: %d\n", idx, input[idx_local]);
*outPix = input[idx_local];

The code inside the condition at the end of the kernel is working fine because due to the specified condition if (idx == 0), only the first thread of the first block will do all the processing serially while all other threads would remain idle, so problem will disappear due to absence of race condition.