CUDA_ERROR_ILLEGAL_ADDRESS when accessing variables in CUDA kernel

I'm getting a CUDA_ERROR_ILLEGAL_ADDRESS exception when trying to run a kernel used for calculating Buddhabrot fractal orbits.

extern "C"

__global__ void exec(int iterations, int size,
                float* inputR,  float* inputI, // Real/Imaginary input
                int* output                    // Output image in one dimension
                ) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    float cR = inputR[i];
    float cI = inputI[i];

    float x = 0;
    float y = 0;

    float outX[1000];
    float outY[1000];

    for (int j = 0; j < iterations; j++) {
        outX[j] = x;
        outY[j] = y;

        float xNew = (x * x) - (y * y) + cR;
        float yNew = (2 * x * y) + cI;

        if (xNew * xNew + yNew * yNew > 4) {
            for (int k = 1; k < j; k++) {
                int curX = (outX[k] + 2 ) * size / 4;
                int curY = (outY[k] + 2 ) * size / 4;

                int idx = curX + size * curY;

                output[idx]++; // <- exception here
            }
            return;
        }

        x = xNew;
        y = yNew;
    }
}

I've tried multiple things now and the error doesn't even seem to be stemming from the array contrary to what I first thought. For example,

output[0] = 0;

will work just fine. However, when I tried to debug idx (Remember I first thought the error was related to the array), i found out that I can neither assign idx like so

output[0] = idx;

nor use it in a printf statement

if (i == 0) {
    printf("%d\n", idx);
}

I've tried the same with curX and curY which also refuse to work, however cR for example will work without any error. There seems to be a problem with the variables assigned inside the innermost loop (I also can't assign k), so I tried declaring idx outside of all loops at the start of the function, but to no avail. Still the same error.

Stack trace:

Exception in thread "main" jcuda.CudaException: CUDA_ERROR_ILLEGAL_ADDRESS
        at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:330)
        at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:1938)
        at fractal.Buddhabrot.<init>(Buddhabrot.java:96)
        at controller.Controller.<init>(Controller.java:10)
        at Main.main(Main.java:8)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:497)
        at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)

Constants:

block size            512*1*1
grid size             64 *1*1
iterations            1000
size                  256
inputR, inputI length 64*512
output length         256*256

MCVE:

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.*;

import java.io.File;
import java.util.Random;

import static jcuda.driver.JCudaDriver.*;

public class Stackoverflow {
    public static final int SIZE = 256;
    public static final long NUM_POINTS = 128 * 128 * 128;
    public static final int ITERATIONS = 10000;

    public static final int BLOCK_SIZE = 512;
    public static final int SIM_THREADS = BLOCK_SIZE * 64;

    public static final Random random = new Random();

    public static void main(String[] args) {
        File ptxFile = new File("Buddha.ptx");

        setExceptionsEnabled(true);
        cuInit(0);

        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);

        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);

        CUmodule module = new CUmodule();
        cuModuleLoad(module, ptxFile.getAbsolutePath());

        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "exec");

        cuCtxSetLimit(CUlimit.CU_LIMIT_PRINTF_FIFO_SIZE, 4096);

        float[] inR = new float[SIM_THREADS];
        float[] inI = new float[SIM_THREADS];

        int[] out = new int[SIZE * SIZE];

        CUdeviceptr deviceInputR = new CUdeviceptr();
        cuMemAlloc(deviceInputR, inR.length * Sizeof.FLOAT);
        CUdeviceptr deviceInputI = new CUdeviceptr();
        cuMemAlloc(deviceInputI, inI.length * Sizeof.FLOAT);

        CUdeviceptr deviceOutput = new CUdeviceptr();
        cuMemAlloc(deviceOutput, out.length * Sizeof.INT);

        for (long i = 0; i < NUM_POINTS; i += SIM_THREADS) {
            for (int j = 0; j < SIM_THREADS; j++) {
                inR[j] = random.nextFloat() * 4f - 2f;
                inI[j] = random.nextFloat() * 4f - 2f;
            }

            System.out.println("GPU START");

            cuMemcpyHtoD(deviceInputR, Pointer.to(inR), inR.length * Sizeof.FLOAT);
            cuMemcpyHtoD(deviceInputI, Pointer.to(inI), inI.length * Sizeof.FLOAT);

            Pointer kernelParameters = Pointer.to(
                    Pointer.to(new int[]{ITERATIONS}),
                    Pointer.to(new int[]{SIZE}),
                    Pointer.to(deviceInputR),
                    Pointer.to(deviceInputI),
                    Pointer.to(deviceOutput)
            );

            int gridSize = (int) Math.ceil(((double) SIM_THREADS) / BLOCK_SIZE);

            cuLaunchKernel(function,
                    gridSize, 1, 1,
                    BLOCK_SIZE, 1, 1,
                    0, null,
                    kernelParameters, null
            );

            cuCtxSynchronize();

            System.out.println("GPU END");
        }

        cuMemcpyDtoH(Pointer.to(out), deviceOutput, out.length * Sizeof.INT);
    }
}

Solution

In your "constants" section you had indicated this:

iterations            1000

but in your java code (after you provided the MCVE) you have this:

public static final int ITERATIONS = 10000;

That can clearly cause this section of your kernel code to break:

float outX[1000];
float outY[1000];

for (int j = 0; j < iterations; j++) {
    outX[j] = x;
    outY[j] = y;

since 10000 for iterations is indexing out of bounds. (The extent of this loop is actually data dependent, but for some data input patterns, the loop will traverse past 1000, as written).

When I change this:

public static final int ITERATIONS = 10000;

to this:

public static final int ITERATIONS = 1000;

your code runs correctly for me:

$ cuda-memcheck java -cp ".:jcuda-0.7.5b.jar" so1
========= CUDA-MEMCHECK
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
========= ERROR SUMMARY: 0 errors
$