When using CUDA-OpenGL interop, NPOT texture results in bad rendering

Here is the code; for now it just tries to render a gradient. What I am ultimately trying to accomplish is to render a raytraced scene with CUDA and then display it on the screen, with the ability to move about. The main problem I am running into is getting my computed image to display correctly: right now I am testing with just a gradient, and I run into issues whenever my screen is not a power-of-two square.

// Frame size in pixels. Used for the window, the GL texture, the device
// buffer allocation and the kernel launch grid.
#define width 1024
#define height 256

// GLFW window that owns the OpenGL context; created in main().
GLFWwindow* window = NULL;
// OpenGL texture name that the rendered frame is displayed from.
GLuint image = 0;
// CUDA graphics-interop handle for `image`, filled in by
// cudaGraphicsGLRegisterImage() in main().
cudaGraphicsResource* screen = NULL;
// Device buffer of width * height uchar4 pixels written by computeFrame().
uchar4* rendered = NULL;
// Kernel that fills `rendered` with a red gradient.
//
// Each thread derives one (x, y) coordinate from its 2D block/thread indices
// and writes a single uchar4 whose red channel is y / height scaled to 0..255,
// with green = blue = 0 and alpha = 255. renderFrame() launches it with 16x16
// thread blocks and a grid rounded up to cover width x height.
//
// rendered: device buffer; main() allocates it with width * height elements.
__global__ void computeFrame(uchar4* rendered){
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Drop threads that fall outside the image.
    // NOTE(review): `>` still admits x == width and y == height, which are one
    // past the last valid column/row; `>=` looks like the intended test.
    if (x > width || y > height) return;
    // Flatten (x, y) to a 1D offset.
    // NOTE(review): the row stride used here is `height`, while the buffer is
    // later copied as a width * height image; this only lines up when
    // width == height. A stride of `width` looks like the intent - confirm.
    int index = y * height + x;
    rendered[index] = make_uchar4(y/(height*1.0f)*255, 0, 0, 255);
}
// Creates the 2D texture the frame is displayed from and stores its name in
// the global `image`. Storage for width x height RGBA / unsigned-byte texels
// is allocated without any initial data, both filters are set to
// nearest-neighbour, and the texture is unbound again before returning.
void createTexture(){
    const GLint baseLevel = 0;           // only level 0 is allocated
    const GLint noBorder = 0;
    const GLvoid* noInitialPixels = NULL; // allocate storage only

    glGenTextures(1, &image);
    glBindTexture(GL_TEXTURE_2D, image);

    glTexImage2D(GL_TEXTURE_2D, baseLevel, GL_RGBA,
                 width, height, noBorder,
                 GL_RGBA, GL_UNSIGNED_BYTE, noInitialPixels);

    // Nearest-neighbour sampling for both magnification and minification.
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);

    glBindTexture(GL_TEXTURE_2D, 0);
}
// Produces one frame and uploads it into the GL texture.
//
// Maps the registered graphics resource `screen`, fetches the CUDA array
// behind it, runs computeFrame() into the device buffer `rendered`, copies
// that buffer into the array and unmaps the resource again.
//
// Returns: always 0. None of the CUDA return codes are checked, so a failure
// in any step is not reported to the caller.
int renderFrame() {
    cudaDeviceSynchronize();
    // The resource must be mapped before CUDA may access the texture's array.
    cudaGraphicsMapResources(1, &screen, 0);
    cudaArray* dstArray;
    cudaGraphicsSubResourceGetMappedArray(&dstArray, screen, 0, 0);
    // 16x16 threads per block; the grid is rounded up so that it covers the
    // whole image even when width/height are not multiples of 16.
    dim3 block(16, 16);
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
    computeFrame << <grid, block >> > (rendered);
    // Device-to-device copy of the whole linear buffer into the texture array.
    // NOTE(review): cudaMemcpyToArray is deprecated and copies one flat run of
    // bytes; a row-wise cudaMemcpy2DToArray with a pitch of
    // width * sizeof(uchar4) looks like the appropriate call here - confirm.
    cudaMemcpyToArray(dstArray,0,0, rendered, width * height * sizeof(uchar4), 
    cudaMemcpyDeviceToDevice);
    // Wait for the kernel and the copy before handing the texture back to GL.
    cudaDeviceSynchronize();
    cudaGraphicsUnmapResources(1, &screen, 0);
    return 0;

}
// Renders one frame with CUDA and draws it as a textured quad that fills the
// viewport, then swaps the window's buffers. A message is printed each time
// the internal frame counter passes 60, after which the counter restarts.
void displayFrame() {
    static int framesSinceReport = 0;
    if (++framesSinceReport > 60) {
        framesSinceReport = 0;
        printf("60frames passed\n");
    }

    // Fill the texture behind `image` with the newly computed frame.
    renderFrame();

    // Fixed-function state: textured, unlit, no depth test, and the texel
    // colour replaces the fragment colour.
    glBindTexture(GL_TEXTURE_2D, image);
    glEnable(GL_TEXTURE_2D);
    glDisable(GL_DEPTH_TEST);
    glDisable(GL_LIGHTING);
    glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);

    // Orthographic projection spanning [-width, width] x [-height, height].
    const float xExtent = width;
    const float yExtent = height;
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    glOrtho(-xExtent, xExtent, -yExtent, yExtent, -1.0, 1.0);
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    glViewport(0, 0, width, height);

    // Texture coordinates of the quad corners, in the order they are emitted.
    // Each corner maps 0 to the negative extent and 1 to the positive extent.
    static const float cornerUV[4][2] = {
        {0.0f, 0.0f}, {1.0f, 0.0f}, {1.0f, 1.0f}, {0.0f, 1.0f}
    };
    glBegin(GL_QUADS);
    for (int corner = 0; corner < 4; ++corner) {
        const float u = cornerUV[corner][0];
        const float v = cornerUV[corner][1];
        glTexCoord2f(u, v);
        glVertex3f(u > 0.5f ? xExtent : -xExtent,
                   v > 0.5f ? yExtent : -yExtent,
                   0.5f);
    }
    glEnd();

    glDisable(GL_TEXTURE_2D);
    glfwSwapBuffers(window);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Program entry point.
//
// Creates the GLFW window and OpenGL context, creates the display texture,
// registers it with CUDA, allocates the device frame buffer and then draws
// frames until the window is asked to close. All resources are released
// before returning.
//
// Returns 0 on a normal exit and 1 if window creation or CUDA setup fails.
int main(int argc, char** argv){
    if (!glfwInit()) {
        fprintf(stderr, "glfwInit failed\n");
        return 1;
    }
    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 0);
    window = glfwCreateWindow(width,height, "Rotaru Leonard Claudiu", NULL, NULL);
    if (window == NULL) {
        fprintf(stderr, "glfwCreateWindow failed\n");
        glfwTerminate();
        return 1;
    }
    glfwMakeContextCurrent(window);

    // GLEW can only load entry points once a context is current, so it has to
    // be initialised after glfwMakeContextCurrent(). A failure is reported but
    // not fatal: every GL call in this program is an OpenGL 1.1 entry point.
    GLenum glewStatus = glewInit();
    if (glewStatus != GLEW_OK) {
        fprintf(stderr, "glewInit failed: %s\n",
                (const char*)glewGetErrorString(glewStatus));
    }

    int exitCode = 1;
    bool registered = false;
    if (cudaSucceeded(cudaSetDevice(0), "cudaSetDevice")) {
        createTexture();
        registered = cudaSucceeded(
            cudaGraphicsGLRegisterImage(&screen, image, GL_TEXTURE_2D,
                                        cudaGraphicsRegisterFlagsNone),
            "cudaGraphicsGLRegisterImage");
        if (registered &&
            cudaSucceeded(cudaMalloc(&rendered, width * height * sizeof(uchar4)),
                          "cudaMalloc")) {
            // Poll events every frame so the window stays responsive and the
            // close request is actually observed.
            while (!glfwWindowShouldClose(window)) {
                displayFrame();
                glfwPollEvents();
            }
            exitCode = 0;
        }
    }

    // Release CUDA resources while the GL context still exists.
    if (registered) cudaGraphicsUnregisterResource(screen);
    cudaFree(rendered);           // no-op when rendered is still null
    glDeleteTextures(1, &image);  // a texture name of 0 is silently ignored
    glfwDestroyWindow(window);
    glfwTerminate();
    return exitCode;
}

Here is the result of running with width 1024 and height 1024: [image: width 1024, height 1024]

Here is the result of running with width 1024 and height 512: [image: width 1024, height 512]

Solution

In the future, please provide complete code. It's not helpful when you strip off the include headers.

According to my testing, you have at least several issues.

In one of your screen shots you can see that the compiler is telling you that cudaMemcpyToArray is deprecated. Furthermore, the usage of it in this setting wouldn't be correct anyway. So we'll replace it with cudaMemcpy2DToArray. Instead of this:
```
cudaMemcpyToArray(dstArray,0,0, rendered, width * height * sizeof(uchar4), 
cudaMemcpyDeviceToDevice);
```
use this:
```
cudaMemcpy2DToArray(dstArray,0,0, rendered, width * sizeof(uchar4), width * sizeof(uchar4), height, 
cudaMemcpyDeviceToDevice);
```
Your bounds checking in your kernel is incorrect:
```
if (x > width || y > height) return;
```
it should be:
```
if (x >= width || y >= height) return;
```
This is the standard computer science off-by-one error.
Your kernel calculation for index is incorrect:
```
int index = y * height + x;
```
it should be:
```
 int index = y * width + x;
```
y is the row index (it runs along the height), so it must be multiplied by the width of each row to get a proper 1D index.

With those changes, your code appears to run correctly for me, with a height of 256 as you have shown.