Draw image from vertex buffer object generated with CUDA using OpenGL

I am using CUDA to generate an ABGR output image. The image is stored in a uchar4 array, where each element holds the color of one pixel. Conceptually the output is a 2D image, but it is allocated in CUDA as linear memory of interleaved bytes.

I know that CUDA can easily map this array to an OpenGL Vertex Buffer Object. My question is, assuming that I have the RGB value of every pixel in an image, along with the width and height of the image, how can I draw this image to screen using OpenGL?
I know that some kind of shader must be involved, but since I know very little about OpenGL, I have no idea how a shader can take the color of each pixel and map it to the correct pixel on screen.

I know I should increase my knowledge in OpenGL, but this seems like a trivial task. If there is an easy way for me to draw this image, I'd rather not spend much time learning OpenGL.

Solution

I finally figured out an easy way to do what I wanted. Unfortunately, I did not know about the existence of the sample that Robert was talking about on NVIDIA's website.

Long story short, the easiest way to draw the image was to define a Pixel Buffer Object in OpenGL, register the buffer with CUDA and pass it as an output array of uchar4 to the CUDA kernel. Here is a quick pseudo-code based on JOGL and JCUDA that shows the steps involved. Most of the code was obtained from the sample on NVIDIA's website:

1) Creating the OpenGL buffers

// Step 1: create an OpenGL buffer sized for one 4-byte-per-pixel image and
// register it with CUDA so that a kernel can write the image directly into it.
GL2 gl = drawable.getGL().getGL2();

int[] buffer = new int[1];

// Generate one buffer name and remember it in the glBuffer field.
gl.glGenBuffers(1, IntBuffer.wrap(buffer));
glBuffer = buffer[0];

// Bind the buffer as a pixel-unpack buffer. This is the same target the
// display step binds it to before glTexSubImage2D, so the buffer is created
// and used through one consistent binding point.
gl.glBindBuffer(GL2.GL_PIXEL_UNPACK_BUFFER, glBuffer);
// Reserve storage only: 4 bytes per pixel, no initial data (null pointer).
gl.glBufferData(GL2.GL_PIXEL_UNPACK_BUFFER, imageWidth * imageHeight * 4, (Buffer)null, GL2.GL_DYNAMIC_DRAW);
// Unbind so later GL calls are not accidentally sourced from this buffer.
gl.glBindBuffer(GL2.GL_PIXEL_UNPACK_BUFFER, 0);

// The bufferResource is of type CUgraphicsResource and is defined as a class field
this.bufferResource = new CUgraphicsResource();

// Register buffer in CUDA.
// NOTE(review): the flag constant comes from CUgraphicsMapResourceFlags, while
// the register call nominally takes a register-flags value; the "NONE"
// constants presumably share the value 0 — confirm against the JCuda docs.
cuGraphicsGLRegisterBuffer(bufferResource, glBuffer, CUgraphicsMapResourceFlags.CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);

2) Initialize the texture and set texture parameters

// Step 2: create the texture that the CUDA-generated image is copied into
// every frame, and allocate its storage without uploading any pixels yet.
GL2 gl = drawable.getGL().getGL2();
int[] texture = new int[1];

// Generate one texture name and remember it in the glTexture field.
gl.glGenTextures(1, IntBuffer.wrap(texture));
this.glTexture = texture[0];

gl.glBindTexture(GL2.GL_TEXTURE_2D, glTexture);

// Linear filtering for both minification and magnification. Setting the MIN
// filter explicitly also avoids the default mipmapped filter, for which a
// texture with only level 0 defined would be incomplete.
gl.glTexParameteri(GL2.GL_TEXTURE_2D, GL2.GL_TEXTURE_MIN_FILTER, GL2.GL_LINEAR);
gl.glTexParameteri(GL2.GL_TEXTURE_2D, GL2.GL_TEXTURE_MAG_FILTER, GL2.GL_LINEAR);

// Allocate level 0 as RGBA8 with no initial data (null pointer). The pixel
// format is GL_RGBA so that it matches the format used by the per-frame
// glTexSubImage2D upload; with a null data pointer no pixels are read here.
gl.glTexImage2D(GL2.GL_TEXTURE_2D, 0, GL2.GL_RGBA8, imageWidth, imageHeight, 0, GL2.GL_RGBA, GL2.GL_UNSIGNED_BYTE, (Buffer)null);

gl.glBindTexture(GL2.GL_TEXTURE_2D, 0);

3) Run the CUDA kernel and display the results in OpenGL's display loop.

// Step 3 (runs every frame): let CUDA fill the shared buffer, copy the buffer
// into the texture, then draw the texture on a quad covering the viewport.

// The method is declared as runCuda(GLAutoDrawable); call it by that exact
// name and pass the drawable it expects.
this.runCuda(drawable);

GL2 gl = drawable.getGL().getGL2();

// With a buffer bound to GL_PIXEL_UNPACK_BUFFER, glTexSubImage2D reads its
// pixels from that buffer instead of from client memory.
gl.glBindBuffer(GL2.GL_PIXEL_UNPACK_BUFFER, glBuffer);

gl.glBindTexture(GL2.GL_TEXTURE_2D, glTexture);
gl.glTexSubImage2D(GL2.GL_TEXTURE_2D, 0, 0, 0,
                imageWidth, imageHeight,
                GL2.GL_RGBA, GL2.GL_UNSIGNED_BYTE, 0); //The last argument must be ZERO! NOT NULL! :-) (it is a byte offset into the bound buffer)

// Unbind the pixel buffers so later pixel transfers use client memory again.
gl.glBindBuffer(GL2.GL_PIXEL_PACK_BUFFER, 0);
gl.glBindBuffer(GL2.GL_PIXEL_UNPACK_BUFFER, 0);

// The texture is still bound from the upload above; configure fixed-function
// state so the quad shows the texture colours unmodified.
gl.glEnable(GL2.GL_TEXTURE_2D);
gl.glDisable(GL2.GL_DEPTH_TEST);
gl.glDisable(GL2.GL_LIGHTING);
gl.glTexEnvf(GL2.GL_TEXTURE_ENV, GL2.GL_TEXTURE_ENV_MODE, GL2.GL_REPLACE);

// Orthographic projection spanning [-1, 1] on every axis; the previous
// projection matrix is saved here and restored after drawing.
gl.glMatrixMode(GL2.GL_PROJECTION);
gl.glPushMatrix();
gl.glLoadIdentity();
gl.glOrtho(-1.0, 1.0, -1.0, 1.0, -1.0, 1.0);

gl.glMatrixMode(GL2.GL_MODELVIEW);
gl.glLoadIdentity();

gl.glViewport(0, 0, imageWidth, imageHeight);

// Full-viewport quad. The texture's t coordinate is flipped relative to the
// vertex y coordinate (t = 1 at the bottom edge, t = 0 at the top edge).
gl.glBegin(GL2.GL_QUADS);
    gl.glTexCoord2f(0.0f, 1.0f);
    gl.glVertex2f(-1.0f, -1.0f);

    gl.glTexCoord2f(1.0f, 1.0f);
    gl.glVertex2f(1.0f, -1.0f);

    gl.glTexCoord2f(1.0f, 0.0f);
    gl.glVertex2f(1.0f, 1.0f);

    gl.glTexCoord2f(0.0f, 0.0f);
    gl.glVertex2f(-1.0f, 1.0f);
gl.glEnd();

// Restore the projection matrix saved above.
gl.glMatrixMode(GL2.GL_PROJECTION);
gl.glPopMatrix();

gl.glDisable(GL2.GL_TEXTURE_2D);

3.5) The CUDA call:

/**
 * Maps the registered OpenGL buffer into CUDA, launches the kernel that
 * writes into it, waits for completion and unmaps the buffer again.
 *
 * Reads the fields bufferResource, function, imageWidth, imageHeight and
 * DESC_BLOCK_SIZE, and overwrites the field devOutput.
 *
 * @param drawable not used by the code visible in this method
 */
public void runCuda(GLAutoDrawable drawable) {

    // A fresh device-pointer object is created on every call; it is filled in below.
    devOutput = new CUdeviceptr();
    // Map the OpenGL buffer to a resource and then obtain a CUDA pointer to that resource
    cuGraphicsMapResources(1, new CUgraphicsResource[]{bufferResource}, null);
    // The mapped size is written into a throw-away long[1] and not used afterwards.
    cuGraphicsResourceGetMappedPointer(devOutput, new long[1], bufferResource);

    // Setup the kernel parameters making sure that the devOutput pointer is passed to the kernel
    // (the actual parameter list is elided in this pseudo-code)
    Pointer kernelParams = 
                            .
                            .
                            .
                            .

    // Number of blocks = pixel count divided by the block size, rounded up;
    // the division is done in double so the remainder is not truncated.
    int gridSize = (int) Math.ceil(imageWidth * imageHeight / (double)DESC_BLOCK_SIZE);

    // 1-D launch: gridSize blocks of DESC_BLOCK_SIZE threads, 0 bytes of
    // dynamic shared memory, null stream, no extra launch options.
    cuLaunchKernel(function,
            gridSize, 1, 1,
            DESC_BLOCK_SIZE, 1, 1,
            0, null,
            kernelParams, null);
    // Presumably blocks until the kernel has finished, so the buffer is
    // fully written before it is handed back to OpenGL — confirm in the CUDA docs.
    cuCtxSynchronize();

    // Unmap the buffer so that it can be used in OpenGL
    cuGraphicsUnmapResources(1, new CUgraphicsResource[]{bufferResource}, null);
}

PS: I thank Robert for providing the link to the sample. I also thank the people who downvoted my question without any useful feedback!