Search code examples
opengl graphics glsl gpgpu raster

Fast way to rasterize a grid of points/pixels


I want to fill the screen with a grid of points. My desired performance would be about the same speed as drawing that many pixels as a contiguous quad (or an equivalent triangle clipped with glViewport). Using GL_POINTS primitives (positioned via gl_VertexID, not attribs) or glPolygonStipple are possibilities, but both are still a little slower. Here's an example of what I want (though the black points drawn may be even more sparse):

enter image description here

Are there any other methods to draw this grid?
(in a similar time to a smaller quad of the same number of pixels)

Wouldn't it be great if the rasterizer was programmable!

The main point of this is to be able to write to both stencil and colour buffers in this grid pattern from a fragment shader.


EDIT

Some rendering times:

Full screen for me is 1680x1050, GTX670. Times are calculated drawing 10,000 times each frame, no depth test. I draw a quad with a big triangle and clip using glViewport.

  • Rendering a full screen quad and calling discard for coord%4>0: 0.112ms
  • Rendering a full screen quad, assigning const colour: 0.059ms
  • Rendering with glPolygonStipple creating %4 pattern: 0.009ms
  • Rendering quarter full screen quad: 0.003ms
  • Rendering a 1x1 quad: 0.002ms (binding VBO and shader, could probably be optimized)

The differences get larger with a more sparse grid, for example %16.


EDIT

OK, I've thrown together a small example. Requires glut and glew libraries:

#include <GL/glew.h>
#include <GL/gl.h>
#include <GL/glut.h>
#include <memory.h>
#include <assert.h>
#include <stdio.h>

//Benchmark configuration.
//Size in pixels of the offscreen colour texture and of the "full screen" viewport.
#define RESOLUTION_X 1680
#define RESOLUTION_Y 1050
//Non-zero allocates the colour texture as GL_RGBA32F instead of GL_RGBA;
//also selects the "rgba32"/"rgba8" label in the printed results.
#define USE_32_BIT 0
#define TEST_LOOP 1000 //number of quads to draw per frame
#define WARMUP_MS 1000 //time between switching methods
#define TEST_MS 4000 //time to benchmark for
//Number of benchmark methods; sizes the result, colour and name tables.
#define TESTS 6
//Non-zero draws one bar per finished/running test to the window each frame.
#define DRAW_GRAPH 1
//Per-draw time (in ms) that maps to a bar reaching the top of the window.
#define SCALE_MS 0.2f //for drawing the graph


//GL object names created in main(): the offscreen framebuffer and its colour
//texture, the vertex buffer holding triangleVerts, and the three linked
//programs (plain, gl_VertexID points, discard).
GLuint fbo, colourTex, vbo, shader, shaderPoints, shaderDiscard;
//Window width/height as last reported to reshape(); used for the graph pass.
int viewport[2];
//Index of the method currently being benchmarked; reaches TESTS when done.
int test = 0;
//Accumulated frame time per test, in glutGet(GLUT_ELAPSED_TIME) units.
int results_time[TESTS];
//Number of frames accumulated into results_time for each test.
int results_frames[TESTS];

//RGB colour of each test's bar in the results graph, indexed by test number.
float colours[TESTS][3] = {
    {1,0,0},
    {1,1,0},
    {1,0,1},
    {0,1,0},
    {0,1,1},
    {0,0,1},
    };

//Label printed with each test's result, indexed by test number; the order
//matches the cases of the switch in display().
const char* names[TESTS] = {
    "full",
    "full discard",
    "full stipple",
    "draw points",
    "quarter",
    "one"
    };

//Three xyz vertices (-1,-1,0), (-1,4,0), (4,-1,0): a single triangle that
//extends past the [-1,1] clip square so that it covers the whole viewport.
float triangleVerts[9] = {-1,-1,0,-1,4,0,4,-1,0};

//Pass-through vertex shader: forwards attribute "v" as the clip-space position.
const char* vertexShaderSrc = "#version 150\nin vec4 v;\nvoid main() {gl_Position = v;}\n";
//Attribute-less vertex shader: derives a grid cell from gl_VertexID with
//s.x/4 cells per row and places the point at pixel (4*cell + 0.5) of an
//s-sized target, where uniform "s" is the target size in pixels.
const char* vertexShaderPointsSrc = "#version 150\nuniform ivec2 s;\nvoid main() {ivec2 p = ivec2(gl_VertexID%(s.x/4),gl_VertexID/(s.x/4)); gl_Position = vec4(2.0*(p*4+0.5)/s-1.0, 0, 1);}\n";
//Fragment shader writing constant opaque red.
const char* fragmentShaderSrc = "#version 150\nout vec4 c;\nvoid main() {c = vec4(1,0,0,1);}\n";
//Fragment shader that discards every fragment whose integer x or y window
//coordinate is not a multiple of 4, and writes opaque red otherwise.
const char* fragmentShaderDiscardSrc = "#version 150\nout vec4 c;\nvoid main() {if (int(gl_FragCoord.x)%4>0||int(gl_FragCoord.y)%4>0) discard; c = vec4(1,0,0,1);}\n";

//Prepares GL state for drawing the big triangle with `program`.
//
//  program: linked program expected to declare the vertex attribute "v"
//  x, y:    width and height in pixels of the viewport, anchored at (0,0)
//
//Binds the program and the triangle VBO, sets the viewport, then enables
//attribute "v" and points it at tightly packed 3-float positions in the VBO.
void setupDraw(GLuint program, int x, int y)
{
    glUseProgram(program);
    glViewport(0, 0, x, y);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);

    //glGetAttribLocation returns a signed GLint and reports a missing or
    //inactive attribute as -1. Keep it signed so that -1 is caught here
    //instead of being converted to a huge unsigned index and handed to GL.
    GLint loc = glGetAttribLocation(program, "v");
    assert(loc >= 0);
    if (loc < 0)
        return;

    glEnableVertexAttribArray((GLuint)loc);
    glVertexAttribPointer((GLuint)loc, 3, GL_FLOAT, GL_FALSE, 0, 0);
}

//Uploads a polygon stipple mask that keeps one pixel every `x` columns and
//every `y` rows, starting at pixel (0,0) of the 32x32 stipple tile.
//
//  x, y: horizontal and vertical spacing in pixels. Values below 1 are
//        treated as 1 (every pixel kept) so the loops always terminate.
//        The tile repeats every 32 pixels, so the grid is only seamless
//        when x and y divide 32.
//
//The mask is one bit per pixel: 32 rows of 4 bytes. With the default
//GL_UNPACK_LSB_FIRST = GL_FALSE, GL reads each byte most-significant bit
//first, so pixel i of a row lives in bit (7 - i%8) of byte i/8.
//NOTE(review): assumes the remaining unpack state (LSB_FIRST, ROW_LENGTH,
//SKIP_*) is at its defaults; only GL_UNPACK_ALIGNMENT is set here.
void polygonStippleGrid(int x, int y)
{
    unsigned char tilePattern[32 * 32 / 8];
    memset(tilePattern, 0, sizeof(tilePattern));

    if (x < 1) x = 1;
    if (y < 1) y = 1;

    for (int j = 0; j < 32; j += y)
    {
        for (int i = 0; i < 32; i += x)
        {
            int index = (j * 32 + i);
            tilePattern[index / 8] |= (unsigned char)(0x80u >> (index % 8));
        }
    }
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
    glPolygonStipple(tilePattern);
}

//GLUT display callback; runs once per frame and re-posts itself.
//
//Each call:
//  1. measures the time since the previous call and advances the benchmark
//     state (warm-up countdown, then timed run, then next test),
//  2. optionally draws one bar per test with results to the window,
//  3. issues TEST_LOOP draw calls for the active test into the offscreen FBO,
//  4. waits for the GPU with glFinish so the frame time covers the GPU work.
void display()
{
    //frame delta in glutGet(GLUT_ELAPSED_TIME) units; zero on the first call
    static int lastTime = -1;
    int elapsed = glutGet(GLUT_ELAPSED_TIME);
    if (lastTime == -1) lastTime = elapsed;
    int dt = elapsed - lastTime;
    lastTime = elapsed;

    //the very first warm-up is 2000 longer than the ones between tests
    static int warmup = WARMUP_MS + 2000;
    static int running = TEST_MS;
    warmup -= dt;
    if (warmup <= 0 && test < TESTS)
    {
        running -= dt;
        results_time[test] += dt;
        results_frames[test] += 1;
        if (running <= 0)
        {
            //average time per draw call = total time / (frames * draws per frame)
            printf("%s %s %.6fms\n", names[test], USE_32_BIT?"rgba32":"rgba8", results_time[test]/(float)(results_frames[test] * TEST_LOOP));
            test += 1;
            warmup = WARMUP_MS;
            running = TEST_MS;
        }
    }

    #if DRAW_GRAPH
    //bar graph goes to the window framebuffer, using the window-sized viewport
    glBindFramebuffer(GL_FRAMEBUFFER, 0);
    glViewport(0, 0, viewport[0], viewport[1]);
    glClear(GL_COLOR_BUFFER_BIT);

    float s = 2.0f / TESTS; //bar width in clip space
    glBegin(GL_QUADS);
    for (int i = 0; i < TESTS; ++i)
    {
        if (!results_frames[i]) continue; //no samples yet, and avoids 0/0
        glColor3fv(colours[i]);
        float x = -1.0f + 2.0f * i / (float)TESTS;
        float y = -1.0f + 2.0f * (results_time[i]/(float)(results_frames[i] * TEST_LOOP)) / SCALE_MS;
        glVertex2f(x, -1.0f); glVertex2f(x, y); glVertex2f(x + s, y); glVertex2f(x + s, -1.0f);
    }
    glEnd();
    #endif

    //all benchmark drawing targets the offscreen colour texture
    glBindFramebuffer(GL_FRAMEBUFFER, fbo);

    switch (test)
    {
    case 0: //straight full screen quad
        setupDraw(shader, RESOLUTION_X, RESOLUTION_Y);
        for (int i = 0; i < TEST_LOOP; ++i)
            glDrawArrays(GL_TRIANGLES, 0, 3);
        break;
    case 1: //full screen quad, discarding pixels in the frag shader
        setupDraw(shaderDiscard, RESOLUTION_X, RESOLUTION_Y);
        for (int i = 0; i < TEST_LOOP; ++i)
            glDrawArrays(GL_TRIANGLES, 0, 3);
        break;
    case 2: //using polygon stipple to mask out fragments
        polygonStippleGrid(4, 4);
        glEnable(GL_POLYGON_STIPPLE);
        setupDraw(shader, RESOLUTION_X, RESOLUTION_Y);
        for (int i = 0; i < TEST_LOOP; ++i)
            glDrawArrays(GL_TRIANGLES, 0, 3);
        glDisable(GL_POLYGON_STIPPLE);
        break;
    case 3: //drawing points, but computing the position in the vertex shader
        glUseProgram(shaderPoints);
        //This path does not go through setupDraw, so set the viewport here.
        //Otherwise it inherits the window-sized viewport from the graph pass
        //and the points are not placed on the RESOLUTION_X x RESOLUTION_Y
        //grid that the vertex shader computes from uniform "s".
        glViewport(0, 0, RESOLUTION_X, RESOLUTION_Y);
        glUniform2i(glGetUniformLocation(shaderPoints, "s"), RESOLUTION_X, RESOLUTION_Y);
        for (int i = 0; i < TEST_LOOP; ++i)
            glDrawArrays(GL_POINTS, 0, (RESOLUTION_X/4)*(RESOLUTION_Y/4));
        break;
    case 4: //a quad one quarter of the screen (as a speed comparison)
        setupDraw(shader, RESOLUTION_X / 4, RESOLUTION_Y / 4);
        for (int i = 0; i < TEST_LOOP; ++i)
            glDrawArrays(GL_TRIANGLES, 0, 3);
        break;
    case 5: //a 1x1 quad (as a speed comparison)
        setupDraw(shader,1, 1);
        for (int i = 0; i < TEST_LOOP; ++i)
            glDrawArrays(GL_TRIANGLES, 0, 3);
        break;
    default: break; //all tests finished: keep presenting the graph only
    }
    //back to fixed-function state so the next frame's graph pass is unaffected
    glUseProgram(0);
    glDisableVertexAttribArray(0); //HACK: assumes location is always zero
    //printf("%i %i %i\n", test, warmup, running);

    //block until the GPU is done so the next frame's dt includes this work
    glFinish();
    glutSwapBuffers();
    glutPostRedisplay();

    assert(glGetError() == GL_NO_ERROR);
}

//GLUT reshape callback: remembers the new window size (width x, height y)
//in the global viewport array for later use by the graph pass.
void reshape(int x, int y)
{
    int *size = viewport;

    size[0] = x;
    size[1] = y;
}

int main(int argc, char **argv)
{
    memset(results_time, 0, sizeof(results_time));
    memset(results_frames, 0, sizeof(results_frames));

    //init glut
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
    glutCreateWindow("quadtest");
    glutReshapeFunc(reshape);
    glutDisplayFunc(display);

    glewInit();

    //init gl stuff
    glGenTextures(1, &colourTex);
    glBindTexture(GL_TEXTURE_2D, colourTex);
    #if USE_32_BIT
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, RESOLUTION_X, RESOLUTION_Y, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
    #else
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, RESOLUTION_X, RESOLUTION_Y, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
    #endif

    /*
    GLuint stencilRB;
    glGenRenderbuffers(1, &stencilRB);
    glBindRenderbuffer(GL_RENDERBUFFER, stencilRB);
    glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH_STENCIL, RESOLUTION_X, RESOLUTION_Y);
    */

    glGenFramebuffers(1, &fbo);
    glBindFramebuffer(GL_FRAMEBUFFER, fbo);
    glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, colourTex, 0);
    //glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_RENDERBUFFER, stencilRB);
    assert(glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE);

    glGenBuffers(1, &vbo);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glBufferData(GL_ARRAY_BUFFER, sizeof(triangleVerts), triangleVerts, GL_STATIC_DRAW);

    GLuint v = glCreateShader(GL_VERTEX_SHADER);
    GLuint vp = glCreateShader(GL_VERTEX_SHADER);
    GLuint f = glCreateShader(GL_FRAGMENT_SHADER);
    GLuint fd = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(v, 1, &vertexShaderSrc, NULL);
    glShaderSource(vp, 1, &vertexShaderPointsSrc, NULL);
    glShaderSource(f, 1, &fragmentShaderSrc, NULL);
    glShaderSource(fd, 1, &fragmentShaderDiscardSrc, NULL);

    GLint ok = GL_TRUE;
    shader = glCreateProgram();
    glAttachShader(shader, v);
    glAttachShader(shader, f);
    glLinkProgram(shader);
    glGetProgramiv(shader, GL_LINK_STATUS, &ok);
    assert(ok == GL_TRUE);

    /*
    char log[512];
    int n;
    glGetShaderInfoLog(v, 512, &n, log);
    printf("%s\n", log);
    glGetProgramInfoLog(shader, 512, &n, log);
    printf("%s\n", log);
    */

    shaderPoints = glCreateProgram();
    glAttachShader(shaderPoints, vp);
    glAttachShader(shaderPoints, f);
    glLinkProgram(shaderPoints);
    glGetProgramiv(shaderPoints, GL_LINK_STATUS, &ok);
    assert(ok == GL_TRUE);

    shaderDiscard = glCreateProgram();
    glAttachShader(shaderDiscard, v);
    glAttachShader(shaderDiscard, fd);
    glLinkProgram(shaderDiscard);
    glGetProgramiv(shaderDiscard, GL_LINK_STATUS, &ok);
    assert(ok == GL_TRUE);

    glDisable(GL_DEPTH_TEST);

    assert(glGetError() == GL_NO_ERROR);

    glutMainLoop();
    return 0;
}

Interestingly, using GL_RGBA32F (32-bit colour) impacts performance a fair bit, and it also brings the cost of the discard method to approximately the same as a plain full screen quad. The glPolygonStipple method gives dramatic improvements in this case, more so than with 8-bit colour. There is also a discrepancy with the previous glPolygonStipple result; I can reproduce both and haven't narrowed down the difference yet.

output for GL_RGBA:

full rgba8 0.059ms
full discard rgba8 0.112ms
full stipple rgba8 0.050ms
draw points rgba8 0.079ms
quarter rgba8 0.004ms
one rgba8 <0.001ms

output for GL_RGBA32F:

full rgba32 0.240ms
full discard rgba32 0.241ms
full stipple rgba32 0.101ms
draw points rgba32 0.091ms
quarter rgba32 0.015ms
one rgba32 <0.001ms

Drawing points and positioning from gl_VertexID will beat glPolygonStipple for GL_RGBA32F. I'd assume this trend would carry on for more expensive shaders (or at least memory-intensive).


Solution

  • The scattered memory writes of a sparse grid may simply mean more overhead that can't be avoided.

    1. Draw GL_POINTs
    2. Use glPolygonStipple
    3. Initialize the stencil buffer with the pattern, for masking a full screen quad

    Whatever you do, do not use the discard method if the fragment shader is expensive[1]. This is really stupid because you clog the pipeline with many threads which don't do anything.

    [1] Either takes a long time to execute or uses lots of registers or local memory