get 3d point in space using 2d point in image in python opengl

Im trying to simulate a depth camera in a room, my camera is able to move and rotate in the world and the room is simulated as a 3d cube around the (0,0,0) At the click of a button I want to sample N random points in the image and get the distance of these points from the camera (distance in the "real world"). So far I've managed to create the scene of the moving camera and the cube (Example)

I tried gluUnProject to get the 3d point

model_view = np.array(glGetDoublev(GL_MODELVIEW_MATRIX))
proj = np.array(glGetDoublev(GL_PROJECTION_MATRIX))
view = np.array(glGetDoublev(GL_VIEWPORT))

3d_point = gluUnProject(x,y, 0.0)

where x,y are the coordinates of the pixel in the image, but when I check this on pixels that I know their location (the cube corners) I get what feels like random results.

I very new to openGL so I might be missing something, math-wise all I want to do is apply the inverse of the projection and view matrix on the pixel coordinates but that doesnt work.

I attache the code for the room simulation below.

Thanks in advance.

import pygame
from pygame.locals import *
import numpy as np
import random
from OpenGL.GL import *
from OpenGL.GLU import *
display = (800, 600)
import math

def get_cube_information():

    vertices = (
        (1, -1, -1),
        (1, 1, -1),
        (-1, 1, -1),
        (-1, -1, -1),
        (1, -1, 1),
        (1, 1, 1, ),
        (-1, -1, 1),
        (-1, 1, 1),
        )

    edges = (
        (0,1),
        (0,3),
        (0,4),
        (2,1),
        (2,3),
        (2,7),
        (6,3),
        (6,4),
        (6,7),
        (5,1),
        (5,4),
        (5,7),
        )

    surfaces = (
        (0,1,2,3),
        (3,2,7,6),
        (6,7,5,4),
        (4,5,1,0),
        (1,5,7,2),
        (4,0,3,6),
        )

    colors = (
        (1.000, 0.920, 0.000),
        (0.000, 0.860, 0.000),
        (1.000, 0.480, 0.000),
        (1.000, 1.000, 1.000),
        (0.900, 0.000, 0.000),
        (0.000, 0.000, 0.950)
    )
    return vertices, edges, surfaces, colors


def Cube():
    glBegin(GL_QUADS)

    (vertices, edges, surfaces, colors) = get_cube_information()
    for i, surface in enumerate(surfaces):
        x = 0
        color = colors[i]
        for vertex in surface:
            x += 1
            glColor3fv(color)
            glVertex3fv(vertices[vertex])


    glEnd()

    glBegin(GL_LINES)
    for edge in edges:
        for vertex in edge:
            glVertex3fv(vertices[vertex])

    glEnd()


def main():
    pygame.init()
    tx = 0
    ty = 0
    tz = 0
    ry = 0
    rx = 0
    pygame.display.set_mode(display, DOUBLEBUF|OPENGL|RESIZABLE)

    glMatrixMode(GL_PROJECTION)
    gluPerspective(45, (display[0] / display[1]), 0.1, 50.0)

    view_mat = np.matrix(np.identity(4), copy=False, dtype='float32')

    glMatrixMode(GL_MODELVIEW)
    glLoadIdentity()
    glTranslatef(0, 0, 0)
    glGetFloatv(GL_MODELVIEW_MATRIX, view_mat)
    glLoadIdentity()

    while True:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                quit()
            elif event.type == pygame.KEYDOWN:
                if event.key == pygame.K_ESCAPE:
                    pygame.quit()
                    quit()
                if event.key == pygame.K_a:
                    tx = 0.05
                elif event.key == pygame.K_d:
                    tx = -0.05
                elif event.key == pygame.K_w:
                    tz = 0.05
                elif event.key == pygame.K_s:
                    tz = -0.05
                elif event.key == pygame.K_RIGHT:
                    ry = 1.0
                elif event.key == pygame.K_LEFT:
                    ry = -1.0
                elif event.key == pygame.K_UP:
                    rx = -1.0
                elif event.key == pygame.K_DOWN:
                    rx = 1.0
                elif event.key == pygame.K_SPACE:
                    continue
            elif event.type == pygame.KEYUP:
                if event.key == pygame.K_a and tx > 0:
                    tx = 0
                elif event.key == pygame.K_d and tx < 0:
                    tx = 0
                elif event.key == pygame.K_w and tz > 0:
                    tz = 0
                elif event.key == pygame.K_s and tz < 0:
                    tz = 0
                elif event.key == pygame.K_RIGHT and ry > 0:
                    ry = 0.0
                elif event.key == pygame.K_LEFT and ry < 0:
                    ry = 0.0
                elif event.key == pygame.K_DOWN and rx > 0:
                    rx = 0.0
                elif event.key == pygame.K_UP and rx < 0:
                    rx = 0.0
            elif event.type == pygame.MOUSEBUTTONDOWN:
                #here I want to sample the points and return their (x,y) in the image and their distance from the camera.
                continue

        glPushMatrix()
        glLoadIdentity()
        glTranslatef(tx, ty, tz)
        glRotatef(ry, 0, 1, 0)
        glRotatef(rx, 1, 0, 0)

        glMultMatrixf(view_mat)
        glGetFloatv(GL_MODELVIEW_MATRIX, view_mat)
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
        Cube()
        glPopMatrix()
        pygame.display.flip()
        pygame.time.wait(10)
main()

Solution

To find the world position of a point on the viewport, you have to know the depth value of the point.

The x and y screen position and the depth have be transformed to normalized device coordinates in the range [-1, 1]. For this the viewport rectangle has to be known:

ndc = [2.0* x/vp_width - 1.0, 1.0 - 2.0*y/vp_height, depth*2.0 - 1.0];

The normalized device space coordinate has to be transformed by the inverse projection matrix to the view space (Finally a perspective divide has to be performed).

With the inverse view matrix, the view space coordinate can be transformed to world space.

gluUnProject does all this for you, but you have to know the depth of the fragment. The depth of a fragment can be read by glReadPixels:

# get mouse position
x, y = pygame.mouse.get_pos()

# get the fragment depth
depth = glReadPixels(x, y, 1, 1, GL_DEPTH_COMPONENT, GL_FLOAT)

# get projection matrix, view matrix and the viewport rectangle
model_view = np.array(glGetDoublev(GL_MODELVIEW_MATRIX))
proj = np.array(glGetDoublev(GL_PROJECTION_MATRIX))
view = np.array(glGetIntegerv(GL_VIEWPORT))

# unproject the point
point = gluUnProject(x, y, depth, model_view, proj, view)
print( point )

Note, you have to enable the Depth Test otherwise the depth buffer will not be set. This also gives the benefit, that polygons at the front cover the polygons "behind" them:

glEnable(GL_DEPTH_TEST)
Cube()

Of course the projection matrix and the model view matrix have to be proper set, when the vlues are read by glGetDoublev(GL_PROJECTION_MATRIX) respectively glGetDoublev(GL_MODELVIEW_MATRIX).

This means that the reading of the view matrix should be done after setting it:

glPushMatrix()
glLoadIdentity()
glTranslatef(tx, ty, tz)
glRotatef(ry, 0, 1, 0)
glRotatef(rx, 1, 0, 0)

glMultMatrixf(view_mat)
glGetFloatv(GL_MODELVIEW_MATRIX, view_mat)

model_view = np.array(glGetDoublev(GL_MODELVIEW_MATRIX))

Note, if for the 4th parameter (model) of gluUnProject is used the identity matrix, then gluUnProject does not calculate world coordinates, but it calculates view coordinates.