Tags: python, transformation, blender, depth

Deprojecting depth onto original mesh


I am trying to have Blender render a depth map of an object and then move the resulting point cloud back so that it overlays the original object. Currently I have no issue with rendering the depth and extracting the point cloud from it.

However, I am stuck when trying to move the point cloud back into the object's original position.

I'm trying to apply the inverse of the camera's world matrix to the rendered point cloud (in blue). Unfortunately, when I apply said inverse, the result appears nowhere near where I'd expect (in red).

I have attached the complete code needed to reproduce this behaviour. I would appreciate it if someone could point me to the right matrix to multiply the point cloud by.

[Screenshot: the deprojected point cloud (blue) and the point cloud after applying the camera inverse (red), neither overlaying the original mesh]

from mathutils import Vector, Quaternion, Euler, Matrix
import numpy as np
import bpy


def main_script():
    clear_scene()
    prepare_views()

    tmp_path = "/tmp/tmp_render.exr"
    scene = get_scene("Scene")
    camera = create_camera("Camera")
    camera.rotation_euler = Euler([np.pi * 0.5, 0, np.pi * 0.5], "XYZ")
    camera.location = Vector([4.5, 0, 1])

    bpy.ops.mesh.primitive_monkey_add(
        location=(0, 0, 1), rotation=(0, 0, np.pi*0.5), size=1.0)

    _w, _h = 640, 480

    update_scene()
    init_rendering(scene, camera, width=_w, height=_h)
    update_scene()

    matrix_K = get_calibration_matrix_K_from_blender(scene, camera.data)
    _fy, _fx = matrix_K[0][0], matrix_K[1][1]
    _cy, _cx = matrix_K[0][2], matrix_K[1][2]

    scene.render.filepath = tmp_path
    bpy.ops.render.render(write_still=True)
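    # the compositor links the Depth pass to the Composite output,
    # so the saved EXR's R channel holds the per-pixel depth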
    depth = read_exr(tmp_path, "R")["R"]
    depth = np.reshape(convert_to_numpy(depth), [_h, _w])
    exr_cloud = depth_to_cloud(
        _w, _h, _fx, _fy, _cx, _cy, depth)
    exr_cloud = np.reshape(exr_cloud, [-1, 3])
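    # keep only points with valid depth: background pixels that hit no
    # geometry are written with a very large depth value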
    exr_cloud = exr_cloud[(exr_cloud[..., 2] < 100) & (exr_cloud[..., 2] > 0)]

    matrix = np.reshape(camera.matrix_world, [4, 4])
    matrix = np.linalg.inv(matrix) # why doesn't this place the depth properly

    vertices = np.ones([exr_cloud.shape[0], 4], dtype=np.float32)
    vertices[:, 0:3] = exr_cloud
    vertices = np.array(
        [matrix @ vertex for vertex in vertices], dtype=np.float32)
    vertices = vertices[..., :3]

    create_mesh("Suzanne_EXR", exr_cloud, [])
    create_mesh("SuzanneT_EXR", vertices, [])

"""
    utilities methods required to run the script
"""

def clear_scene():
    for scene in bpy.data.scenes:
        for obj in scene.objects:
            bpy.context.collection.objects.unlink(obj)

def read_exr(path, channels):
    import OpenEXR as _OpenEXR
    import Imath as _Imath

    file = _OpenEXR.InputFile(path)

    FLOAT = _Imath.PixelType(_Imath.PixelType.FLOAT)
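    # read each requested channel as raw 32-bit float data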

    results = {}
    for ch in channels:
        results[ch] = file.channel(ch, FLOAT)

    file.close()

    return results


def convert_to_numpy(data):
    import array as _array
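    # OpenEXR channels come back as raw bytes; unpack them as 32-bit floats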
    return np.array(_array.array("f", data).tolist())


def update_scene():
    dg = bpy.context.evaluated_depsgraph_get()
    dg.update()


def prepare_views():
    preferences = bpy.context.preferences

    preferences.view.show_tooltips_python = True
    preferences.view.show_developer_ui = True
    preferences.view.render_display_type = "NONE"


def init_rendering(scene, camera, width=640, height=480):
    def set_rendering_settings(camera, scene, width=640, height=480):
        image_settings = scene.render.image_settings
        image_settings.file_format = "OPEN_EXR"
        image_settings.use_zbuffer = True

        scene.render.resolution_x = width
        scene.render.resolution_y = height
        # scene.render.use_antialiasing = False

    scene.use_nodes = True
    scene.camera = camera
    node_tree = scene.node_tree
    nodes = node_tree.nodes

    node_render_layers = nodes["Render Layers"]
    node_composite = nodes["Composite"]

    node_tree.links.clear()
    node_tree.links.new(
        node_render_layers.outputs["Depth"], node_composite.inputs["Image"])

    set_rendering_settings(camera, scene, width=width, height=height)


def get_scene(name): return bpy.data.scenes[name]


def create_camera(name):
    camera = bpy.data.cameras.new(name)
    camera.lens = 50

    obj = bpy.data.objects.new(name, camera)
    bpy.context.collection.objects.link(obj)

    return obj

# ---------------------------------------------------------------
# Intrinsic camera matrix K from Blender camera data
# ---------------------------------------------------------------

# Build intrinsic camera parameters from Blender camera data
#
# See notes on this in
# blender.stackexchange.com/questions/15102/what-is-blenders-camera-projection-matrix-model


def get_calibration_matrix_K_from_blender(scene, camera):
    from mathutils import Matrix as _Matrix
    f_in_mm = camera.lens
    resolution_x_in_px = scene.render.resolution_x
    resolution_y_in_px = scene.render.resolution_y
    scale = scene.render.resolution_percentage / 100
    sensor_width_in_mm = camera.sensor_width
    sensor_height_in_mm = camera.sensor_height
    pixel_aspect_ratio = scene.render.pixel_aspect_x / scene.render.pixel_aspect_y
    if camera.sensor_fit == 'VERTICAL':
        # the sensor height is fixed (sensor fit is vertical),
        # the sensor width is effectively changed with the pixel aspect ratio
        s_u = resolution_x_in_px * scale / sensor_width_in_mm / pixel_aspect_ratio
        s_v = resolution_y_in_px * scale / sensor_height_in_mm
    else:  # 'HORIZONTAL' and 'AUTO'
        # the sensor width is fixed (sensor fit is horizontal),
        # the sensor height is effectively changed with the pixel aspect ratio
        s_u = resolution_x_in_px * scale / sensor_width_in_mm
        s_v = resolution_y_in_px * scale * pixel_aspect_ratio / sensor_height_in_mm

    # Parameters of intrinsic calibration matrix K
    alpha_u = f_in_mm * s_u
    alpha_v = f_in_mm * s_v
    u_0 = resolution_x_in_px * scale / 2
    v_0 = resolution_y_in_px * scale / 2
    skew = 0  # only use rectangular pixels

    K = _Matrix(
        ((alpha_u, skew,    u_0),
         (0, alpha_v, v_0),
         (0, 0,        1)))
    return K


def create_mesh(name, vertices, faces):
    import bmesh as _bmesh
    mesh = bpy.data.meshes.new("Mesh_%s" % name)
    mesh.from_pydata(vertices, [], faces)
    mesh.update()

    obj = bpy.data.objects.new(name, mesh)

    bpy.context.collection.objects.link(obj)

    bm = _bmesh.new()
    bm.from_mesh(mesh)

    bm.to_mesh(mesh)
    bm.free()

    return obj


def depth_to_cloud(w, h, fx, fy, cx, cy, depth):
    from numpy import concatenate as _concat
    from numpy import indices as _indices
    from numpy import newaxis as _newaxis

    indices = _indices(depth.shape)

    indices_y, indices_x = indices
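    # standard pinhole back-projection: X = (u - cx) * Z / fx, Y = (v - cy) * Z / fy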

    ys, xs, zs = \
        (indices_y - cy) * depth / fy, \
        (indices_x - cx) * depth / fx, \
        depth

    points = _concat([xs[..., _newaxis], ys[..., _newaxis],
                      zs[..., _newaxis]], axis=2)

    return points


if __name__ == "__main__":
    main_script()

Solution

  • The problem was compound. First, I needed to change the transformed-vertex calculation from using the inverse camera world matrix to using the camera world matrix combined with a negative scale, like so

        matrix_cam = np.reshape(camera.matrix_world, [4, 4])
        mat_scale = np.array(Matrix.Scale(-1, 4))
    
        matrix = matrix_cam @ mat_scale
    
        vertices = np.ones([exr_cloud.shape[0], 4], dtype=np.float32)
        vertices[:, 0:3] = exr_cloud
        vertices = np.array(
            [matrix @ vertex for vertex in vertices], dtype=np.float32)
        vertices = vertices[..., :3]
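
    For reference, Matrix.Scale(-1, 4) builds a uniform negative scale over the three spatial axes while leaving the homogeneous component at 1, i.e. diag(-1, -1, -1, 1). A minimal sketch of the same transform in plain NumPy, which also avoids the per-vertex Python loop (assuming camera and exr_cloud from the script above):

        # equivalent of Matrix.Scale(-1, 4): diag(-1, -1, -1, 1)
        mat_scale = np.diag([-1.0, -1.0, -1.0, 1.0])
        matrix = np.reshape(camera.matrix_world, [4, 4]) @ mat_scale

        # transform all points at once instead of looping in Python
        ones = np.ones([exr_cloud.shape[0], 1], dtype=np.float32)
        vertices = (matrix @ np.concatenate([exr_cloud, ones], axis=1).T).T[:, :3]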
    

    Additionally, there was an issue with the depth decoding which deformed the point cloud. It was fixed like so

        ys, xs, zs = \
            (indices_y - cx) * depth / fx, \
            (indices_x - cy) * depth / fy, \
            depth
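
    This second fix works because of how the intrinsics are unpacked in main_script: matrix_K[0][0] (the x focal length) is assigned to _fy and matrix_K[0][2] (the x principal point) to _cy, so the swap inside depth_to_cloud undoes that earlier swap. An equivalent fix would be to unpack the intrinsics in the conventional order and keep depth_to_cloud in its textbook form:

        _fx, _fy = matrix_K[0][0], matrix_K[1][1]
        _cx, _cy = matrix_K[0][2], matrix_K[1][2]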