OpenCL kernel produces incorrect image on GPU

I have an OpenCL 1.2 program that converts an equirectangular HDR image to a cubemap.

On my i7-9750H CPU everything works correctly, but when I run the program on my GTX 1650 GPU the result is consistently incorrect.

In case it is relevant, my GPU driver version is 536.99, and the driver version for my CPU is 7.6.0.0228.

Correct	Wrong

Here is the code:

// Sampler used to read the source image: normalized coordinates,
// out-of-range coordinates clamped to the edge, linear filtering.
__constant sampler_t srcSampler = CLK_FILTER_LINEAR |
                                  CLK_ADDRESS_CLAMP_TO_EDGE |
                                  CLK_NORMALIZED_COORDS_TRUE;

// These transforms specify the directions based on the cube face.
// They are based on
// https://www.khronos.org/opengl/wiki_opengl/images/CubeMapAxes.png
// The face order is +X, -X, +Y, -Y, +Z, -Z.
// Using the reference image, each transform is applied to the vector
// (horizontal face axis, vertical face axis, face direction): the kernel takes
// the dot product of that vector with the face's entry in xTransforms,
// yTransforms and zTransforms to get the x, y and z components of the
// sampling direction.
__constant float3 xTransforms[6] = {
    (float3)(0.0f, 0.0f, 1.0f),   // +X
    (float3)(0.0f, 0.0f, -1.0f),  // -X
    (float3)(1.0f, 0.0f, 0.0f),   // +Y
    (float3)(1.0f, 0.0f, 0.0f),   // -Y
    (float3)(1.0f, 0.0f, 0.0f),   // +Z
    (float3)(-1.0f, 0.0f, 0.0f)}; // -Z
__constant float3 yTransforms[6] = {
    (float3)(0.0f, -1.0f, 0.0f),  // +X
    (float3)(0.0f, -1.0f, 0.0f),  // -X
    (float3)(0.0f, 0.0f, 1.0f),   // +Y
    (float3)(0.0f, 0.0f, -1.0f),  // -Y
    (float3)(0.0f, -1.0f, 0.0f),  // +Z
    (float3)(0.0f, -1.0f, 0.0f)}; // -Z
__constant float3 zTransforms[6] = {
    (float3)(-1.0f, 0.0f, 0.0f),  // +X
    (float3)(1.0f, 0.0f, 0.0f),   // -X
    (float3)(0.0f, 1.0f, 0.0f),   // +Y
    (float3)(0.0f, -1.0f, 0.0f),  // -Y
    (float3)(0.0f, 0.0f, 1.0f),   // +Z
    (float3)(0.0f, 0.0f, -1.0f)}; // -Z

// Maps a direction vector to equirectangular texture coordinates.
//
// atan2pi(dir.z, dir.x) lies in [-1, 1] and is halved to [-0.5, 0.5];
// asinpi(dir.y) lies in [-0.5, 0.5]. Adding 0.5 to both components yields
// coordinates in [0, 1], suitable for a sampler with normalized coordinates.
//
// 'dir' is expected to be normalized so that dir.y is a valid asinpi argument.
float2 projectSphericalMap(float3 dir) {
  // Use float literals throughout so the arithmetic stays in single precision.
  float2 uv = (float2)(atan2pi(dir.z, dir.x) * 0.5f, asinpi(dir.y));
  uv += (float2)(0.5f, 0.5f);
  return uv;
}

// Reprojects an equirectangular source image onto the faces of a cubemap.
//
// The kernel is invoked for every pixel on every face of the cubemap:
// global ids 0 and 1 give the pixel position within a face and global id 2
// gives the face index (0..5).
// 'srcImage' is the equirectangular image, sampled through srcSampler
// 'dstImage' receives the six faces stacked vertically (size x 6*size pixels)
// 'size' is the size of a cube map face
// 'sizefac' is 2/(size-1) precomputed
__kernel void reproject_environment(__read_only image2d_t srcImage,
                                    __write_only image2d_t dstImage, int size,
                                    float sizefac) {
  int outu = get_global_id(0);
  int outv = get_global_id(1);
  int face = get_global_id(2);

  // Skip work-items that fall outside the face grid.
  // This check is probably unnecessary
  if (outu >= size || outv >= size || face >= 6) {
    return;
  }

  // Map the pixel index to the range [-1, 1]. Float literals keep the
  // arithmetic in single precision.
  float horizontal = (float)(outu)*sizefac - 1.0f;
  float vertical = (float)(outv)*sizefac - 1.0f;

  // Position on the face in face-local axes:
  // (horizontal face axis, vertical face axis, face direction)
  float3 vec = (float3)(horizontal, vertical, 1.0f);

  // Apply the per-face transforms to get the sampling direction.
  float x = dot(vec, xTransforms[face]);
  float y = dot(vec, yTransforms[face]);
  float z = dot(vec, zTransforms[face]);

  float3 dir = (float3)(x, y, z);

  float2 uv = projectSphericalMap(normalize(dir));
  float4 color = read_imagef(srcImage, srcSampler, uv);
  // the cube map faces are stacked vertically
  write_imagef(dstImage, (int2)(outu, outv + size * face), color);
}

This is my first time using OpenCL so the mistake might be very obvious.

Solution

After adding some printfs for debugging I noticed that on my GPU the float3s read from the Transform arrays were wrong.

The issue was that the compiler aligned each float3 array element to 4 floats, which resulted in a shift of one float per element when indexing the arrays. Changing the element type of the arrays from float3 to float4 fixed the problem.