I guess I am in some really strange border case, maybe with double precision issues and I want to know, whats going on.
Inside an OpenCL Kernel I use:
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__private int k = 2; // I need k to be an int, because I want to use as a counter
__private double s = 18;
__private double a = 1;
a = a/(double)k; // just to show, that I make in-place typecasting of k
a = k+1;
k = (int)a; //to show that I store k in a double buffer in an intermediate-step
if ((k-1)==2)
{
// k = 3;
s = pow(s/(double)(k-1),0.5);
}
This leads me to s = 2.999[...]6
However, if I uncomment the k=3
line, I get the (in my eyes) correct result s = 3
. Why is that?
As a side information: The same behaviour doesn't happen when I do
s = sqrt(s/(double)(k-1))
Below follows the full, minimal Kernel and Host code for pyopencl
Kernel (Minima.cl):
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void init_z(__global double * buffer)
{
__private int x = get_global_id(0);
__private int y = get_global_id(1);
//w,h
__private int w_y = get_global_size(1);
__private int address = x*w_y+y;
//h,w
__private double init = 3.0;
buffer[address]=init;
}
__kernel void root(__global double * buffer)
{
__private int x = get_global_id(0);
__private int y = get_global_id(1);
//w,h
__private int w_y = get_global_size(1);
__private int address = x*w_y+y;
//h,w
__private double value = 18;
__private int k;
__private double out;
k = (int) buffer[address];
//k = 3; If this line is uncommented, the result will be exact.
out = pow(value/(double)(k-1), 0.5);
buffer[address] = out;
}
Host:
import pyopencl as cl
import numpy as np
platform = cl.get_platforms()[0]
devs = platform.get_devices()
device1 = devs[1]
h_buffer = np.empty((10,10)).astype(np.float64)
mf = cl.mem_flags
ctx = cl.Context([device1])
Queue1 = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
Queue2 = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
mf = cl.mem_flags
m_dic = {0:mf.READ_ONLY,1:mf.WRITE_ONLY,2:mf.READ_WRITE}
fi = open('Minimal.cl', 'r')
fstr = "".join(fi.readlines())
prg = cl.Program(ctx, fstr).build()
knl = prg.init_z
knl.set_scalar_arg_dtypes([None,])
knl_root = prg.root
knl_root.set_scalar_arg_dtypes([None,])
def f():
d_buffer = cl.Buffer(ctx,m_dic[2], int(10 * 10 * 8))
knl.set_args(d_buffer)
knl_root.set_args(d_buffer)
a = cl.enqueue_nd_range_kernel(Queue2,knl,(10,10),None)
b = cl.enqueue_nd_range_kernel(Queue2,knl_root,(10,10),None, wait_for = [a,])
cl.enqueue_copy(Queue1,h_buffer,d_buffer,wait_for=[b,])
return h_buffer
a = f()
a[0,0] # Getting the result on the host.
Edit: Because of some more unclarities I update this question one more time. I understand, that the value of pow
and sqrt
doesn't have to be the same for the same input. My question is, why pow
shows different output for the SAME input, depending on where I get it from.
The binaries are on pastebin: k_explicit and k_read
printf("a%\n", out)
leads to 0x1.8p+1
with the k=3
line and to 0x1.7ffffffffffffp+1
when it's commented out.
So, after lots of discussion it doesn't seem to be a problem with my code. The behaviour seems to be hardware-dependent and so far only reproducable on Nvidia Hardware. I'm still interested in the "why" and "how" that happens but in this case the question is answered.