My android app passes in an OpenGL texture2D to my OpenCL kernel, however the pixels values being read by my kernel are out of bounds (>255).
I create my OpenGL texture like this:
GLES20.glGenTextures ( 2, targetTex, 0 );
GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, targetTex[0]);
GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, image_width, image_height, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, null);
GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
The texture is then rendered to by binding it with a FBO:
targetFramebuffer = IntBuffer.allocate(1);
GLES20.glGenFramebuffers(1, targetFramebuffer);
GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, targetFramebuffer.get(0));
GLES20.glFramebufferTexture2D(GLES20.GL_FRAMEBUFFER, GLES20.GL_COLOR_ATTACHMENT0, GLES20.GL_TEXTURE_2D, targetTex[0], 0);
GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, 0);
I create the cl memory object like so:
mem_images[0] = clCreateFromGLTexture2D(m_clContext, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, in_tex, &err);
and this is my OpenCL kernel:
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
kernel void transfer_data(__read_only image2d_t input_image, __global float* debug) {
int2 pos;
uint4 pixel;
for (pos.y = get_global_id(1); pos.y < HEIGHT; pos.y += get_global_size(1)) {
for (pos.x = get_global_id(0); pos.x < WIDTH; pos.x += get_global_size(0)) {
pixel = read_imageui(input_image, sampler, pos);
debug[(pos.x + pos.y*WIDTH)*NUM_CHANNELS + 0] = pixel.x;
debug[(pos.x + pos.y*WIDTH)*NUM_CHANNELS + 1] = pixel.y;
debug[(pos.x + pos.y*WIDTH)*NUM_CHANNELS + 2] = pixel.z;
}
}
}
This is how I am enqueing the kernel:
local2Dsize[0] = 4;
local2Dsize[1] = 4;
global2Dsize[0] = clamp(image_width, 0, max_work_items[0]);
global2Dsize[1] = clamp(image_height, 0, max_work_items[1]);
global2Dsize[0] = ceil((float)global2Dsize[0]/(float)local2Dsize[0])*local2Dsize[0];
global2Dsize[1] = ceil((float)global2Dsize[1]/(float)local2Dsize[1])*local2Dsize[1];
twoDlocal_sizes["transfer_data"] = local2Dsize;
twoDglobal_sizes["transfer_data"] = global2Dsize;
kernels["transfer_data"] = clCreateKernel(m_program, "transfer_data", &err);
err = clSetKernelArg(kernels["transfer_data"], 0, sizeof(cl_mem), &mem_images[0]);
err |= clSetKernelArg(kernels["transfer_data"], 1, sizeof(cl_mem), &mems["image"]);
err = clEnqueueAcquireGLObjects(m_queue, 1, &mem_images[0], 0, 0, 0);
CHECK_ERROR_OCL(err, "acquiring GL objects", return false);
err = clEnqueueNDRangeKernel(m_queue, kernels["transfer_data"], 2, NULL, twoDglobal_sizes["transfer_data"], twoDlocal_sizes["transfer_data"], 0, NULL, NULL);
err = clFinish(m_queue);
err = clEnqueueReleaseGLObjects(m_queue, 1, &mem_images[0], 0, 0, 0);
CHECK_ERROR_OCL(err, "releasing GL objects", return false);
Now back on host when I print out these pixel values (from the array debug), they are way out of bounds and I don't understand why that is the case.
If you need more insight:
The overall aim of my project is to obtain camera frames in form of an OpenGL texture, process them using OpenCL and render the output back to the screen. However the texture obtained from android camera can only be bound to GL_TEXTURE_EXTERNAL_OES (http://developer.android.com/reference/android/graphics/SurfaceTexture.html), and this is not a valid texture to create an OpenCL memory object from. Therefore I am rendering the camera output to a GL_TEXTURE_2D and passing that to OpenCL.
I am sure that the pixels are being rendered to the texture correctly, because when I display the texture on the screen (without any OpenCL involved) it displays the image properly.
I did some debugging by creating a texture (as opposed to getting data from the camera) and passing that to opencl. So these are the mappings I get:
0 -> 0
1 -> 7172
2 -> 8196
3 -> 8710
4 -> 9220
5 -> 9477
6 -> 9734
7 -> 9991
8 -> 10244
9 -> 10372
10 -> 10501
11 -> 10629
12 -> 10758
13 -> 10886
14 -> 11015
15 -> 11143
16 -> 11268
17 -> 11332
18 -> 11396
19 -> 11460
20 -> 11525
21 -> 11589
22 -> 11653
23 -> 11717
24 -> 11782
25 -> 11846
26 -> 11910
27 -> 11974
28 -> 12039
29 -> 12103
30 -> 12167
31 -> 12231
32 -> 12292
33 -> 12324
34 -> 12356
35 -> 12388
36 -> 12420
37 -> 12452
38 -> 12484
39 -> 12516
40 -> 12549
41 -> 12581
42 -> 12613
43 -> 12645
44 -> 12677
45 -> 12709
46 -> 12741
47 -> 12773
48 -> 12806
49 -> 12838
50 -> 12870
51 -> 12902
52 -> 12934
53 -> 12966
54 -> 12998
55 -> 13030
56 -> 13063
57 -> 13095
58 -> 13127
59 -> 13159
60 -> 13191
61 -> 13223
62 -> 13255
63 -> 13287
64 -> 13316
65 -> 13332
66 -> 13348
67 -> 13364
68 -> 13380
69 -> 13396
70 -> 13412
71 -> 13428
72 -> 13444
73 -> 13460
74 -> 13476
75 -> 13492
76 -> 13508
77 -> 13524
78 -> 13540
79 -> 13556
80 -> 13573
81 -> 13589
82 -> 13605
83 -> 13621
84 -> 13637
85 -> 13653
86 -> 13669
87 -> 13685
88 -> 13701
89 -> 13717
90 -> 13733
91 -> 13749
92 -> 13765
93 -> 13781
94 -> 13797
95 -> 13813
96 -> 13830
97 -> 13846
98 -> 13862
99 -> 13878
100 -> 13894
101 -> 13910
102 -> 13926
103 -> 13942
104 -> 13958
105 -> 13974
106 -> 13990
107 -> 14006
108 -> 14022
109 -> 14038
110 -> 14054
111 -> 14070
112 -> 14087
113 -> 14103
114 -> 14119
115 -> 14135
116 -> 14151
117 -> 14167
118 -> 14183
119 -> 14199
120 -> 14215
121 -> 14231
122 -> 14247
123 -> 14263
124 -> 14279
125 -> 14295
126 -> 14311
127 -> 14327
128 -> 14340
129 -> 14348
130 -> 14356
131 -> 14364
132 -> 14372
133 -> 14380
134 -> 14388
135 -> 14396
136 -> 14404
137 -> 14412
138 -> 14420
139 -> 14428
140 -> 14436
141 -> 14444
142 -> 14452
143 -> 14460
144 -> 14468
145 -> 14476
146 -> 14484
147 -> 14492
148 -> 14500
149 -> 14508
150 -> 14516
151 -> 14524
152 -> 14532
153 -> 14540
154 -> 14548
155 -> 14556
156 -> 14564
157 -> 14572
158 -> 14580
159 -> 14588
160 -> 14597
161 -> 14605
162 -> 14613
163 -> 14621
164 -> 14629
165 -> 14637
166 -> 14645
167 -> 14653
168 -> 14661
169 -> 14669
170 -> 14677
171 -> 14685
172 -> 14693
173 -> 14701
174 -> 14709
175 -> 14717
176 -> 14725
177 -> 14733
178 -> 14741
179 -> 14749
180 -> 14757
181 -> 14765
182 -> 14773
183 -> 14781
184 -> 14789
185 -> 14797
186 -> 14805
187 -> 14813
188 -> 14821
189 -> 14829
190 -> 14837
191 -> 14845
192 -> 14854
193 -> 14862
194 -> 14870
195 -> 14878
196 -> 14886
197 -> 14894
198 -> 14902
199 -> 14910
200 -> 14918
201 -> 14926
202 -> 14934
203 -> 14942
204 -> 14950
205 -> 14958
206 -> 14966
207 -> 14974
208 -> 14982
209 -> 14990
210 -> 14998
211 -> 15006
212 -> 15014
213 -> 15022
214 -> 15030
215 -> 15038
216 -> 15046
217 -> 15054
218 -> 15062
219 -> 15070
220 -> 15078
221 -> 15086
222 -> 15094
223 -> 15102
224 -> 15111
225 -> 15119
226 -> 15127
227 -> 15135
228 -> 15143
229 -> 15151
230 -> 15159
231 -> 15167
232 -> 15175
233 -> 15183
234 -> 15191
235 -> 15199
236 -> 15207
237 -> 15215
238 -> 15223
239 -> 15231
240 -> 15239
241 -> 15247
242 -> 15255
243 -> 15263
244 -> 15271
245 -> 15279
246 -> 15287
247 -> 15295
248 -> 15303
249 -> 15311
250 -> 15319
251 -> 15327
252 -> 15335
253 -> 15343
254 -> 15351
255 -> 15359
On the left is the colour value that I input in the OpenGL texture and on the left is the corresponding value I get when I read the values in OpenCL.
Okay so I think it's a bug in the OpenCL implementation.
With a bit of help from Wolfram Alpha, I created a function to reverse the above mappings and obtain values in the range of 0 and 255.
float GL_to_CL(uint val) {
if (val >= 14340) return round(0.1245790*val - 1658.44); //>=128
if (val >= 13316) return round(0.0622869*val - 765.408); //>=64
if (val >= 12292) return round(0.0311424*val - 350.800); //>=32
if (val >= 11268) return round(0.0155702*val - 159.443); //>=16
float v = (float) val;
return round(0.0000000000000125922*pow(v,4.f) - 0.00000000026729*pow(v,3.f) + 0.00000198135*pow(v,2.f) - 0.00496681*v - 0.0000808829);
}
So the GL_to_CL() is a combination of 4 linear functions and a Quartic function. I tried creating just a single function using polynomial interpolation, however the degree of the polynomial was too large and therefore more computationally expensive to solve than the combination of the 5 functions proposed above.
Another alternate is to use a 15k sized array to achieve constant time, however that would require me uploading roughly 15k Bytes to GPU's global memory. Considering I am using the kernels to do image processing, I would be slightly pushing it. Also accessing from global memory in OpenCL is often more expensive than performing some simple calculations.