After reading quite a bit about fixed-point arithmetic, I think I can say I've understood the basics. Unfortunately, I don't yet know how to convert routines that use sin/cos/sqrt or any other floating-point function.
Consider this simple mcve:
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <ctime>
#include <fstream>
#include <iostream>
// Short aliases for the basic arithmetic types: S = signed, U = unsigned, F = floating point.
// NOTE(review): the bit widths in the names assume char/short/int are 8/16/32 bits,
// and S8 is plain `char`, whose signedness is implementation-defined — confirm for the target.
typedef char S8;
typedef short S16;
typedef int S32;
typedef unsigned char U8;
typedef unsigned short U16;
typedef unsigned int U32;
typedef float F32;
typedef double F64;
// -------- Fixed point helpers QM.N(32bits) --------
// A fixed-point number stored in an S32; the real value is the stored integer divided by M.
typedef S32 FP32;
// log2(LUT_SIZE). Not referenced by the code visible in this file.
#define LUT_SIZE_BITS 9 // 0xffffffff>>(32-9)=511; 32-23=9; 2^9=512
// Number of entries in cos_table and sin_table.
#define LUT_SIZE 512
#define FRACT_BITS 28 // Number fractional bits
#define M (1 << FRACT_BITS) // Scaling factor
// Converts a fixed-point value to float by dividing out the scaling factor M.
inline F32 Q2F(FP32 X) {
    const F32 scale = (F32)(M);
    return (F32)X / scale;
}
// Converts a float to fixed point: scales by M, then truncates to an integer.
inline FP32 F2Q(F32 X) {
    const F32 scaled = X * (M);
    return (FP32)scaled;
}
// Pi as a float. Both spellings hold the same value: init_luts uses PI, fx1_plasma uses pi.
const F32 PI = 3.141592653589793f;
const F32 pi = 3.141592653589793f;
// Dimensions, in pixels, of the image allocated in main.
const U32 WIDTH = 256;
const U32 HEIGHT = 256;
// Fixed-point cosine and sine tables, filled by init_luts().
FP32 cos_table[LUT_SIZE];
FP32 sin_table[LUT_SIZE];
// Fills cos_table and sin_table with one full period of cosine/sine in fixed point.
// Entry i holds the value at an angle of i * 360 / LUT_SIZE degrees.
void init_luts() {
    const F32 deg_to_rad = PI / 180.f;
    // Divide in floating point: `1 / LUT_SIZE` is an integer division that
    // evaluates to 0, which would make every entry the value at angle 0.
    const F32 sample_to_deg = 360.f / LUT_SIZE;
    for (S32 i = 0; i < LUT_SIZE; i++) {
        F32 rad = ((F32)i * sample_to_deg) * deg_to_rad;
        F32 c = cos(rad);
        F32 s = sin(rad);
        cos_table[i] = F2Q(c);
        sin_table[i] = F2Q(s);
    }
}
// -------- Image processing --------
// Saturates a float to the range 0..255 and truncates it to a byte.
U8 clamp(F32 valor) {
    if (valor > 255) {
        return 255;
    }
    if (valor < 0) {
        return 0;
    }
    return (U8)valor;
}
struct Pbits {
U32 width;
U32 height;
U8 *data;
Pbits(U32 width, U32 height, U8 *data) {
this->width = width;
this->height = height;
this->data = data;
}
Pbits(Pbits *src) {
this->width = src->width;
this->height = src->height;
this->data = new U8[src->width * src->height * 3];
memcpy(this->data, src->data, width * height * 3);
}
~Pbits() { delete this->data; }
void to_bgr() {
U8 r, g, b;
for (S32 y = 0; y < height; y++) {
for (S32 x = 0; x < width; x++) {
get_pixel(y, x, r, g, b);
set_pixel(y, x, b, g, r);
}
}
}
void get_pixel(U32 y, U32 x, U8 &r, U8 &g, U8 &b) {
U32 offset = (y * height * 3) + (x * 3);
r = this->data[offset + 0];
g = this->data[offset + 1];
b = this->data[offset + 2];
}
void set_pixel(U32 y, U32 x, U8 c1, U8 c2, U8 c3) {
U32 offset = (y * height * 3) + (x * 3);
data[offset] = c1;
data[offset + 1] = c2;
data[offset + 2] = c3;
}
};
// Floating-point reference implementation of the plasma effect.
// For every pixel of dst it sums three sine waves evaluated at the normalised
// coordinates (x / width, y / height) and maps the sum to an RGB colour.
//   t      time/phase offset added inside each wave
//   k1..k5 shape and colour-phase constants
//   k6     accepted but not used
void fx1_plasma(Pbits *dst, F32 t, F32 k1, F32 k2, F32 k3, F32 k4, F32 k5, F32 k6) {
    const U32 rows = dst->height;
    const U32 cols = dst->width;
    for (U32 row = 0; row < rows; ++row) {
        const F32 v = (F32)row / rows;
        for (U32 col = 0; col < cols; ++col) {
            const F32 u = (F32)col / cols;
            // Three waves: horizontal, rotated, and radial around (cx, cy).
            const F32 wave1 = sin(u * k1 + t);
            const F32 wave2 = sin(k1 * (u * sin(t) + v * cos(t / k2)) + t);
            const F32 cx = u + sin(t / k1) * k1;
            const F32 cy = v + sin(t / k2) * k1;
            const F32 wave3 = sin(sqrt(k3 * (cx * cx + cy * cy)) + t);
            const F32 total = wave1 + wave2 + wave3;
            // clamp() already yields a byte; negative lobes saturate to 0.
            const U8 red = clamp(255 * cos(total * pi));
            const U8 green = clamp(255 * sin(total * pi + k4 * pi / k2));
            const U8 blue = clamp(255 * cos(total * pi + k5 * pi / k2));
            dst->set_pixel(row, col, red, green, blue);
        }
    }
}
// -------- Image helpers --------
// Stores v at dst[offset .. offset + 3], least significant byte first.
inline void _write_s32(U8 *dst, S32 offset, S32 v) {
    U8 *out = dst + offset;
    for (S32 byte = 0; byte < 4; ++byte) {
        out[byte] = (U8)(v >> (8 * byte));
    }
}
void write_bmp(Pbits *src, S8 *filename) {
Pbits *dst = new Pbits(src);
dst->to_bgr();
S32 w = dst->width;
S32 h = dst->height;
U8 *img = dst->data;
S32 filesize = 54 + 3 * w * h;
U8 bmpfileheader[14] = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
U8 bmpinfoheader[40] = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
U8 bmppad[3] = {0, 0, 0};
_write_s32(bmpfileheader, 2, filesize);
_write_s32(bmpinfoheader, 4, w);
_write_s32(bmpinfoheader, 8, h);
FILE *f = fopen(filename, "wb");
fwrite(bmpfileheader, 1, 14, f);
fwrite(bmpinfoheader, 1, 40, f);
for (S32 i = 0; i < h; i++) {
fwrite(img + (w * (h - i - 1) * 3), 3, w, f);
fwrite(bmppad, 1, (4 - (w * 3) % 4) % 4, f);
}
delete dst;
}
void write_ppm(Pbits *dst, S8 *filename) {
std::ofstream file(filename, std::ofstream::trunc);
if (!file.is_open()) {
std::cout << "yep! file is not open" << std::endl;
}
file << "P3\n" << dst->width << " " << dst->height << "\n255\n";
U8 r, g, b, a;
for (U32 y = 0; y < dst->height; y++) {
for (U32 x = 0; x < dst->width; x++) {
dst->get_pixel(y, x, r, g, b);
file << (S32)r << " " << (S32)g << " " << (S32)b << "\n";
}
}
}
// Renders one plasma frame, reports how long the rendering took and saves
// the result as both PPM and BMP.
S32 main() {
    // The image adopts the freshly allocated buffer and releases it when it
    // goes out of scope at the end of main.
    Pbits image(WIDTH, HEIGHT, new U8[WIDTH * HEIGHT * 3]);
    init_luts();

    const clock_t started = clock();
    fx1_plasma(&image, 0, 8, 36, 54, 51, 48, 4);
    const clock_t finished = clock();

    const double elapsed_secs = double(finished - started) / CLOCKS_PER_SEC;
    std::cout << "Generated plasma in " << elapsed_secs << "s" << std::endl;

    write_ppm(&image, "plasma.ppm");
    write_bmp(&image, "plasma.bmp");
}
This code will generate this image:
QUESTION: How would you convert this floating-point algorithm into a fast fixed-point one? Right now the basics of fixed-point arithmetic are more or less clear to me, as in:
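fa,fb=floating point values; a,b=their fixed-point representations; M=scaling factor
a = fa*M
b = fb*M
a+b = (fa+fb)*M
a-b = (fa-fb)*M
a*b = (fa*fb)*M^2, so the raw product has to be divided by M to get back to scale M
a/b = fa/fb, so the raw quotient has to be multiplied by M (or a pre-multiplied by M) to get back to scale M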
However, how to use sin/cos/sqrt and similar functions in fixed point is still eluding me. I've found this related thread, but I still don't understand how to use the trigonometric LUTs with arbitrary fp values.
// MSVC-only build settings: link the OpenGL libraries and silence selected warnings.
#ifdef _MSC_VER
#pragma comment(lib,"opengl32.lib")
#pragma comment(lib,"glu32.lib")
// NOTE(review): presumably silences the deprecation warning for calls such as fopen — confirm.
#pragma warning(disable : 4996)
#pragma warning(disable : 26495) // members are deliberately left without default values
#pragma warning(disable :6031)// return value ignored
#endif
#include <Windows.h>
#include <gl/gl.h> //-lopengl32
//#include <gl/glu.h> //-lglu32
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
// Byte and unsigned-integer aliases.
typedef unsigned char U8;
typedef unsigned int U32;
// Number of entries in the sin/cos tables; the lookup macros wrap the index to 10 bits.
#define LUT_SIZE 1024 //1Kb 10 bit sin cos
// Number of entries in the sqrt table (0xFFFFF, one less than 2^20).
#define LUT_SIZE2 0x000fffff //1Mb 20 bit sqrt
// cos/sin sampled at i / pi_k radians for table index i (see init_luts).
float cos_tab[LUT_SIZE];
float sin_tab[LUT_SIZE];
// The same samples scaled by 255 and clamped to a byte, ready to use as colour channels.
U8 clamp_cos_tab[LUT_SIZE];
U8 clamp_sin_tab[LUT_SIZE];
// sqrt of the float whose bit pattern is (index << 12) (see init_luts).
float sqrt_tab[LUT_SIZE2];
const float pi = 3.141592;
// Scale from radians to table index: LUT_SIZE entries per 2 * pi radians.
const float pi_k = LUT_SIZE / (2 * pi);
// Image dimensions in pixels.
const U32 WIDTH = 640; //256
const U32 HEIGHT = 480; //256
struct Pbits;
// Image rendered and displayed by window::render; assigned in main.
Pbits* pdst;
// Saturates a float to the range 0..255 and truncates it to a byte.
U8 clamp(float f)
{
    if (f > 255)
        return 255;
    if (f < 0)
        return 0;
    return (U8)f;
}
// Table lookups taking an angle in radians. The angle is scaled to a table
// index with pi_k and wrapped with a mask derived from LUT_SIZE, so the mask
// follows the table size; LUT_SIZE has to be a power of two for this to work.
#define sin2(f) (sin_tab[(int)(pi_k * (f)) & (LUT_SIZE - 1)])
#define cos2(f) (cos_tab[(int)(pi_k * (f)) & (LUT_SIZE - 1)])
// Same lookups, returning 255 * sin/cos already clamped to a byte.
#define clamp_sin(f) (clamp_sin_tab[(int)(pi_k * (f)) & (LUT_SIZE - 1)])
#define clamp_cos(f) (clamp_cos_tab[(int)(pi_k * (f)) & (LUT_SIZE - 1)])
// Square-root lookup indexed by the upper 20 bits of the float's bit pattern.
// `f` must be a float lvalue (its address is taken) and must not be negative:
// a set sign bit makes the shifted value, and therefore the index, negative.
// NOTE(review): reading a float through an int pointer violates strict aliasing.
#define sqrt2(f) (sqrt_tab[*(int*)&(f) >> 12]) // 32 - 20 bits
void init_luts()
{
for (int i = 0; i < LUT_SIZE; i++)
{
cos_tab[i] = cos(i / pi_k);
sin_tab[i] = sin(i / pi_k);
clamp_cos_tab[i] = clamp(255 * cos(i / pi_k));
clamp_sin_tab[i] = clamp(255 * sin(i / pi_k));
}
for (int i = 0; i < LUT_SIZE2; i++)//init_luts
{
int ii=i<<12; //32-20 bit
float f = *(float *)ⅈ //i to float
sqrt_tab[i] = sqrt(f);
}
}
// Rough square root computed on the float's bit pattern: adds the exponent
// bias back (127 << 23) and halves the result, which halves the exponent.
// Exact for inputs such as 0.25, 1, 4 and 16; only approximate otherwise
// (for example 2.0f gives 1.5f). Intended for non-negative finite inputs.
// Source: https://www.codeproject.com/Articles/69941/Best-Square-Root-Method-Algorithm-Function-Precisi
float sqrt3(float x)
{
    // memcpy is the well-defined way to reinterpret the bits; casting
    // float* to unsigned* and dereferencing violates strict aliasing.
    unsigned int bits;
    memcpy(&bits, &x, sizeof bits);
    bits += (127u << 23);
    bits >>= 1;
    float result;
    memcpy(&result, &bits, sizeof result);
    return result;
}
// Fast approximation of the RECIPROCAL square root, 1 / sqrt(x), despite the
// name: a bit-level initial guess refined by one Newton step.
// Intended for positive finite inputs.
// Source: https://stackoverflow.com/questions/1349542/john-carmacks-unusual-fast-inverse-square-root-quake-iii
float sqrt4(float x)
{
    const float xhalf = 0.5f * x;
    // memcpy is the well-defined way to reinterpret the bits; casting
    // float* to int* and dereferencing violates strict aliasing.
    int i;
    memcpy(&i, &x, sizeof i);        // get bits for floating value
    i = 0x5f375a86 - (i >> 1);       // gives initial guess y0
    float y;
    memcpy(&y, &i, sizeof y);        // convert bits back to float
    y = y * (1.5f - xhalf * y * y);  // Newton step, repeating increases accuracy
    return y;
}
struct Pbits
{
int width;
int height;
U8* data;
Pbits(int _width, int _height, U8* _data = 0)
{
width = _width;
height = _height;
if (!_data)
_data = (U8*)calloc(width * height * 3, 1);
data = _data;
}
~Pbits() { free(data); }
void set_pixel(int y, int x, U8 c1, U8 c2, U8 c3)
{
int offset = (y * width * 3) + (x * 3);
data[offset] = c1;
data[offset + 1] = c2;
data[offset + 2] = c3;
}
void save(const char* filename)
{
U8 pp[54] = { 'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0 ,
40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0 };
*(int*)(pp + 2) = 54 + 3 * width * height;
*(int*)(pp + 18) = width;
*(int*)(pp + 22) = height;
int size = height * width * 3;
U8* p = data;
for (int i = 0; i < size; i += 3)//to_bgr()
{
U8 tmp = p[i];
p[i] = p[i + 2];
p[i + 2] = tmp;
}
FILE* f = fopen(filename, "wb");
fwrite(pp, 1, 54, f);
fwrite(data, size, 1, f);
fclose(f);
for (int i = 0; i < size; i += 3)//to_rgb()
{
U8 tmp = p[i];
p[i] = p[i + 2];
p[i + 2] = tmp;
}
}
};
// Reference plasma renderer using the math-library sin/cos/sqrt per pixel.
// For every pixel of dst it sums three sine waves evaluated at the normalised
// coordinates (x / width, y / height) and maps the sum to an RGB colour.
//   t      time/phase offset added inside each wave
//   k1..k5 shape and colour-phase constants
//   k6     accepted but not used
void fn_plasma_slow(Pbits& dst, float t,
    float k1, float k2, float k3, float k4, float k5, float k6)
{
    const int rows = dst.height;
    const int cols = dst.width;
    for (int row = 0; row < rows; ++row)
    {
        const float v = (float)row / rows;
        for (int col = 0; col < cols; ++col)
        {
            const float u = (float)col / cols;
            // Three waves: horizontal, rotated, and radial around (cx, cy).
            const float wave1 = sin(u * k1 + t);
            const float wave2 = sin(k1 * (u * sin(t) + v * cos(t / k2)) + t);
            const float cx = u + sin(t / k1) * k1;
            const float cy = v + sin(t / k2) * k1;
            const float wave3 = sin(sqrt(k3 * (cx * cx + cy * cy)) + t);
            const float total = wave1 + wave2 + wave3;
            // clamp() already yields a byte; negative lobes saturate to 0.
            const U8 red = clamp(255 * cos(total * pi));
            const U8 green = clamp(255 * sin(total * pi + k4 * pi / k2));
            const U8 blue = clamp(255 * cos(total * pi + k5 * pi / k2));
            dst.set_pixel(row, col, red, green, blue);
        }
    }
}
// Table-driven plasma renderer: same formula as fn_plasma_slow, but every
// sin/cos/sqrt goes through the lookup macros and all terms that do not
// depend on the pixel are computed once up front. Pixels are written
// sequentially through a running pointer. k6 is accepted but not used.
void fn_plasma_fast(Pbits& dst, float t,
    float k1, float k2, float k3,
    float k4, float k5, float k6)
{
    U8* out = dst.data;
    const float rows = dst.height;
    const float cols = dst.width;
    // Per-frame constants.
    const float green_phase = pi * k4 / k2;
    const float blue_phase = pi * k5 / k2;
    const float centre_x = sin2(t / k1) * k1;
    const float centre_y = sin2(t / k2) * k1;
    const float sin_t = sin2(t);
    const float cos_t_k2 = cos2(t / k2);

    for (float row = 0; row < rows; row++)
    {
        const float v = row / rows;
        const float cy = v + centre_y;
        for (float col = 0; col < cols; col++)
        {
            const float u = col / cols;
            const float wave1 = sin2(k1 * u + t);
            const float wave2 = sin2(k1 * (u * sin_t + v * cos_t_k2) + t);
            const float cx = u + centre_x;
            // Left non-const because sqrt2 takes the address of its argument.
            float radius_sq = k3 * (cx * cx + cy * cy);
            const float wave3 = sin2(sqrt2(radius_sq) + t);
            const float total = pi * (wave1 + wave2 + wave3);
            *out++ = clamp_cos(total);               // red
            *out++ = clamp_sin(total + green_phase); // green
            *out++ = clamp_cos(total + blue_phase);  // blue
        }
    }
}
// Plasma renderer that additionally precomputes every per-column and per-row
// term into small tables, leaving two sine lookups, one sqrt lookup and three
// colour lookups per pixel. Computes the same formula as fn_plasma_fast.
// The tables are function-local statics, so the function is not reentrant.
// Images wider or taller than the tables are rendered by fn_plasma_fast
// instead of overrunning them. k6 is accepted but not used.
void fn_plasma_fast2(Pbits& dst, float t,
    float k1, float k2, float k3,
    float k4, float k5, float k6)
{
    enum { MAX_DIM = 1024 }; // capacity of the per-column / per-row tables
    if (dst.width > MAX_DIM || dst.height > MAX_DIM)
    {
        fn_plasma_fast(dst, t, k1, k2, k3, k4, k5, k6);
        return;
    }

    U8* p = dst.data;
    static float data_v1[MAX_DIM];  // first wave, per column
    static float data_cx[MAX_DIM];  // k3 * cx^2, per column
    static float data_cy[MAX_DIM];  // k3 * cy^2, per row
    static float data_xx3[MAX_DIM]; // column part of the second wave's argument
    static float data_yy3[MAX_DIM]; // row part of the second wave's argument, plus t
    const float height = dst.height;
    const float width = dst.width;
    const float green_phase = pi * k4 / k2;
    const float blue_phase = pi * k5 / k2;
    const float centre_x = sin2(t / k1) * k1;
    const float centre_y = sin2(t / k2) * k1;
    const float x_step = sin2(t) / width * k1;
    const float y_step = cos2(t / k2) / height * k1;

    for (int x = 0; x < width; x++)
    {
        data_v1[x] = sin2(k1 * x / width + t);
        const float f = x / width + centre_x;
        data_cx[x] = k3 * f * f;
        data_xx3[x] = x * x_step;
    }
    for (int y = 0; y < height; y++)
    {
        const float f = y / height + centre_y;
        data_cy[y] = k3 * f * f;
        // The second wave is sin(k1 * (...) + t); the phase t is folded into
        // the row term so the per-pixel sum carries it.
        data_yy3[y] = y * y_step + t;
    }
    for (int y = 0; y < height; y++)
        for (int x = 0; x < width; x++)
        {
            // Kept as a named variable because sqrt2 takes its address.
            float aa1 = data_cx[x] + data_cy[y];
            const float vf = pi * (data_v1[x]
                + sin2(data_xx3[x] + data_yy3[y])
                + sin2(sqrt2(aa1) + t));
            *p++ = clamp_cos(vf);               // red
            *p++ = clamp_sin(vf + green_phase); // green
            *p++ = clamp_cos(vf + blue_phase);  // blue
        }
}
struct window
{
int x, y, width, height; //iç x y +en boy
HINSTANCE hist; // program kaydı
HWND hwnd; // window
HDC hdc; // device context
HGLRC hrc; // opengl context
//WNDPROC fn_pencere; // pencere fonksiyonu
WNDCLASS wc; // pencere sınıfı
PIXELFORMATDESCRIPTOR pfd;
window(int _width = 256, int _height = 256)
{
memset(this, 0, sizeof(*this));
x = 100;
y = 100;
width = _width;
height = _height;
//HINSTANCE
hist = GetModuleHandle(NULL);
//WNDCLASS
wc.lpfnWndProc = (WNDPROC)fn_window;
wc.hInstance = hist;
wc.hIcon = LoadIcon(0, IDI_WINLOGO);
wc.hCursor = LoadCursor(0, IDC_ARROW);
wc.lpszClassName = "opengl";
RegisterClass(&wc);
//HWND
hwnd = CreateWindow("opengl", "test",
WS_OVERLAPPEDWINDOW,
x, y, width + 16, height + 39,
NULL, NULL, hist, NULL);
//HDC
hdc = GetDC(hwnd);
//PFD
pfd.nSize = sizeof(pfd);
pfd.nVersion = 1;
pfd.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;
pfd.iPixelType = PFD_TYPE_RGBA;
pfd.cColorBits = 32;
int pf = ChoosePixelFormat(hdc, &pfd);
SetPixelFormat(hdc, pf, &pfd);
DescribePixelFormat(hdc, pf, sizeof(PIXELFORMATDESCRIPTOR), &pfd);
//HRC
hrc = wglCreateContext(hdc);
wglMakeCurrent(hdc, hrc);
ShowWindow(hwnd, SW_SHOW);
SetFocus(hwnd);
}
~window()
{
if (hrc)
wglMakeCurrent(NULL, NULL),
wglDeleteContext(hrc);
if (hdc) ReleaseDC(hwnd, hdc);
if (hwnd) DestroyWindow(hwnd);
if (hist) UnregisterClass("opengl", hist);
}
void run()
{
MSG msg;
BOOL dongu = true;
while (dongu)
{
if (PeekMessage(&msg, NULL, 0, 0, PM_REMOVE))
{
if (msg.message == WM_QUIT) dongu = 0;
else
{
TranslateMessage(&msg);
DispatchMessage(&msg);
}
}
else
{
render();
SwapBuffers(hdc);
}
}
}
static int __stdcall fn_window(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam)
{
switch (msg)
{
//case WM_CREATE: {} break;
//case WM_COMMAND: {} break;
//case WM_PAINT: {} break;
case WM_CLOSE: { DestroyWindow(hwnd); }break;
case WM_DESTROY: {PostQuitMessage(0); }break;
}
return DefWindowProc(hwnd, msg, wParam, lParam);
}
static void render()
{
//OPENGL 1.0
//glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
//glMatrixMode(GL_PROJECTION); glLoadIdentity();
//glMatrixMode(GL_MODELVIEW); glLoadIdentity();
static float t; t += 0.02;
fn_plasma_fast2(*pdst, t, 8, 36, 54, 51, 48, 4);//FAST
glRasterPos3f(-1, -1, 0);
glDrawPixels(WIDTH, HEIGHT, GL_RGB, GL_UNSIGNED_BYTE, pdst->data);
}
};
// Times the three plasma renderers (in clock() ticks), saves each result as a
// BMP file, then opens a window that animates the fastest version.
int main()
{
    Pbits dst(WIDTH, HEIGHT);
    pdst = &dst;
    init_luts();

    // clock() returns clock_t; keep that type and print it through a cast to
    // long so the format specifier matches the argument.
    clock_t begin;

    begin = clock();
    fn_plasma_slow(dst, 0, 8, 36, 54, 51, 48, 4);
    printf("fn_plasma_slow: %4ld\n", (long)(clock() - begin));
    dst.save("plasma_slow.bmp");

    begin = clock();
    fn_plasma_fast(dst, 0, 8, 36, 54, 51, 48, 4);
    printf("fn_plasma_fast: %4ld\n", (long)(clock() - begin));
    dst.save("plasma_fast.bmp");

    begin = clock();
    fn_plasma_fast2(dst, 0, 8, 36, 54, 51, 48, 4);
    printf("fn_plasma_fast2: %4ld\n", (long)(clock() - begin));
    dst.save("plasma_fast2.bmp");

    window win(WIDTH, HEIGHT);
    win.run();
    return 0;
}