Hello, I am not very experienced in asm, and I would like to rewrite Carmack's inverse square root C routine in assembly.
; float InvSqrt (float x){
;
; 32-bit x86 / x87 listing for the C routine quoted in the comments below.
; Frame layout, as used by the code:
;   [ebp+8] = x      (float argument; overwritten with the estimate later on)
;   [ebp-4] = xhalf
;   [ebp-8] = i
; Out:   the result is left in st(0).
; Clobb: eax, ecx, edx, flags.
; NOTE(review): the data at @174 and @174+4 is not part of this listing; going
; by the C source it is presumably 0.5f and 1.5f -- confirm.
;
@173:
push ebp
mov ebp,esp
; Reserve 8 bytes of stack for the two locals (xhalf and i).
add esp,-8
;
; float xhalf = 0.5f*x;
;
; st(0) = [@174] * x, stored to xhalf and popped, so the FPU stack is empty again.
fld dword ptr [@174]
fmul dword ptr [ebp+8]
fstp dword ptr [ebp-4]
;
; int i = *(int*)&x;
;
; Plain 32-bit copy of the float's bit pattern; no numeric conversion happens.
mov eax,dword ptr [ebp+8]
mov dword ptr [ebp-8],eax
;
; i = 0x5f3759df - (i>>1);
;
; i is reloaded from the slot it was just written to.
; sar is an arithmetic shift, matching >> on a signed int.
; 1597463007 is 0x5f3759df in decimal.
mov edx,dword ptr [ebp-8]
sar edx,1
mov ecx,1597463007
sub ecx,edx
mov dword ptr [ebp-8],ecx
;
; x = *(float*)&i;
;
; The adjusted bit pattern is copied back over the argument slot, again as raw bits.
mov eax,dword ptr [ebp-8]
mov dword ptr [ebp+8],eax
;
; x = x*(1.5f - xhalf*x*x);
;
; st(0) = xhalf, then xhalf*x, then xhalf*x*x.
; fsubr with a memory operand computes st(0) = [mem] - st(0).
; The product with x is stored back to the argument slot and popped.
fld dword ptr [ebp-4]
fmul dword ptr [ebp+8]
fmul dword ptr [ebp+8]
fsubr dword ptr [@174+4]
fmul dword ptr [ebp+8]
fstp dword ptr [ebp+8]
;
; return x;
;
; The value just stored is loaded again so that it is in st(0) at the ret.
fld dword ptr [ebp+8]
;
; }
;
; @176 and @175 are not referenced anywhere in this listing.
@176:
@175:
; The two pops discard the 8 bytes of locals (esp += 8); ecx is only a dummy target.
pop ecx
pop ecx
pop ebp
ret
This is what the compiler generated, but I would like to optimise it and rewrite it as an asm routine
(the generated code is far from optimal, I think - it mixes FPU and integer operations; maybe a rewrite by a careful person would improve it a lot)
How can it be optimized?
edit:
in answer to @harold:
there is an improvement:
1.0/sqrt(100.0) takes 140 cycles on my old machine
InvSqrt - the C version - takes 44 cycles (though the accuracy is not stunning)
the asm answer below works the same as the C version and takes 29 cycles
(measurements may be somewhat approximate but generally seem OK IMO; done with rdtsc around a 1000-iteration for loop, then dividing the total: 140000/1000 = 140 cycles, 29000/1000 = 29 cycles, and so on)
Many of those moves to/from memory aren't really necessary. This probably isn't too much of an improvement though (especially not compared to not doing any of this in the first place and just using SSE).
Not tested:
; Body only: no prologue, epilogue or constant definitions are included here.
; Assumes an ebp-based frame with the float argument `number` at [ebp+8] and a
; free dword of scratch space at [ebp-4].
; NOTE(review): `half` and `threehalfs` are not defined in this snippet;
; presumably they label the float constants 0.5 and 1.5 -- confirm.
; The result ends up in st(0). eax, edx and flags are overwritten.
; FPU stack diagrams below are written as: st(0) | st(1) | st(2)
;
; i = 0x5f3759df - (reinterpret_cast<int32>(number) >> 1)
mov eax, dword ptr [ebp+8]
sar eax,1
mov edx, 0x5f3759df
sub edx, eax
; fld takes a memory operand, so i is stored to the scratch slot first.
mov dword ptr [ebp-4], edx
; y = reinterpret_cast<float>(i)
; stack: y
fld dword ptr [ebp-4]
; x2 = number * half
; stack after fld:  number | y
; stack after fmul: x2 | y
fld dword ptr [ebp+8]
fmul dword ptr [half]
; (x2 * y) * y
; stack after 1st fmul: x2*y | y
; stack after 2nd fmul: x2*y*y | y
fmul st(0), st(1)
fmul st(0), st(1)
; threehalfs - (stuff)
; stack after fld:    threehalfs | x2*y*y | y
; fsubrp st(1), st(0) computes st(1) = st(0) - st(1), then pops.
; stack after fsubrp: threehalfs - x2*y*y | y
fld dword ptr [threehalfs]
fsubrp st(1), st(0)
; y * (stuff)
; fmulp st(1), st(0) computes st(1) = st(1) * st(0), then pops.
; stack: y*(threehalfs - x2*y*y)
fmulp st(1), st(0)
It shouldn't really be too hard to follow, but I'll make some stack diagrams if you want them.