Hullo, i ve got some c procedure :
inline float intersectRaySphere(float3* rayO, float3* rayV, float3* sO, float sR)
{
static float3 Q;
Q = sub(sO,rayO);
float cc = dot(&Q,&Q);
float v = dot(&Q,rayV);
float d = sR*sR - (cc - v*v);
// If there was no intersection, return -1
if (d < 0.0) return (-1.0f);
// Return the distance to the [first] intersecting point
return (v - sqrt(d));
}
I was trying to rewrite it in x86 fpu asm and create such one
_asm_intersectRaySphere:; Function begin
push ebp ; 0000 _ 55
mov ebp, esp ; 0001 _ 89. E5
add esp, -20 ; 0003 _ 83. C4, EC
mov eax, dword [ebp+8H] ; 0006 _ 8B. 45, 08
mov ecx, dword [ebp+0CH] ; 0009 _ 8B. 4D, 0C
mov edx, dword [ebp+10H] ; 000C _ 8B. 55, 10
fld dword [edx] ; 000F _ D9. 02
fsub dword [eax] ; 0011 _ D8. 20
fld dword [edx+4H] ; 0013 _ D9. 42, 04
fsub dword [eax+4H] ; 0016 _ D8. 60, 04
fld dword [edx+8H] ; 0019 _ D9. 42, 08
fsub dword [eax+8H] ; 001C _ D8. 60, 08
fld st2 ; 001F _ D9. C2
fmul st0, st(0) ; 0021 _ DC. C8
fld st2 ; 0023 _ D9. C2
fmul st0, st(0) ; 0025 _ DC. C8
fld st2 ; 0027 _ D9. C2
fmul st0, st(0) ; 0029 _ DC. C8
faddp st1, st(0) ; 002B _ DE. C1
faddp st1, st(0) ; 002D _ DE. C1
fld dword [ecx] ; 002F _ D9. 01
fmul st(0), st4 ; 0031 _ D8. CC
fld dword [ecx+4H] ; 0033 _ D9. 41, 04
fmul st(0), st4 ; 0036 _ D8. CC
fld dword [ecx+8H] ; 0038 _ D9. 41, 08
fmul st(0), st4 ; 003B _ D8. CC
faddp st1, st(0) ; 003D _ DE. C1
faddp st1, st(0) ; 003F _ DE. C1
fst dword [ebp-4H] ; 0041 _ D9. 55, FC
fmul st0, st(0) ; 0044 _ DC. C8
fld dword [ebp+14H] ; 0046 _ D9. 45, 14
fmul st0, st(0) ; 0049 _ DC. C8
faddp st1, st(0) ; 004B _ DE. C1
fsubrp st1, st(0) ; 004D _ DE. E1
fxch st3 ; 004F _ D9. CB
fstp st0 ; 0051 _ DD. D8
fstp st0 ; 0053 _ DD. D8
fstp st0 ; 0055 _ DD. D8
ftst ; 0057 _ D9. E4
fwait ; 0059 _ 9B
fnstsw ax ; 005A _ DF. E0
fwait ; 005C _ 9B
sahf ; 005D _ 9E
jc ?_001 ; 005E _ 72, 07
fsqrt ; 0060 _ D9. FA
fsubr dword [ebp-4H] ; 0062 _ D8. 6D, FC
jmp ?_002 ; 0065 _ EB, 06
?_001:
fstp st0 ; 0067 _ DD. D8
fld1 ; 0069 _ D9. E8
fchs ; 006B _ D9. E0
?_002:
mov esp, ebp ; 006D _ 89. EC
pop ebp ; 006F _ 5D
ret ; 0070 _ C3
; _asm_intersectRaySphere End of function
tested, and it is working ok, c routine takes about 150 cycles (on my 6 or 7 years old old pentium 4), my asm routine takes about 66 cycles (*) - so it is good improvement, but maybe it can be also yet improved a little ?
tnx
(*) i was testing with not much care, on random input data so possibly it was a 'non intersection' cause - with no sqrt involved
I would replace this:
fstp st0
fstp st0
fstp st0
ftst
fwait
fnstsw ax
fwait
sahf
jc ?__001
By this:
fcompp
fstp st0
fldz
fcomip st0, st1
ja ?__001
fnstsw
isn't fast, and sahf
isn't great either, especially not on P4's. If you can't use fcomi
(ie if it has to work on P1 or PMMX), you can still skip the sahf
by testing a bit in ax
directly.