Hullo, I've got not to much knowledge in assembly, and i am thinking how to optimize it by rewriting this in x86 (32-bit fpu or sse2) assembly, thing should be optimized - rewritten in correct assembly, then I will test if I've got some speed up (length() and dot() should be written in asm here also) This code is used by my simple real time ray-tracer and it works - but I am not much good in asm optimizations)
inline float intersectRaySphere(float3* rO, float3* rV, float3* sO, float sR)
{
static float3 Q;
Q = sub(sO,rO);
float c = length(&Q);
float v = dot(&Q,rV);
float d = sR*sR - (c*c - v*v);
// If there was no intersection, return -1
if (d < 0.0) return (-1.0f);
// Return the distance to the [first] intersecting point
return (v - sqrt(d));
}
Thank You in Advance
//edits
struct float3
{
float x;
float y;
float z;
};
inline float length(float3* v) {
return sqrt( (v->x)*(v->x) + (v->y)*(v->y) + (v->z)*(v->z) );
}
inline float dot(float3* a, float3* b) {
return (*a).x * (*b).x + (*a).y * (*b).y + (*a).z * (*b).z;
}
and demo exe (unoptymized in not even so much optymized c):
dl.dropbox.com/u/42887985/re29.zip
Maybe someone could give me a somewhat good fpu asm routines for length dot (or normalize not shown here) ?? Though whole procedure for intersect procedure would be the best ;-)
This isn't a "nice" function to convert to SSE. Almost nothing is actually parallel. So let's change the function to intersect 4 rays at once. And it would help if the rays were stores in SOA (struct of arrays) instead of AOS (array of structs) as well.
With those changes, it might become something like this (not tested in any way):
inline void intersect4RaysSphere(
float* rOx, float* rOy, float* rOz,
float* rVx, float* rVy, float* rVz,
float sOx, float sOy, float sOz,
float sR)
{
// calculate Q
movss xmm0, sOx
movss xmm1, sOy
movss xmm2, sOz
shufps xmm0, xmm0, 0
shufps xmm1, xmm1, 0
shufps xmm2, xmm2, 0
subps xmm0, [rOx]
subps xmm1, [rOy]
subps xmm2, [rOz]
// calculate pow(dot(Q, rV), 2) in xmm3
movaps xmm3, [rVx]
movaps xmm4, [rVy]
movaps xmm5, [rVz]
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
addps xmm3, xmm4
addps xmm3, xmm5
movaps xmm4, xmm3
mulps xmm3, xmm3
// calculate pow(length(Q), 2)
// there's no point in taking the square root only to then square it
mulps xmm0, xmm0
mulps xmm1, xmm1
mulps xmm2, xmm2
addps xmm0, xmm1
addps xmm0, xmm2
// calculate d
movss xmm1, sR
mulss xmm1, xmm1
shufps xmm1, xmm1, 0
subps xmm0, xmm3
subps xmm1, xmm0
sqrtps xmm1, xmm1
// test for intersection
// at this point:
// xmm3 = v * v
// xmm4 = v
// xmm1 = sqrt(d)
movaps xmm0, [minus1] // memory location with { -1.0, -1.0, -1.0, -1.0 }
subps xmm4, xmm1
// get a mask of d's smaller than 0.0
psrad xmm1, 31
// select -1 if less than zero or v*v - d if >= 0
andps xmm0, xmm1
andnps xmm1, xmm4
orps xmm0, xmm1
ret
}
Version with intrinsics (only slightly tested - it's compilable and seems to generate OK assembly):
__m128 intersect4RaysSphere(
float* rOx, float* rOy, float* rOz,
float* rVx, float* rVy, float* rVz,
float sOx, float sOy, float sOz,
float sR)
{
__m128 Qx = _mm_sub_ps(_mm_set1_ps(sOx), _mm_load_ps(rOx));
__m128 Qy = _mm_sub_ps(_mm_set1_ps(sOy), _mm_load_ps(rOy));
__m128 Qz = _mm_sub_ps(_mm_set1_ps(sOz), _mm_load_ps(rOz));
__m128 v = _mm_add_ps(_mm_mul_ps(Qx, _mm_load_ps(rVx)),
_mm_add_ps(_mm_mul_ps(Qy, _mm_load_ps(rVy)),
_mm_mul_ps(Qz, _mm_load_ps(rVz))));
__m128 vsquared = _mm_mul_ps(v, v);
__m128 lengthQsquared = _mm_add_ps(_mm_mul_ps(Qx, Qx),
_mm_add_ps(_mm_mul_ps(Qy, Qy),
_mm_mul_ps(Qz, Qz)));
__m128 sr = _mm_set1_ps(sR);
__m128 d = _mm_sub_ps(_mm_mul_ps(sr, sr), _mm_sub_ps(lengthQsquared, vsquared));
__m128 mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(d), 31));
//__m128 result = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-1.0f), mask),
_mm_andnot_ps(mask, _mm_sub_ps(vsquared, d)));
__m128 result = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-1.0f), mask),
_mm_andnot_ps(mask, _mm_sub_ps(v, _mm_sqrt_ps(d))));
return result;
}