Search code examples
assemblyx86sseraytracingfpu

some intersectRaySphere c procedure optymization through rewriting it to x86 asm (How?)


Hullo, I've got not to much knowledge in assembly, and i am thinking how to optimize it by rewriting this in x86 (32-bit fpu or sse2) assembly, thing should be optimized - rewritten in correct assembly, then I will test if I've got some speed up (length() and dot() should be written in asm here also) This code is used by my simple real time ray-tracer and it works - but I am not much good in asm optimizations)

    inline float intersectRaySphere(float3* rO, float3* rV, float3* sO, float sR)
   {
    static float3 Q;

    Q = sub(sO,rO);
    float c = length(&Q);
    float v = dot(&Q,rV);
    float d = sR*sR - (c*c - v*v);

    // If there was no intersection, return -1
    if (d < 0.0) return (-1.0f);

    // Return the distance to the [first] intersecting point
    return (v - sqrt(d));
    }

Thank You in Advance

//edits

    struct float3
    {
     float x;
     float y;
     float z;
    };


    inline float length(float3* v) {
     return sqrt( (v->x)*(v->x) + (v->y)*(v->y) + (v->z)*(v->z) );
    }

   inline float dot(float3* a, float3* b) {
     return (*a).x * (*b).x + (*a).y * (*b).y + (*a).z * (*b).z;
   }

and demo exe (unoptymized in not even so much optymized c):

dl.dropbox.com/u/42887985/re29.zip

Maybe someone could give me a somewhat good fpu asm routines for length dot (or normalize not shown here) ?? Though whole procedure for intersect procedure would be the best ;-)


Solution

  • This isn't a "nice" function to convert to SSE. Almost nothing is actually parallel. So let's change the function to intersect 4 rays at once. And it would help if the rays were stores in SOA (struct of arrays) instead of AOS (array of structs) as well.

    With those changes, it might become something like this (not tested in any way):

    inline void intersect4RaysSphere(
     float* rOx, float* rOy, float* rOz,
     float* rVx, float* rVy, float* rVz,
     float sOx, float sOy, float sOz,
     float sR)
    {
        // calculate Q
        movss xmm0, sOx
        movss xmm1, sOy
        movss xmm2, sOz
        shufps xmm0, xmm0, 0
        shufps xmm1, xmm1, 0
        shufps xmm2, xmm2, 0
        subps xmm0, [rOx]
        subps xmm1, [rOy]
        subps xmm2, [rOz]
        // calculate pow(dot(Q, rV), 2) in xmm3
        movaps xmm3, [rVx]
        movaps xmm4, [rVy]
        movaps xmm5, [rVz]
        mulps xmm3, xmm0
        mulps xmm4, xmm1
        mulps xmm5, xmm2
        addps xmm3, xmm4
        addps xmm3, xmm5
        movaps xmm4, xmm3
        mulps xmm3, xmm3
        // calculate pow(length(Q), 2)
        // there's no point in taking the square root only to then square it
        mulps xmm0, xmm0
        mulps xmm1, xmm1
        mulps xmm2, xmm2
        addps xmm0, xmm1
        addps xmm0, xmm2
        // calculate d
        movss xmm1, sR
        mulss xmm1, xmm1
        shufps xmm1, xmm1, 0
        subps xmm0, xmm3
        subps xmm1, xmm0
        sqrtps xmm1, xmm1
        // test for intersection
        // at this point:
        // xmm3 = v * v
        // xmm4 = v
        // xmm1 = sqrt(d)
        movaps xmm0, [minus1]  // memory location with { -1.0, -1.0, -1.0, -1.0 }
        subps xmm4, xmm1
        // get a mask of d's smaller than 0.0
        psrad xmm1, 31
        // select -1 if less than zero or v*v - d if >= 0
        andps xmm0, xmm1
        andnps xmm1, xmm4
        orps xmm0, xmm1
        ret
    }
    

    Version with intrinsics (only slightly tested - it's compilable and seems to generate OK assembly):

    __m128 intersect4RaysSphere(
         float* rOx, float* rOy, float* rOz,
         float* rVx, float* rVy, float* rVz,
         float sOx, float sOy, float sOz,
         float sR)
    {
        __m128 Qx = _mm_sub_ps(_mm_set1_ps(sOx), _mm_load_ps(rOx));
        __m128 Qy = _mm_sub_ps(_mm_set1_ps(sOy), _mm_load_ps(rOy));
        __m128 Qz = _mm_sub_ps(_mm_set1_ps(sOz), _mm_load_ps(rOz));
        __m128 v = _mm_add_ps(_mm_mul_ps(Qx, _mm_load_ps(rVx)),
                   _mm_add_ps(_mm_mul_ps(Qy, _mm_load_ps(rVy)),
                              _mm_mul_ps(Qz, _mm_load_ps(rVz))));
        __m128 vsquared = _mm_mul_ps(v, v);
        __m128 lengthQsquared = _mm_add_ps(_mm_mul_ps(Qx, Qx),
                                _mm_add_ps(_mm_mul_ps(Qy, Qy),
                                           _mm_mul_ps(Qz, Qz)));
        __m128 sr = _mm_set1_ps(sR);
        __m128 d = _mm_sub_ps(_mm_mul_ps(sr, sr), _mm_sub_ps(lengthQsquared, vsquared));
        __m128 mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(d), 31));
        //__m128 result = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-1.0f), mask),
                                  _mm_andnot_ps(mask, _mm_sub_ps(vsquared, d)));
        __m128 result = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-1.0f), mask),
                                  _mm_andnot_ps(mask, _mm_sub_ps(v, _mm_sqrt_ps(d))));
        return result;
    }