Carmack's InvSqrt in asm

Hello, I am not very experienced in asm and I would like to rewrite Carmack's inverse square root C routine in assembly

    ;   float InvSqrt (float x){
    ;
    ;   Frame layout used below:
    ;       [ebp+8]  x      (argument slot, reused to hold the estimate)
    ;       [ebp-4]  xhalf
    ;   The result is returned in st(0).
    ;
    @173:
    push      ebp
    mov       ebp,esp
    sub       esp,4
    ;
    ;       float xhalf = 0.5f*x;
    ;
    fld       dword ptr [@174]
    fmul      dword ptr [ebp+8]
    fstp      dword ptr [ebp-4]
    ;
    ;       int i = *(int*)&x;
    ;       i = 0x5f3759df - (i>>1);
    ;       x = *(float*)&i;
    ;
    ;   i lives only in registers; the new bit pattern is written
    ;   straight back over x.
    ;
    mov       edx,dword ptr [ebp+8]
    sar       edx,1
    mov       ecx,1597463007          ; 0x5f3759df
    sub       ecx,edx
    mov       dword ptr [ebp+8],ecx
    ;
    ;       x = x*(1.5f - xhalf*x*x);
    ;
    fld       dword ptr [ebp-4]
    fmul      dword ptr [ebp+8]
    fmul      dword ptr [ebp+8]
    fsubr     dword ptr [@174+4]
    fmul      dword ptr [ebp+8]
    fstp      dword ptr [ebp+8]       ; store rounds the product to a 32-bit float
    ;
    ;        return x;
    ;
    fld       dword ptr [ebp+8]
    ;
    ;    }
    ;
    @176:
    @175:
    mov       esp,ebp                 ; release the local
    pop       ebp
    ret

Here is what the compiler generated, but I would like to optimise it and rewrite it as an asm routine

(The generated code is far from optimal, I think - it mixes FPU with integer operations; maybe a rewrite by a knowledgeable person would improve it a lot.)

How can it be optimized?

edit:

as to answer @harold

there is an improvement:

1.0/sqrt(100.0) takes 140 cycles on my old machine
InvSqrt - C version - takes 44 cycles (though its accuracy is not stunning)
The answer below in asm works the same as the C version and takes 29 cycles

(The measurements may be somewhat approximate but generally seem OK, IMO; they were done with rdtsc around a 1000-iteration for loop and then dividing the total: 140000/1000 = 140 cycles, 29000/1000 = 29 cycles, and so on.)

Solution

Many of those moves to/from memory aren't really necessary. This probably isn't too much of an improvement though (especially not compared to not doing any of this in the first place and just using SSE).

Not tested:

; Body fragment only: it reads the float argument at [ebp+8], uses [ebp-4]
; as a 4-byte scratch slot, and leaves the result in st(0). The frame setup
; and the constants `half` / `threehalfs` are not shown here.
;
; i = 0x5f3759df - (bit pattern of number, as a signed int32, >> 1)
mov edx, 0x5f3759df
mov eax, dword ptr [ebp+8]
sar eax, 1
sub edx, eax
; y = bit pattern of i read back as a float (goes through memory to reach the FPU)
mov dword ptr [ebp-4], edx
fld dword ptr [ebp-4]            ; st0 = y
; x2 = number * 0.5f
fld dword ptr [ebp+8]            ; st0 = number, st1 = y
fmul dword ptr [half]            ; st0 = x2,     st1 = y
; (x2 * y) * y
fmul st(0), st(1)                ; st0 = x2*y
fmul st(0), st(1)                ; st0 = x2*y*y
; 1.5f - (x2 * y * y), subtracting st0 from the memory constant in place
fsubr dword ptr [threehalfs]     ; st0 = threehalfs - x2*y*y, st1 = y
; y * (1.5f - x2 * y * y)
fmulp st(1), st(0)               ; st0 = result

It shouldn't really be too hard to follow, but I'll make some stack diagrams if you want them.