Search code examples
algorithmassemblyvisual-c++x86signed

Signed 64-Bit multiply and 128-Bit Divide on x86 in assembly


I have 2 functions written in assembly (masm) in visual studio that i use in my C++ project. They are an Unsigned 64-Bit multiply function that produces a 128-Bit result, and a Unsigned 128-Bit divide function that produces a 128-Bit Quotient and returns a 32-Bit Remainder.

What i need is a signed version of the functions but I'm not sure how to do it.

Below is the code of the .asm file with the Unsigned functions:

.MODEL flat, stdcall
.CODE

MUL64 PROC, A:QWORD, B:QWORD, pu128:DWORD
push EAX
push EDX
push EBX
push ECX
push EDI
mov EDI,pu128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B
MUL EDX
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B+4
MUL EDX
ADD EAX,ECX
ADC EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B
MUL EDX
ADD EAX,EBX
ADC ECX,EDX
PUSHFD ; Save carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B+4
MUL EDX
POPFD ; Retrieve carry from above.
ADC EAX,ECX
ADC EDX,0
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
pop EDI
pop ECX
pop EBX
pop EDX
pop EAX
ret 20
MUL64 ENDP

IMUL64 PROC, A:SQWORD, B:SQWORD, pi128:DWORD
; How to make this work?
ret 20
IMUL64 ENDP

DIV128 PROC, pDividend128:DWORD, Divisor:DWORD, pQuotient128:DWORD
push EDX
push EBX
push ESI
push EDI
MOV ESI,pDividend128
MOV EDI,pQuotient128
MOV EBX,Divisor
XOR EDX,EDX
MOV EAX,[ESI+12]
DIV EBX
MOV [EDI+12],EAX
MOV EAX,[ESI+8]
DIV EBX
MOV [EDI+8],EAX
MOV EAX,[ESI+4]
DIV EBX
MOV [EDI+4],EAX
MOV EAX,[ESI]
DIV EBX
MOV [EDI],EAX
MOV EAX,EDX
pop EDI
pop ESI
pop EBX
pop EDX
ret 12
DIV128 ENDP

IDIV128 PROC, pDividend128:DWORD, Divisor:DWORD, pQuotient128:DWORD
; How to make this work?
ret 12
IDIV128 ENDP

END

If you found this helpful in anyway please help the project by helping code the Signed version of the functions.


Solution

  • First, the MUL64 function does not work 100%

    If you try to do 0xFFFFFFFFFFFFFFFF x 0xFFFFFFFFFFFFFFFF, the Hi 64-bit result is 0xFFFFFFFeFFFFFFFF, it should be 0xFFFFFFFFFFFFFFFe

    To fix this, the carry flag after the POPFD instruction should be added to EDX, the highest 32-bit part of the result. Now following Peter Cordes advice, remove the push and pops of EAX/ECX/EDX. Finally use setc BL and movzx EBX,BL to save the flag. Note: you cannot easily use xor EBX,EBX to zero it because xor effects the flags. We use movzx because its faster than add BL,0xFF and add is faster than adc based on Skylake specs.

    The Result:

    MUL64 PROC, A:QWORD, B:QWORD, pu128:DWORD
    push EBX
    push EDI
    mov EDI,pu128
    ; LO(A) * LO(B)
    mov EAX,DWORD PTR A
    mov EDX,DWORD PTR B
    mul EDX
    mov [EDI],EAX ; Save the partial product.
    mov ECX,EDX
    ; LO(A) * HI(B)
    mov EAX,DWORD PTR A
    mov EDX,DWORD PTR B+4
    mul EDX
    add EAX,ECX
    adc EDX,0
    mov EBX,EAX
    mov ECX,EDX
    ; HI(A) * LO(B)
    mov EAX,DWORD PTR A+4
    mov EDX,DWORD PTR B
    mul EDX
    add EAX,EBX
    adc ECX,EDX
    setc BL ; Save carry.
    movzx EBX,BL ; Zero-Extend carry.
    mov [EDI+4],EAX ; Save the partial product.
    ; HI(A) * HI(B)
    mov EAX,DWORD PTR A+4
    mov EDX,DWORD PTR B+4
    mul EDX
    add EDX,EBX ; Add carry from above.
    add EAX,ECX
    adc EDX,0
    mov [EDI+8],EAX ; Save the partial product.
    mov [EDI+12],EDX ; Save the partial product.
    pop EDI
    pop EBX
    ret 20
    MUL64 ENDP
    

    Now, to make a signed version of the function use this formula:

    my128.Hi -= (((A < 0) ? B : 0) + ((B < 0) ? A : 0));

    The Result:

    IMUL64 PROC, A:SQWORD, B:SQWORD, pi128:DWORD
    push EBX
    push EDI
    mov EDI,pi128
    ; LO(A) * LO(B)
    mov EAX,DWORD PTR A
    mov EDX,DWORD PTR B
    mul EDX
    mov [EDI],EAX ; Save the partial product.
    mov ECX,EDX
    ; LO(A) * HI(B)
    mov EAX,DWORD PTR A
    mov EDX,DWORD PTR B+4
    mul EDX
    add EAX,ECX
    adc EDX,0
    mov EBX,EAX
    mov ECX,EDX
    ; HI(A) * LO(B)
    mov EAX,DWORD PTR A+4
    mov EDX,DWORD PTR B
    mul EDX
    add EAX,EBX
    adc ECX,EDX
    setc BL ; Save carry.
    movzx EBX,BL ; Zero-Extend carry.
    mov [EDI+4],EAX ; Save the partial product.
    ; HI(A) * HI(B)
    mov EAX,DWORD PTR A+4
    mov EDX,DWORD PTR B+4
    mul EDX
    add EDX,EBX ; Add carry from above.
    add EAX,ECX
    adc EDX,0
    mov [EDI+8],EAX ; Save the partial product.
    mov [EDI+12],EDX ; Save the partial product.
    ; Signed version only:
    cmp DWORD PTR A+4,0
    jg zero_b
    jl use_b
    cmp DWORD PTR A,0
    jae zero_b
    use_b:
    mov ECX,DWORD PTR B
    mov EBX,DWORD PTR B+4
    jmp test_b
    zero_b:
    xor ECX,ECX
    mov EBX,ECX
    test_b:
    cmp DWORD PTR B+4,0
    jg zero_a
    jl use_a
    cmp DWORD PTR B,0
    jae zero_a
    use_a:
    mov EAX,DWORD PTR A
    mov EDX,DWORD PTR A+4
    jmp do_last_op
    zero_a:
    xor EAX,EAX
    mov EDX,EAX
    do_last_op:
    add EAX,ECX
    adc EDX,EBX
    sub [EDI+8],EAX
    sbb [EDI+12],EDX
    ; End of signed version!
    pop EDI
    pop EBX
    ret 20
    IMUL64 ENDP
    

    The DIV128 function should be fine (also probably the fastest) for getting a 128-bit quotient from a 32-bit divisor, but if you need to use a 128-bit divisor then look at this code https://www.codeproject.com/Tips/785014/UInt-Division-Modulus that has an example of using the Binary Shift Algorithm for 128-bit division. It could probably be 3x faster if written in assembly.

    To make a signed version of DIV128, first determine if the sign of the divisor and dividend are the same or different. If they are the same, then the result should be positive. If they are different, then the result should be negative. So... Make the dividend and divisor positive if they are negative and call DIV128, after that, negate the results if the signs were different.

    Here is some example code written in C++

    VOID IDIV128(PSDQWORD Dividend, PSDQWORD Divisor, PSDQWORD Quotient, PSDQWORD Remainder)
    {
        BOOL Negate;
        DQWORD DD, DV;
    
        Negate = TRUE;
    
        // Use local DD and DV so Dividend and Divisor dont get currupted.
        DD.Lo = Dividend->Lo;
        DD.Hi = Dividend->Hi;
        DV.Lo = Divisor->Lo;
        DV.Hi = Divisor->Hi;
    
        // if the signs are the same then: Negate = FALSE;
        if ((DD.Hi & 0x8000000000000000) == (DV.Hi & 0x8000000000000000)) Negate = FALSE;
    
        // Covert Dividend and Divisor to possitive if negative: (negate)
        if (DD.Hi & 0x8000000000000000) NEG128((PSDQWORD)&DD);
        if (DV.Hi & 0x8000000000000000) NEG128((PSDQWORD)&DV);
    
        DIV128(&DD, &DV, (PDQWORD)Quotient, (PDQWORD)Remainder);
    
        if (Negate == TRUE)
        {
            NEG128(Quotient);
            NEG128(Remainder);
        }
    }
    

    EDIT:

    Following Peter Cordes advice, we can optimize MUL64/IMUL64 even more. Look at the comments for specific changes being made. I have also replaced MUL64 PROC, A:QWORD, B:QWORD, pu128:DWORD with MUL64@20: and IMUL64@20: to eliminate unnecessary use of EBP that masm adds. I also optimized the sign-fixing work for IMUL64.

    The current .asm file for MUL64/IMUL64

    .MODEL flat, stdcall
    
    EXTERNDEF  MUL64@20     :PROC
    EXTERNDEF  IMUL64@20    :PROC
    
    .CODE
    
    MUL64@20:
    push EBX
    push EDI
    
    ;            -----------------
    ;            |     pu128     |
    ;            |---------------|
    ;            |       B       |
    ;            |---------------|
    ;            |       A       |
    ;            |---------------|
    ;            |  ret address  |
    ;            |---------------|
    ;            |      EBX      |
    ;            |---------------|
    ;    ESP---->|      EDI      |
    ;            -----------------
    
    A       TEXTEQU   <[ESP+12]>
    B       TEXTEQU   <[ESP+20]>
    pu128   TEXTEQU   <[ESP+28]>
    
    mov EDI,pu128
    ; LO(A) * LO(B)
    mov EAX,DWORD PTR A
    mul DWORD PTR B
    mov [EDI],EAX ; Save the partial product.
    mov ECX,EDX
    ; LO(A) * HI(B)
    mov EAX,DWORD PTR A
    mul DWORD PTR B+4
    add EAX,ECX
    adc EDX,0
    mov EBX,EAX
    mov ECX,EDX
    ; HI(A) * LO(B)
    mov EAX,DWORD PTR A+4
    mul DWORD PTR B
    add EAX,EBX
    adc ECX,EDX
    setc BL ; Save carry.
    mov [EDI+4],EAX ; Save the partial product.
    ; HI(A) * HI(B)
    mov EAX,DWORD PTR A+4
    mul DWORD PTR B+4
    add EAX,ECX
    movzx ECX,BL ; Zero-Extend saved carry from above.
    adc EDX,ECX
    mov [EDI+8],EAX ; Save the partial product.
    mov [EDI+12],EDX ; Save the partial product.
    pop EDI
    pop EBX
    ret 20
    
    IMUL64@20:
    push EBX
    push EDI
    
    ;            -----------------
    ;            |     pi128     |
    ;            |---------------|
    ;            |       B       |
    ;            |---------------|
    ;            |       A       |
    ;            |---------------|
    ;            |  ret address  |
    ;            |---------------|
    ;            |      EBX      |
    ;            |---------------|
    ;    ESP---->|      EDI      |
    ;            -----------------
    
    A       TEXTEQU   <[ESP+12]>
    B       TEXTEQU   <[ESP+20]>
    pi128   TEXTEQU   <[ESP+28]>
    
    mov EDI,pi128
    ; LO(A) * LO(B)
    mov EAX,DWORD PTR A
    mul DWORD PTR B
    mov [EDI],EAX ; Save the partial product.
    mov ECX,EDX
    ; LO(A) * HI(B)
    mov EAX,DWORD PTR A
    mul DWORD PTR B+4
    add EAX,ECX
    adc EDX,0
    mov EBX,EAX
    mov ECX,EDX
    ; HI(A) * LO(B)
    mov EAX,DWORD PTR A+4
    mul DWORD PTR B
    add EAX,EBX
    adc ECX,EDX
    setc BL ; Save carry.
    mov [EDI+4],EAX ; Save the partial product.
    ; HI(A) * HI(B)
    mov EAX,DWORD PTR A+4
    mul DWORD PTR B+4
    add EAX,ECX
    movzx ECX,BL ; Zero-Extend saved carry from above.
    adc EDX,ECX
    mov [EDI+8],EAX ; Save the partial product.
    mov [EDI+12],EDX ; Save the partial product.
    ; Signed version only:
    mov BL,BYTE PTR B+7
    and BL,80H
    jz zero_a
    mov EAX,DWORD PTR A
    mov EDX,DWORD PTR A+4
    jmp test_a
    zero_a:
    xor EAX,EAX
    mov EDX,EAX
    test_a:
    mov BL,BYTE PTR A+7
    and BL,80H
    jz do_last_op
    add EAX,DWORD PTR B
    adc EDX,DWORD PTR B+4
    do_last_op:
    sub [EDI+8],EAX
    sbb [EDI+12],EDX
    ; End of signed version!
    pop EDI
    pop EBX
    ret 20
    
    END