Search code examples
windows · assembly · 64-bit · masm · portable-executable

x64 Assembly Optimization


I am trying to optimize several assembly procedures for size; I am not concerned about speed. The optimizations I am familiar with are of the following kind:

;the following two lines (copy rsp into rbp, then add 50h to the copy)
    mov rbp, rsp
    add rbp, 50h
;can be changed to a single lea that computes rsp+50h directly into rbp
;(unlike add, lea leaves the flags unchanged)
    lea rbp, [rsp+50h]

What other optimizations can I use to reduce the number of bytes in the following procedure? I am not asking anyone to fully optimize this procedure, just to point out where I can improve it.

;-----------------------------------------------------------------------
; asmGetProc - find an exported function by the hash of its name
;
; In:    rcx = hash of the wanted export name (compared with asmHsh output)
;        rdx = DllBase (address of the module's IMAGE_DOS_HEADER)
; Out:   rax = address of the matching export (DllBase + function RVA),
;              or 0 if the module has no export directory or no name matches
; Saved: rcx, rdx, r8, r9, r10, rbx are pushed on entry and restored on exit
; Calls: asmHsh, which must leave rdx, rbx, r8, r9 and r10 unchanged
; Note:  the hard-coded offsets assume the IMAGE_NT_HEADERS64 (PE32+) layout;
;        forwarded exports are not detected
;
; Register roles:
;   rbx = wanted hash
;   r8  = pointer to IMAGE_EXPORT_DIRECTORY
;   r9  = name index (counts down from NumberOfNames), later the ordinal
;   r10 = AddressOfNames, then AddressOfNameOrdinals, then AddressOfFunctions
;-----------------------------------------------------------------------
asmGetProc proc
        push    rcx                     ;wanted hash
        push    rdx                     ;DllBase
        push    r8
        push    r9
        push    rbx
        push    r10

        mov     rbx, rcx                ;rbx = wanted hash (rcx is reused for each name pointer)

        mov     r8d, [rdx+3ch]          ;IMAGE_DOS_HEADER->e_lfanew (DWORD offset to IMAGE_NT_HEADERS64)
        add     r8, rdx                 ;r8 = pointer to IMAGE_NT_HEADERS64
        add     r8, 88h                 ;18h - offset of OptionalHeader inside IMAGE_NT_HEADERS64
                                        ;70h - offset of DataDirectory inside IMAGE_OPTIONAL_HEADER64
                                        ;r8 points to DataDirectory[0], the export entry
        mov     r8d, [r8]               ;IMAGE_DATA_DIRECTORY->VirtualAddress (DWORD)
        test    r8d, r8d
        jz      not_found               ;RVA 0: the module has no export directory
        add     r8, rdx                 ;r8 = pointer to IMAGE_EXPORT_DIRECTORY

        mov     r9d, [r8+18h]           ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
        mov     r10d, [r8+20h]          ;IMAGE_EXPORT_DIRECTORY->AddressOfNames (RVA)
        add     r10, rdx                ;r10 = pointer to the array of name RVAs
for_each_function:
        sub     r9d, 1                  ;step to the next lower name index
        jb      not_found               ;carry set: every name was tried without a match

        lea     rcx, [r10 + 4 * r9]     ;rcx = &AddressOfNames[r9]
        mov     ecx, [rcx]              ;ecx = RVA of the name string
        add     rcx, rdx                ;rcx = pointer to the name string

        call    asmHsh                  ;rax = hash of this export name
        cmp     rax, rbx                ;same hash as the one requested?
        jnz     for_each_function       ;no - try the next name

        mov     r10d, [r8+24h]          ;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals (RVA)
        add     r10, rdx                ;r10 = pointer to the WORD ordinal array
        movzx   r9d, word ptr [r10+2*r9] ;r9 = AddressOfNameOrdinals[index]; zero-extended so no
                                        ;index bits above bit 15 leak into the ordinal

        mov     r10d, [r8+1ch]          ;IMAGE_EXPORT_DIRECTORY->AddressOfFunctions (RVA)
        add     r10, rdx                ;r10 = pointer to the array of function RVAs
        mov     eax, [r10+r9*4]         ;eax = AddressOfFunctions[ordinal] (RVA)
        add     rax, rdx                ;rax = address of the function

restore_regs:
        pop     r10
        pop     rbx
        pop     r9
        pop     r8
        pop     rdx
        pop     rcx

        ret                             ;rax = function address, or 0 if not found

not_found:
        xor     eax, eax                ;report failure as a null pointer
        jmp     restore_regs
asmGetProc endp

EDIT: Added asmHsh (my bad)

;-----------------------------------------------------------------------
; asmHsh - djb2-style string hash
;          (for each character: hash = hash * 33, then low byte ^= character)
;
; In:    rcx = pointer to a null-terminated function name
; Out:   rax = 64-bit hash; 5381 for an empty string
; Saved: rcx and rdx are pushed on entry and restored on exit
;-----------------------------------------------------------------------
asmHsh proc
        push    rcx
        push    rdx

        mov     rax, 5381d              ;initial hash value
hash_next:
        cmp     byte ptr [rcx], 00h     ;test BEFORE hashing so an empty string is never
        je      short hash_done         ;read past its terminator

        mov     rdx, rax
        shl     rax, 5
        add     rax, rdx                ;rax = hash * 32 + hash = hash * 33
        xor     al, [rcx]               ;mix the character into the low byte
        inc     rcx                     ;advance to the next character
        jmp     short hash_next

hash_done:
        pop     rdx
        pop     rcx
        ret

asmHsh endp

Solution

  • When optimizing assembly for space in 64-bit mode, one should: (1) use DWORD operand width when that suffices (fewer REX prefixes); (2) stick to the legacy x86 registers eax–edx / esi / edi / ebp, which need no REX prefix and therefore encode more tightly.

    Hopefully what's done below illustrates the idea. ML64 assembled the original routines to 135 bytes and the modified version to 103 bytes.

    Examples of changes: (1) used rbp / rsi / rdi instead of r8 / r9 / r10; (2) shrank instruction sequences that could be expressed with a single multi-component addressing mode; (3) used a DWORD dec where the data is known to be 32 bits wide; (4) used IMUL in place of shift/add.

    " ;- " is in front of removed lines " ;## delta " is appended to added lines, where delta is the byte difference the new code produced. No attempt was made to adjust the comments.

    ;djb2-style string hash: for each character, hash = hash * 33, then low byte ^= character
    ;in:   rcx = pointer to a null-terminated function name (restored before returning)
    ;out:  rax = 64-bit hash
    ;NOTE: the first character is hashed before any terminator test, so an empty
    ;      string hashes its terminator and the scan continues past it
    asmHsh proc
        push    rcx                     ;keep the caller's string pointer intact
    ;-  push rdx                        ;## -1  rdx is no longer needed as scratch

        mov     rax, 5381               ;initial hash value
    hash_loop:
    ;-  mov rdx, rax
    ;-  shl rax, 5
    ;-  add rax, rdx
        imul    rax, rax, 33            ;## -6  one multiply replaces copy/shift/add
        xor     al, [rcx]               ;mix the current character into the low byte
        inc     rcx                     ;advance to the next character
    ;-  mov dl, [rcx]
    ;-  cmp dl, 00h
        cmp     byte ptr [rcx], 0       ;## -2  test the next character directly in memory
        jnz     short hash_loop         ;loop until the next byte is the terminator

    ;-  pop rdx                         ;## -1
        pop     rcx
        ret
    asmHsh endp
    
    ;get procedure address
    ;in:    rcx = hash of the wanted export name (compared with the asmHsh result)
    ;       rdx = DllBase address (IMAGE_DOS_HEADER pointer)
    ;out:   rax = DllBase + RVA of the matching export
    ;saved: rcx, rdx, rbp, rsi, rbx, rdi (pushed on entry, popped on exit)
    ;roles: rbx = wanted hash, rbp = IMAGE_EXPORT_DIRECTORY pointer,
    ;       rsi = name index / ordinal, rdi = current export table pointer
    ;NOTE(review): the search loop has no "not found" exit - if no export name
    ;              hashes to the requested value, esi wraps below zero and the
    ;              loop reads outside the name table
    asmGetProc proc
    push rcx                    ;hash of the wanted function name
    push rdx                    ;DllBase address (IMAGE_DOS_HEADER pointer)
    ;-push r8                    ;pointer to IMAGE_EXPORT_DIRECTORY
    push rbp ;## -1             ;rbp takes over from r8: pointer to IMAGE_EXPORT_DIRECTORY
    ;-push r9                     ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
    push rsi ;## -1             ;rsi takes over from r9: name counter, later the ordinal

    push rbx                    ;rbx keeps the wanted hash across the asmHsh calls

    ;-push r10                    ;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNames
    push rdi ;## -1             ;rdi takes over from r10: AddressOfNames, then
                                ;AddressOfNameOrdinals, then AddressOfFunctions

    mov rbx, rcx                ;save the wanted hash in rbx

    ;-mov r8d, [rdx+3ch]          ;IMAGE_DOS_HEADER->e_lfanew (DWORD) (Offset to IMAGE_NT_HEADERS64)
    mov ebp, [rdx+3ch] ;## -1   ;ebp = e_lfanew (offset to IMAGE_NT_HEADERS64)
    ;-add r8, rdx                 ;add DllBase to the e_lfanew offset
    ;-add r8, 88h                 ;18h - IMAGE_NT_HEADERS64->OptionalHeader (IMAGE_OPTIONAL_HEADER64) 18h bytes
    ;-                            ;70h - skip entire IMAGE_OPTIONAL_HEADER64 structure
    ;-                            ;r8 points to the IMAGE_DATA_DIRECTORY structure
    ;-mov r8d, [r8]               ;IMAGE_DATA_DIRECTORY->VirtualAddress (DWORD)
    mov ebp, [rbp+rdx+88h] ;## -5 ;one load replaces add/add/mov: export directory RVA
    ;-add r8, rdx                 ;add DllBase to VirtualAddress (IMAGE_EXPORT_DIRECTORY)
    add rbp, rdx ;## 0          ;rbp = pointer to IMAGE_EXPORT_DIRECTORY

    ;-mov r9d, [r8+18h]           ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
    mov esi, [rbp+18h] ;## -1   ;esi = NumberOfNames
    ;-mov r10d, [r8+20h]          ;IMAGE_EXPORT_DIRECTORY->AddressOfNames (DWORD)
    mov edi, [rbp+20h] ;## -1   ;edi = AddressOfNames RVA
    ;-add r10, rdx                ;add DllBase to AddressOfNames (DWORD)
    add rdi, rdx ;## 0          ;rdi = pointer to the array of name RVAs
    for_each_function:
        ;decrement function name counter
    ;-  dec r9
        dec esi ;## -1

        ;load the RVA of the current name into ecx
    ;-  lea rcx, [r10 + 4 * r9]     ;AddressOfNames[i] - function string RVA (relative virtual address)
    ;-  mov ecx, [rcx]              ;r11d is the AddressOfName[r9] RVA (DWORD)
        mov ecx, [rdi + 4 * rsi] ;## -3 ;ecx = AddressOfNames[rsi], loaded without a separate lea
        add rcx, rdx                ;add DllBase to string RVA DWORD

        call asmHsh                 ;hash the function name
        cmp rax, rbx                ;compare the function name hash with the passed hash
    jnz for_each_function           ;jump to top of loop if not a match


    ;rbp - export directory
    ;rsi - function name counter, then the function ordinal
    ;rdi - AddressOfNameOrdinals / AddressOfFunctions array
    ;rax - final pointer to function
    ;-mov r10d, [r8+24h]          ;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals (DWORD)
    mov edi, [rbp+24h];## -1
    ;-add r10, rdx                ;add DllBase to AddressOfNameOrdinals DWORD
    add rdi, rdx; ## 0
    ;-mov r9w, [r10+2*r9]         ;AddressOfNameOrdinals[2*r9] - (2*r9 = 2 bytes * function name counter)
    movzx esi, word ptr [rdi+2*rsi] ;## -1 ;zero-extended load (same size as "mov si, ..."), so
                                    ;no counter bits above bit 15 remain in the ordinal

    ;-mov r10d, [r8+1ch]          ;IMAGE_EXPORT_DIRECTORY->AddressOfFunctions (DWORD)
    mov edi, [rbp+1ch] ;## -1
    ;-add r10, rdx                ;add DllBase to AddressOfFunctions DWORD
    add rdi, rdx ;## 0
    ;-mov eax, [r10+r9*4]         ;AddressOfFunctions[4*r9] - (4*r9 = 4 bytes * function ordinal)
    mov eax, [rdi+rsi*4] ; ## -1
    add rax, rdx                ;add DllBase to function ordinal RVA DWORD

    ;-pop r10
    pop rdi ; ## -1
    pop rbx
    ;-pop r9
    pop rsi
    ;-pop r8
    pop rbp ;## -1
    pop rdx
    pop rcx

    ret                         ;return from procedure
    asmGetProc endp