I am trying to optimize several assembly procedures for size; I am not concerned about speed. The optimizations I am familiar with are of the following kind:
;the following two lines
;(mov is 3 bytes and add with an 8-bit immediate is 4 bytes: 7 bytes in total)
mov rbp, rsp
add rbp, 50h
;can be changed to
;(5 bytes; unlike add, lea leaves the flags untouched, so the swap is only
;valid where nothing after it reads the flags produced by the add)
lea rbp, [rsp+50h]
What other optimizations can I use to reduce the number of bytes in the following procedure? I am not asking anyone to fully optimize this procedure, just to point out where I can improve.
;get procedure address
;Walks the export name table of a loaded PE image, hashes every exported name
;with asmHsh and stops at the first name whose hash equals the caller's value.
;in:  rcx - hash to look for (it is compared against asmHsh's return value)
;     rdx - DllBase address (IMAGE_DOS_HEADER pointer)
;out: rax - DllBase + AddressOfFunctions entry selected by the matching name
;rcx, rdx, r8, r9, r10 and rbx are pushed on entry and popped before ret.
;NOTE(review): the loop has no exit for "no name matches" - r9 simply keeps
;decrementing below zero. Presumably callers only ask for hashes that exist;
;confirm before reusing this elsewhere.
asmGetProc proc
push rcx ;caller's hash value (restored on exit)
push rdx ;DllBase address (IMAGE_DOS_HEADER pointer)
push r8 ;pointer to IMAGE_EXPORT_DIRECTORY
push r9 ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals[r9]
push rbx ;saved copy of the hash to look for
push r10 ;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNames
;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals
;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfFunctions
mov rbx, rcx ;keep the hash in rbx; rcx is reused below as the name pointer handed to asmHsh
mov r8d, [rdx+3ch] ;IMAGE_DOS_HEADER->e_lfanew (DWORD) (Offset to IMAGE_NT_HEADERS64)
add r8, rdx ;add DllBase to the e_lfanew offset
add r8, 88h ;18h - offset of OptionalHeader (IMAGE_OPTIONAL_HEADER64) within IMAGE_NT_HEADERS64
;70h - offset of the data directory array within IMAGE_OPTIONAL_HEADER64
;r8 points to the first IMAGE_DATA_DIRECTORY structure (the export entry)
mov r8d, [r8] ;IMAGE_DATA_DIRECTORY->VirtualAddress (DWORD)
add r8, rdx ;add DllBase to VirtualAddress (IMAGE_EXPORT_DIRECTORY)
mov r9d, [r8+18h] ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
mov r10d, [r8+20h] ;IMAGE_EXPORT_DIRECTORY->AddressOfNames (DWORD)
add r10, rdx ;add DllBase to AddressOfNames (DWORD)
for_each_function:
;decrement function name counter (names are visited from the last one down to index 0)
dec r9
;load the name RVA at the current index of AddressOfNames into ecx
lea rcx, [r10 + 4 * r9] ;address of AddressOfNames[r9] - function string RVA (relative virtual address)
mov ecx, [rcx] ;ecx is the AddressOfNames[r9] RVA (DWORD), zero-extended into rcx
add rcx, rdx ;add DllBase to string RVA DWORD
call asmHsh ;hash the function name (rax = hash, rcx and rdx come back unchanged)
cmp rax, rbx ;compare the function name hash with the passed hash
jnz for_each_function ;jump to top of loop if not a match
;r8 - export directory
;r9 - function name counter
;r10 - AddressOfNameOrdinals / AddressOfFunctions array
;rax - final pointer to function
mov r10d, [r8+24h] ;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals (DWORD)
add r10, rdx ;add DllBase to AddressOfNameOrdinals DWORD
mov r9w, [r10+2*r9] ;AddressOfNameOrdinals[r9] (2-byte entries); only the low 16 bits of r9 are replaced
mov r10d, [r8+1ch] ;IMAGE_EXPORT_DIRECTORY->AddressOfFunctions (DWORD)
add r10, rdx ;add DllBase to AddressOfFunctions DWORD
mov eax, [r10+r9*4] ;AddressOfFunctions[r9] (4-byte entries, r9 = function ordinal)
add rax, rdx ;add DllBase to the function RVA DWORD
pop r10
pop rbx
pop r9
pop r8
pop rdx
pop rcx
ret ;return from procedure
asmGetProc endp
EDIT: Added asmHsh (my bad)
;hash function (djb2)
;Starting from 5381, every byte of the string updates the hash as
;hash = hash * 33 xor byte (the shl/add pair below is the multiply by 33).
;in:  rcx - null terminated function name
;out: rax - 64-bit hash; rcx and rdx are saved and restored
;NOTE(review): the first byte is hashed before any terminator check, so for an
;empty string the terminator itself is hashed and scanning continues past it.
;Presumably names are never empty - confirm.
asmHsh proc
;rcx - null terminated function name
push rcx
push rdx
mov rax, 5381d
hl:
;rax = rax * 33, using rdx as scratch
mov rdx, rax
shl rax, 5
add rax, rdx
;fold the current byte into the low 8 bits of the hash
xor al, [rcx]
inc rcx
;check for null termination (looks at the next byte, which is not hashed when it is zero)
mov dl, [rcx]
cmp dl, 00h
jne short hl
pop rdx
pop rcx
ret
asmHsh endp
When optimizing assembly for space in 64-bit mode, one should: (1) use DWORD width when that suffices (fewer prefixes); (2) stick to the old x86 registers eax-edx / esi / edi / ebp (tighter encoding).
Hopefully what's done below illustrates the idea. ML64 assembled the original routines to 135 bytes and the modified version to 103 bytes.
Examples of changes: (1) used rbp / rsi / rdi instead of r8 / r9 / r10; (2) shrunk instruction sequences that could be accomplished via multi-component address modes; (3) used DWORD dec where the data is known to be 32 bits wide; (4) used IMUL in place of shift/add.
" ;- " is placed in front of removed lines, and " ;## delta " is appended to added lines, where delta is the byte difference the new code produced. No attempt was made to adjust the comments.
;hash function (djb2)
;Starting from 5381, every byte of the string updates the hash as
;hash = hash * 33 xor byte.
;in:  rcx - null terminated function name
;out: rax - 64-bit hash; rcx is saved and restored, rdx is no longer touched
;The first byte is hashed before any terminator check, so the string is
;expected to hold at least one character.
;The "mov eax" line was introduced after the byte totals quoted in the
;accompanying text were measured.
asmHsh proc
;rcx - null terminated function name
push rcx
;-push rdx ;## -1
;-mov rax, 5381d
mov eax, 5381d ;## -2 (a write to eax zero-extends into rax, so rax = 5381 either way)
hl:
;- mov rdx, rax
;- shl rax, 5
;- add rax, rdx
imul rax,rax,33 ;## -6
;fold the current byte into the low 8 bits of the hash
xor al, [rcx]
inc rcx
;check for null termination (looks at the next byte, which is not hashed when it is zero)
;-mov dl, [rcx]
;-cmp dl, 00h
cmp byte ptr [rcx], 00h ;## -2
jne short hl
;-pop rdx ;## -1
pop rcx
ret
asmHsh endp
;get procedure address
;Walks the export name table of a loaded PE image, hashes every exported name
;with asmHsh and stops at the first name whose hash equals the caller's value.
;in:  rcx - hash to look for (it is compared against asmHsh's return value)
;     rdx - DllBase address (IMAGE_DOS_HEADER pointer)
;out: rax - address of the matching function (DllBase + its AddressOfFunctions
;           entry), or 0 when no exported name produces the requested hash
;rcx, rdx, rbp, rsi, rbx and rdi are pushed on entry and popped before ret.
;The two "+2" lines (not-found exit) and the movzx were introduced after the
;byte totals quoted in the accompanying text were measured.
asmGetProc proc
push rcx ;caller's hash value (restored on exit)
push rdx ;DllBase address (IMAGE_DOS_HEADER pointer)
;-push r8 ;pointer to IMAGE_EXPORT_DIRECTORY
push rbp ;## -1
;-push r9 ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
push rsi ;## -1
;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals[rsi]
push rbx ;saved copy of the hash to look for
;-push r10 ;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNames
push rdi ;## -1
;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals
;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfFunctions
mov rbx, rcx ;keep the hash in rbx; rcx is reused below as the name pointer handed to asmHsh
;-mov r8d, [rdx+3ch] ;IMAGE_DOS_HEADER->e_lfanew (DWORD) (Offset to IMAGE_NT_HEADERS64)
mov ebp, [rdx+3ch] ;## -1
;-add r8, rdx ;add DllBase to the e_lfanew offset
;-add r8, 88h ;18h - IMAGE_NT_HEADERS64->OptionalHeader (IMAGE_OPTIONAL_HEADER64) 18h bytes
;- ;70h - skip entire IMAGE_OPTIONAL_HEADER64 structure
;- ;r8 points to the IMAGE_DATA_DIRECTORY structure
;-mov r8d, [r8] ;IMAGE_DATA_DIRECTORY->VirtualAddress (DWORD)
;one load replaces the three lines above: DllBase + e_lfanew + 88h is the
;first IMAGE_DATA_DIRECTORY entry, whose VirtualAddress DWORD is read here
mov ebp, [rbp+rdx+88h] ;## -5
;-add r8, rdx ;add DllBase to VirtualAddress (IMAGE_EXPORT_DIRECTORY)
add rbp, rdx ;## 0
;-mov r9d, [r8+18h] ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
mov esi, [rbp+18h] ;## -1
;-mov r10d, [r8+20h] ;IMAGE_EXPORT_DIRECTORY->AddressOfNames (DWORD)
mov edi, [rbp+20h] ;## -1
;-add r10, rdx ;add DllBase to AddressOfNames (DWORD)
add rdi, rdx ;## 0
for_each_function:
xor eax, eax ;## +2 (rax = 0 is what gets returned if the names run out)
;decrement function name counter (names are visited from the last one down to index 0)
;- dec r9
dec esi ;## -1
js gp_done ;## +2 (counter dropped below zero: every name was tried, none matched)
;load the name RVA at the current index of AddressOfNames into ecx
;- lea rcx, [r10 + 4 * r9] ;AddressOfNames[i] - function string RVA (relative virtual address)
;- mov ecx, [rcx] ;r11d is the AddressOfName[r9] RVA (DWORD)
mov ecx, [rdi + 4 * rsi] ;## -3
add rcx, rdx ;add DllBase to string RVA DWORD
call asmHsh ;hash the function name (rax = hash, rcx comes back unchanged)
cmp rax, rbx ;compare the function name hash with the passed hash
jnz for_each_function ;jump to top of loop if not a match
;rbp - export directory
;rsi - function name counter, then the function ordinal
;rdi - AddressOfNameOrdinals / AddressOfFunctions array
;rax - final pointer to function
;-mov r10d, [r8+24h] ;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals (DWORD)
mov edi, [rbp+24h];## -1
;-add r10, rdx ;add DllBase to AddressOfNameOrdinals DWORD
add rdi, rdx; ## 0
;-mov r9w, [r10+2*r9] ;AddressOfNameOrdinals[2*r9] - (2*r9 = 2 bytes * function name counter)
;movzx (same length as a 16-bit mov into si) clears the rest of rsi, so no
;bits of the name index can leak into the ordinal used as an index below
movzx esi, word ptr [rdi+2*rsi] ;## -1
;-mov r10d, [r8+1ch] ;IMAGE_EXPORT_DIRECTORY->AddressOfFunctions (DWORD)
mov edi, [rbp+1ch] ;## -1
;-add r10, rdx ;add DllBase to AddressOfFunctions DWORD
add rdi, rdx ;## 0
;-mov eax, [r10+r9*4] ;AddressOfFunctions[4*r9] - (4*r9 = 4 bytes * function ordinal)
mov eax, [rdi+rsi*4] ; ## -1
add rax, rdx ;add DllBase to the function RVA DWORD
gp_done:
;common exit: rax is either the function address or 0
;-pop r10
pop rdi ; ## -1
pop rbx
;-pop r9
pop rsi ;## -1
;-pop r8
pop rbp ;## -1
pop rdx
pop rcx
ret ;return from procedure
asmGetProc endp