Search code examples
socketsassemblyx86-64nasmwin64

cannot return socket descriptor from WSASocketA


I have a .cpp file that makes a connection, works perfectly assembled with g++. I also have a 32-bit exe that makes a connection as well. The issue now is I want to use 64-bit assembly to make this connection. I was able to get as far as calling WSASocketA with all the right arguments (2,1,6,0,0,0) but I am simply not able to return a socket descriptor. I am getting -1 (0xFFFFFFFF). So then I noticed I can call WSAGetLastError which I did and it returns 10022. But my arguments are correct. I analyzed the objdump from my cpp connector which has the same exact arguments, in that order. And I am fully aware of using the correct calling convention for 64-bit in windows.

I am using nasm and tried both link and GoLinker.exe for my linkers which produce the same exact output. I am explicitly using ws2_32.lib from C:\Program Files (x86)\Windows Kits\10\Lib\10.0.19041.0\um\x64 when linking as well as kernel32.lib and ucrt.lib as well both in 64-bit. Is there some reason I am not able to get a socket descriptor. I'm pretty sure my code is correct:

BITS 64

SECTION .data

fmtd:
db "%d", 0x0a,0

fmts:
db "%s", 0x0a,0


SECTION .bss
;pStru: resq    1         ; This is a pointer for a dynamically created structure - malloc style
pStru:  resb    0x190     ; Shadow space for a statically created structure

SECTION .text
extern printf
extern malloc
global main
main:

;push rbp
;mov rbp, rsp

;sub   rsp, 0x28                 ; 40 bytes of shadow space
;and   rsp, 0FFFFFFFFFFFFFFF0h   ; Align the stack to a multiple of 16 bytes

; Parse PEB and find kernel32

xor rcx, rcx             ; RCX = 0
mov rax, [gs:rcx + 0x60] ; RAX = PEB
mov rax, [rax + 0x18]    ; RAX = PEB->Ldr
mov rsi, [rax + 0x20]    ; RSI = PEB->Ldr.InMemOrder
lodsq                    ; RAX = Second module
xchg rax, rsi            ; RAX = RSI, RSI = RAX
lodsq                    ; RAX = Third(kernel32)
mov rbx, [rax + 0x20]    ; RBX = Base address

; Parse kernel32 PE

xor r8, r8                 ; Clear r8
mov r8d, [rbx + 0x3c]      ; R8D = DOS->e_lfanew offset
mov rdx, r8                ; RDX = DOS->e_lfanew
add rdx, rbx               ; RDX = PE Header
mov r8d, [rdx + 0x88]      ; R8D = Offset export table
add r8, rbx                ; R8 = Export table
xor rsi, rsi               ; Clear RSI
mov esi, [r8 + 0x20]       ; RSI = Offset namestable
add rsi, rbx               ; RSI = Names table
xor rcx, rcx               ; RCX = 0
mov r9, 0x41636f7250746547 ; GetProcA

; Loop through exported functions and find GetProcAddress

Get_Function:

inc rcx                    ; Increment the ordinal
xor rax, rax               ; RAX = 0
mov eax, [rsi + rcx * 4]   ; Get name offset
add rax, rbx               ; Get function name
cmp QWORD [rax], r9        ; GetProcA ?
jnz Get_Function
xor rsi, rsi               ; RSI = 0
mov esi, [r8 + 0x24]       ; ESI = Offset ordinals
add rsi, rbx               ; RSI = Ordinals table
mov cx, [rsi + rcx * 2]    ; Number of function
xor rsi, rsi               ; RSI = 0
mov esi, [r8 + 0x1c]       ; Offset address table
add rsi, rbx               ; ESI = Address table
xor rdx, rdx               ; RDX = 0
mov edx, [rsi + rcx * 4]   ; EDX = Pointer(offset)
add rdx, rbx               ; RDX = GetProcAddress
mov rdi, rdx               ; Save GetProcAddress in RDI

; Use GetProcAddress to find the address of LoadLibrary

mov rcx, 0x41797261          ; aryA
push rcx                     ; Push on the stack
mov rcx, 0x7262694c64616f4c  ; LoadLibr
push rcx                     ; Push on stack
mov rdx, rsp                 ; LoadLibraryA
mov rcx, rbx                 ; kernel32.dll base address (rbx never changes so we could use it later for CreateProcessA)
sub rsp, 0x20                ; Allocate stack space for function call
call rdi                     ; Call GetProcAddress
add rsp, 0x20                ; Cleanup allocated stack space
mov rsi, rax                 ; LoadLibrary saved in RSI


;getws2_32:
mov rcx, 0x6c6c                 ; ll
push rcx                        ; Push on the stack
mov rcx, 0x642e32335f327377     ; d.32_2sw
push rcx                        ; Push on the stack
mov rcx, rsp                    ; ws2_32.dll
sub rsp, 0x20                   ; Allocate stack space for function call
call rsi                        ; call Loadlibrary (stored in rsi) and find ws2_32.dll
add rsp, 0x20                   ; Cleanup allocated stack space
mov r15, rax                    ; base address of ws2_32.dll saved in local variable r15 (winsock handle)


;getWSAStartup:
mov rcx, 0x7075                  ; pu
push rcx                         ; Push on the stack
mov rcx, 0x7472617453415357      ; tratSASW
push rcx                         ;Push on the stack
mov rdx, rsp                     ; copy WSAStartup from stack to 2nd argument (rdx is the 2nd arg)
mov rcx, r15                     ; winsock handler
sub rsp, 0x20                   ; Allocate stack space for function call
call rdi                         ; GetProcAddress(ws2_32.dll, WSAStartup)
add rsp, 0x20                   ; Cleanup allocated stack space
mov r14, rax                     ; ws2_32.WSAStartup saved in r14


;callWSAStartUp:
; malloc style just uncomment resq 0x190 for pStru in .bss
;mov     rcx, 0x198                    ; size of the structure
;call    malloc                     ; get the memory allocated
;mov     qword [ pStru ], rax       ; store the address in the pointer
;xor rdx, rdx
;lea rdx, [ pStru ] ; pointer to our WSAData structure
;xor rcx, rcx
;mov cx,  0x202
;call r14


; stack style
;xor rcx, rcx
;mov cx, 0x190         ; 0x190 works only when 0x28 bytes are subtracted, no more, no less!!!
;sub rsp,rcx
;lea rdx,[rsp]
;xor rcx, rcx
;mov cx,0x202
;sub rsp, 0x28
;call r14              ; call WSAStartup(MAKEWORD(2, 2), wsadata_pointer)
;add rsp, 0x28
;add rsp, 0x190


; static (.bss) style just uncomment resb 0x190 for pStru in .bss
xor rdx, rdx
lea rdx, [ pStru ] ; pointer to our WSAData structure
xor rcx, rcx
mov cx, 0x202          ; version 2,2 = 514 in decimal = 0x202 in hex, must be a word (2 bytes so only register cx is used)
sub rsp, 0x28
call r14
add rsp, 0x28

; print return value from WSAStartup (0 if no errors)
mov rdx, rax
mov rcx, fmtd
sub rsp, 0x20
call printf
add rsp, 0x20


;getWSASocketA:
xor rdx, rdx
xor rcx, rcx
mov rcx, 0x4174                  ; 'At'  original
push rcx                         ; push on stack
mov rcx, 0x656b636f53415357      ; 'ekcoSASW'
push rcx                         ; push on stack
mov rdx, rsp                     ; copy string of WSASocketA contents from stack to rdx (2nd arg for GetProcAddress)
mov rcx, r15                     ; socket handler ws2_32.dll
sub rsp, 0x30
call rdi                         ; GetProcAddress(ws2_32.dll, WSASocketA)
add rsp, 0x30                   ; Cleanup allocated stack space: standard is 32 bytes but 2 pushes = 16 more = 48 = 0x30
mov r13, rax                     ; save ws2_32.WSASocketA to r13


;callWSASocketA:
xor r9, r9                      ; lpProtocolInfo=NULL (uses itself from above: NULL)
push r9                         ; dwFlags=NULL
push r9                         ; g=NULL
xor r8, r8
mov r8, 0x6                     ; protocol=6
xor rdx, rdx
mov rdx, 0x1                    ; type=1
xor rcx, rcx
mov rcx, 0x2                    ; af=2
sub rsp, 0x28
call r13                        ; call WSASocketA
add rsp, 0x28                  ; Cleanup allocated stack space = 48 bytes = 0x30
mov r14, rax                    ; save socket descriptor of WSASocketA to r14

; print return value from WSASocketA (should be a socket descriptor)
xor rdx, rdx
mov rdx, r14
mov rcx, fmtd
sub rsp, 0x20
call printf
add rsp, 0x20

;getWSAGetLastError:
xor rcx, rcx
mov rcx, 0x726f7272457473        ; rorrEts
push rcx
mov rcx, 0x614c746547415357      ; aLteGSASW
push rcx                         ;Push on the stack
mov rdx, rsp                     ; copy WSAGetLastError from stack to 2nd argument (rdx is the 2nd arg)
mov rcx, r15                     ; winsock handler
sub rsp, 0x30                   ; Allocate stack space for function call
call rdi                         ; GetProcAddress(ws2_32.dll, WSAStartup)
add rsp, 0x30                   ; Cleanup allocated stack space
mov r11, rax                     ; ws2_32.WSAGetLastError saved in r11

;callWSAGetLastError:
call r11

; print return value from WSAGetLastError (which should be an error number if WSASocketA failed)
xor rdx, rdx
mov rdx, rax
mov rcx, fmtd
sub rsp, 0x20
call printf
add rsp, 0x20

When completed, rax has a -1 instead of a 264 or 256 as my cpp connector does. Any ideas? I did debug this for the last few days all day nonstop. My program does not crash, it simply does not get a socket descriptor. Thanks.

Using GoLink.exe:

nasm -f win64 connect64.s
c:\Golink\GoLink.exe /console /entry main kernel32.dll msvcrt.dll ws2_32.dll connect64.obj /fo connect64.exe && connect64.exe

OR

nasm -f win64 connect64.s && link connect64.obj /SUBSYSTEM:CONSOLE /OUT:connect64.exe /ENTRY:main "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.19041.0\um\x64\WS2_32.LIB" "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.19041.0\um\x64\KERNEL32.LIB" "C:\Program Files (x86)\Windows Kits\10\Lib\10.0.19041.0\ucrt\x64\UCRT.LIB" "legacy_stdio_definitions.lib" /LARGEADDRESSAWARE:NO && connect64.exe

Solution

  • The Windows ABI has three requirements that are relevant to this code:

    • rsp must be 16-byte aligned before the call.
    • There must be 32 bytes of free space at the top of the stack before the call, which can be freely used by the called function.
    • The first 4 function parameters are in rcx, rdx, r8, and r9, and the remaining parameters are on the stack starting at rsp+0x20.

    When a procedure is called, the call pushes the 8-byte return address onto the stack. So each function has to adjust the stack by an odd multiple of 8 to realign it to a 16-byte boundary.

    The code shown pushes the last two arguments onto the stack and then after that subtracts 0x30 from the stack pointer, so the two arguments pushed are not at rsp+0x20 where they need to be. It should subtract 0x20 instead.

    A better solution—the one used by compilers—is to subtract 0x38 from rsp at the beginning of the function and not change rsp again within the function. Initialize the two parameters using mov qword [rsp+0x20], 0; mov qword [rsp+0x28], 0 instead of push.