Search code examples
assemblynasmx86-16bootloaderosdev

How to read sector into memory and jump to it for OS?


I know this question gets asked a lot but every answer I find doesn't work for me. I'm trying to load stage 2 of my OS, located at the second sector of my image file(0x200)

This is the code I tried to use:


bits 16                             ; Starting at 16 bits
org 0x0                               ; And starting at 0

jmp main                            ; Hop to main!


; TODO: copy comment from prev. loader
; args: SI
print:
    lodsb                           ; Load the next/first character to AL
    or al, al                       ; Is it 0?
    jz donePrint                    ; Yes - Done.
    mov ah, 0eh                     ; No - keep going.
    int 10h                         ; Print character.
    jmp print                       ; Repeat
donePrint:
    ret                             ; Return


; todo: args
readSector:
    mov ah, 02h
    mov al, 1
    mov dl, 0x80
    mov ch, 0
    mov dh, 0
    mov cl, 2

    mov bx, 0x500


    int 13h
    jnc good
    jmp fail

main:
    ; First, setup some registers.
    cli                             ; Clear interrupts
    mov ax, 0x07C0                  ; Point all registers to segment
    mov ds, ax
    mov es, ax
    mov fs, ax
    mov gs, ax

    ; Create the stack(0x0000-0xFFFF).
    mov ax, 0x0000
    mov ss, ax                      ; Point SS to 0x0000
    mov sp, 0xFFFF                  ; Stack pointer at 0xFFFF
    sti                             ; Restore interrupts

    mov si, LOADING
    call print

    call readSector

    

    
fail:
    mov si, FAILURE_MSG
    call print
    
good:
    mov si, LOADOK 
    call print
    jmp 0x500

LOADING        db 0x0D, 0x0A, "Booting loader...", 0x0D, 0x0A, 0x00
FAILURE_MSG    db 0x0D, 0x0A, "ERROR: Press any key to reboot.", 0x0A, 0x00
LOADOK    db 0x0D, 0x0A, "load ok", 0x0A, 0x00



TIMES 510 - ($-$$) DB 0
DW 0xAA55

But it just bootloops. I tried other solutions to no avail. What am I doing wrong? If I need to update the question please tell me.

Thank you!

EDIT #1: According to Sep Roland's answer, I updated my code, but it is still not working. I'm putting the updated code here if it's any help. Also, if asked for it, I can post my 2nd stage code. It should be using 0x500 as org. NEW CODE:

bits 16                             ; Starting at 16 bits
org 0x0                             ; And starting at 0

jmp main                            ; Hop to main!


; TODO: copy comment from prev. loader
; args: SI
print:
    lodsb                           ; Load the next/first character to AL
    or al, al                       ; Is it 0?
    jz donePrint                    ; Yes - Done.
    mov ah, 0eh                     ; No - keep going.
    int 10h                         ; Print character.
    jmp print                       ; Repeat
donePrint:
    ret                             ; Return


; todo: args
readSector:
    mov ah, 02h
    mov al, 1
    mov ch, 0
    mov dh, 0
    mov cl, 2

    mov bx, 0x500


    int 13h
    jnc good
    jmp fail

main:
    ; First, setup some registers.
    cli                             ; Clear interrupts
    mov ax, 0x07C0                  ; Point all registers to segment
    mov ds, ax
    mov es, ax
    mov fs, ax
    mov gs, ax

    ; Create the stack(0x0000-0xFFFF).
    mov ax, 0x0000
    mov ss, ax                      ; Point SS to 0x0000
    mov sp, 0xFFFE                  ; Stack pointer at 0xFFFE
    sti                             ; Restore interrupts

    mov si, LOADING
    call print

    call readSector

    

    
fail:
    mov si, FAILURE_MSG
    call print
end:
    cli
    hlt
    jmp end
    
good:
    mov si, LOADOK 
    call print
    jmp 0x07C0:0x0500

LOADING        db 0x0D, 0x0A, "Booting loader...", 0x0D, 0x0A, 0x00
FAILURE_MSG    db 0x0D, 0x0A, "ERROR: Press any key to reboot.", 0x0A, 0x00
LOADOK    db 0x0D, 0x0A, "load ok", 0x0A, 0x00



TIMES 510 - ($-$$) DB 0
DW 0xAA55

EDIT #2: Posting second stage code including gdt.inc because someone mentioned LGDT may have been causing a problem:

MAIN CODE(SOME PARTS HAVE BEEN CUT OUT BUT THEY ARE NOT REQUIRED, like strings)

bits 16                                 ; We start at 16 bits

org 0x500                               ; We are loaded in at 0x500

jmp main                                ; Jump to main code.


; ----------------------------------------
; Includes
; ----------------------------------------
%include "include/stdio.inc"
%include "include/gdt.inc"
%include "include/a20.inc"


; ---------------------------------------
; Data and strings
; ---------------------------------------

stringhidden db "Not showing string.", 0x0D, 0x0A, 0x00
stringhidden db "Not showing string.", 0x0D, 0x0A, 0x00

; ---------------------------------------------------------------------
; main - 16-bit entry point
; Installing GDT, storing BIOS info, and enabling protected mode
; ---------------------------------------------------------------------

main:
    ; Our goal is jump to main32 to become 32-bit

    ; Setup segments and stack
    cli                                 ; Clear interrupts
    xor ax, ax                          ; Null segments AX, DS, and ES
    mov ds, ax
    mov es, ax
    mov ax, 0x9000                      ; Stack begins at 0x9000-0xFFFF
    mov ss, ax
    mov sp, 0xFFFF                      ; Stack pointer is 0xFFFF
    sti                                 ; Enable interrupts

    

    ; Install the GDT
    call installGDT                     ; Install the GDT

    ; Enable A20
    call enableA20_KKbrd_Out            ; Enable A20 through output port

    ; Print loading messages
    mov si, msg1
    call print16                        ; Print the message

    mov si, msg2                        ; A message
    call print16                        ; Print the message

    ; Enter protected mode
    cli                                 ; Clear interrupts
    mov eax, cr0                        ; Set bit 0 in CR0--ENTER protected mode
    or eax, 1
    mov cr0, eax

    jmp CODE_DESC:main32                ; Far jump to fix CS
    
    ; We can't re-enable interrupts because that would triple-fault. This will be fixed in main32.


bits 32                                 ; We are now 32 bit!

%include "include/stdio32.inc"

main32:
    ; Set registers up
    mov ax, 0x10                        ; Setup data segments to 0x10(data selector)
    mov ds, ax
    mov ss, ax
    mov es, ax
    mov esp, 90000h                     ; Stack begins from 90000h
    
    call clear32                        ; Clear screen
    mov ebx, MSGHIDDEN                   ; Setup params for our message
    call puts32                         ; Call puts32 to print

    cli                                 ; Clear interrupts
    hlt                                 ; Halt the processor

LGDT CODE:

%ifndef __GDT_INC_67343546FDCC56AAB872_INCLUDED__
%define __GDT_INC_67343546FDCC56AAB872_INCLUDED__

bits 16                     ; We are in 16-bit mode


; -----------------------------------------
; installGDT - install the GDT
; -----------------------------------------
installGDT:
    cli                     ; Clear interrupts
    pusha                   ; Save the registers
    lgdt [toc]              ; Load GDT into GDTR
    sti                     ; Re-enable interrupts
    popa                    ; Restore registers
    ret                     ; Return!


; ----------------------------------------
; Global Descriptor Table data
; ----------------------------------------

gdt_data:
    dd 0                    ; Null descriptor
    dd 0
    
    ; GDT code starts here
    dw 0FFFFh               ; Limit low
    dw 0                    ; Base low
    db 0                    ; Base middle
    db 10011010b            ; Access
    db 11001111b            ; Granularity
    db 0                    ; Base high

    ; GDT data starts here(mostly same as code, only difference is access)
    dw 0FFFFh               ; Limit low, again.
    dw 0                    ; Base low
    db 0                    ; Base middle
    db 10010010b            ; Access - different
    db 11001111b            ; Granularity
    db 0

gdt_end:
toc:
    dw gdt_end - gdt_data - 1
    dd gdt_data             ; Base of GDT

; Descriptor offsets names

%define NULL_DESC 0
%define CODE_DESC 0x8
%define DATA_DESC 0x10

; End of GDT code.
%endif ;__GDT_INC_67343546FDCC56AAB872_INCLUDED__

EDIT #3: Possible problems with stdio and stdio32 so putting those here

stdio.inc:

; ==============================================
; stdio.inc - IO routines
; Thanks to BrokenThorn Entertainment
; ==============================================

; First, show that we are defining stdio.inc
%ifndef __STDIO_INC_67343546FDCC56AAB872_INCLUDED__
%define __STDIO_INC_67343546FDCC56AAB872_INCLUDED__

; ------------------------------------------------
; Print16 - printing a null terminated string
; SI - 0 terminated string
; ------------------------------------------------

print16:
    pusha                                       ; Save registers for later
.loop1:
    lodsb                                       ; Load the next byte from the string into AL
    or al, al                                   ; Is AL 0?
    jz print16done                              ; Yes - we are done.
    mov ah, 0eh                                 ; No - print next character
    int 10h                                     ; Call BIOS
    jmp .loop1                                  ; Repeat!
print16done:
    popa                                        ; Restore registers
    ret                                         ; Return



%endif ;__STDIO_INC_67343546FDCC56AAB872_INCLUDED__

stdio32.inc:

; ==================================================
; stdio32.inc - Handles 32-bit graphics
; ==================================================

%ifndef __GFX_INC_67343546FDCC56AAB872_INCLUDED__
%define __GFX_INC_67343546FDCC56AAB872_INCLUDED__

bits 32                                         ; 32-bits

%define VIDEO_MEMORY 0xB8000                    ; Video memory address
%define COLS 80                                 ; Width of the screen
%define LINES 25                                ; Height of the string
%define CHARACTER_ATTRIBURE 63                  ; White text on Cyan background

_CurrentXPos db 0
_CurrentYPos db 0

; ---------------------------------------------------------
; char32 - Print a character to the screen(32-bit)
;   BL - Character to print
; ---------------------------------------------------------

char32:
    pusha                                       ; Save registers
    mov edi, VIDEO_MEMORY                       ; Get the pointer to the video memory

    ; Get current position
    xor eax, eax                                ; Zero-out EAX

    mov ecx, COLS*2                             ; Mode 7 has 2 bytes per character - and so COLS*2 bytes per line.
    mov al, byte [_CurrentYPos]                 ; Get Y position
    mul ecx                                     ; Multiply COLS * Y
    push eax                                    ; Save EAX--the multiplication

    mov al, byte [_CurrentXPos]                 ; Multiply _CurrentXPos by 2 because 2 bytes per char(Mode 7)
    mov cl, 2
    mul cl
    pop ecx                                     ; Pop Y*COLS result
    add eax, ecx
    
    xor ecx, ecx
    add edi, eax                                ; Add to base address

    ; Watch for a new line!
    cmp bl, 0x0A                                ; 0x0A - newline character.
    je .row                                     ; Jump to .row if newline char

    ; Print the character
    mov dl, bl                                  ; Get character
    mov dh, CHARACTER_ATTRIBURE                 ; Change DH to Character Attribute
    mov word [edi], dx                          ; Write to video memory
    
    ; Update next pos
    inc byte [_CurrentXPos]                     ; Go to next character
    ;cmp byte [_CurrentXPos], COLS               ; EOL?
    ;je .row                                     ; Yep - move to next row
    jmp .done                                   ; Nope - BAIL!
.row:
    ; Goto next row.
    mov byte [_CurrentXPos], 0                  ; Return to col 0
    inc byte [_CurrentYPos]                     ; Go to next row.

.done:
    ; Return
    popa
    ret


; ---------------------------------------------------------
; puts32 - print a null terminated string
;   EBX - String to print
; ---------------------------------------------------------

puts32:

    ; Store registers(EBX and EDI)
    pusha                                       ; Save registers
    push ebx                                    ; Copy string
    pop edi

.loop:
    
    mov bl, byte [edi]                          ; Get next character
    cmp bl, 0                                   ; Check if it's null
    je .done                                    ; It is - done printing.

    call char32                                 ; It isn't - print the character

    inc edi                                     ; Increment EDI for next character
    jmp .loop                                   ; Restart loop

.done:
    ; Update the hardware cursor

    mov bh, byte [_CurrentXPos]                 ; BH and BL are the params for movecursor
    mov bl, byte [_CurrentYPos]                 
    call movecursor                             ; Update cursor position

    popa                                        ; Restore registers
    ret                                         ; Return!

bits 32

; ---------------------------------------------------------
; movecursor - Move the cursor to an X and Y position
;   BH - X position
;   BL - Y position
; ---------------------------------------------------------

movecursor:
    
    pusha                                       ; Save registers

    ; Get current position(BH and BL are relative to the current position on screen, not memory)

    xor eax, eax                                ; Clear EAX
    mov ecx, COLS                               ; Store COLS in ECX for multiplication
    mov al, bh                                  ; Get Y position
    mul ecx                                     ; Multiply Y by cols
    add al, bl                                  ; Add X
    mov ebx, eax

    ; Set low byte index to VGA register

    mov al, 0x0f
    mov dx, 0x03D4
    out dx, al

    mov al, bl
    mov dx, 0x03D5
    out dx, al

    ; Do the same but for high byte

    xor eax, eax
    
    mov al, 0x0e
    mov dx, 0x03D4
    out dx, al

    mov al, bl
    mov dx, 0x03D5
    out dx, al

    popa                                        ; Restore registers
    ret                                         ; Return



; ---------------------------------------------------------
; clear32 - clearing the screen
; ---------------------------------------------------------

clear32:
    pusha                                       ; Save registers

    cld
    mov edi, VIDEO_MEMORY                       ; Set EDI to video memory
    mov cx, 2000                                
    mov ah, CHARACTER_ATTRIBURE                 ; Clear screen with character attribute
    mov al, ' '                                 ; Replace all chars with space
    rep stosw

    mov byte [_CurrentXPos], 0                  ; Reset X and Y position
    mov byte [_CurrentYPos], 0
    popa                                        ; Restore registers
    ret

    

; ---------------------------------------------------------
; gotoxy - Set X and Y position
;   AL - X position
;   AH - Y position
; ---------------------------------------------------------

gotoxy:
    pusha
    mov [_CurrentXPos], al                      ; Set X and Y position
    mov [_CurrentYPos], ah                      
    popa
    ret

%endif ;__STDIO_INC_67343546FDCC56AAB872_INCLUDED__

Solution

  • EDIT0 reviews the 1st stage
    EDIT1 reviews the 2nd stage
    EDIT2 reviews the included stdio32.inc

    [EDIT0]

    The sector that you load yourself, was loaded at offset 0x500 in the extra segment at 0x7C0.
    The jmp 0x500 instruction jumps to offset 0x500 in the code segment.
    There's no guarantee that CS==0x7C0. Use a far jump instead:

    jmp 0x07C0:0x0500
    

    mov dl, 0x80
    

    Are you sure about this drive number? It's always better to use the value that BIOS provided in the DL register when the bootloader is given control.


    mov sp, 0xFFFF
    

    A word-aligned stack pointer will be so much nicer!

    Use mov sp, 0xFFFE or even xor sp, sp (trusting wraparound).


    fail:
      mov si, FAILURE_MSG
      call print
    
    good:
      mov si, LOADOK 
      call print
      jmp 0x500
    

    In case loading the sector failed, you jump to fail, but after displaying the message you happily continue (fall through) with the code at good. You need to halt:

    fail:
        mov  si, FAILURE_MSG
        call print
    theEnd:
        cli
        hlt
        jmp  theEnd
    good:
        mov  si, LOADOK
        call print
        jmp  0x07C0:0x0500
    

    [EDIT1]

    xor ax, ax   ; Null segments AX, DS, and ES
    mov ds, ax
    mov es, ax
    

    You have applied corrections to the first stage bootloader. Control has successfully passed to 0x07C0:0x0500. Because the second stage uses an org 0x500 it is vital that the segment registers (at least DS) remain at 0x07C0. But the first thing I see is that you reload DS and ES with 0. This is not going to work since it creates a mismatch between the offsets (in accordance with the org) generated by the assembler and the offsets (relative DS) where the actual data resides.

    The first manifestation of this mismatch is with the lgdt [toc] instruction.
    All the data from the second stage resides in memory above the 0x8100 mark (0x7C00 + 0x0500).
    The toc label on the other hand will have been translated by the assembler to an offset address of little over 0x0500. With DS=0, this addresses memory much lower than that 0x8100 mark. There's simply no valid data to act upon, hence the crash (or similar)!

    The problem is not only with the lgdt [toc] instruction. mov si, msg1 and mov si, msg2 will fail in the same way, and mov ebx, MSGHIDDEN and dd gdt_data will fail on not being linear addresses at all. dw gdt_end - gdt_data - 1 is not impaired since the difference does not change when both magnitudes are wrong (in the same manner).

    What you should do is either keep the segment registers at 0x07C0 in accordance with the org 0x0500, or much better zero the segment registers already in the first stage bootloader and use an org 0x7C00 for the first stage and an org 0x8100 for the second stage. These settings will leave everything at the same place in memory, with a 768-bytes gap between the first and second stages which is something we don't see everyday.
    The much preferred way however is to zero the segment registers already in the first stage bootloader and use an org 0x7C00 for the first stage and an org 0x0600 for the second stage. This puts the second stage right behind the low memory BIOS variables. MS-DOS being our great example.

    A further review

    installGDT:
        cli                     ; Clear interrupts
        pusha                   ; Save the registers
        lgdt [toc]              ; Load GDT into GDTR
        sti                     ; Re-enable interrupts
        popa                    ; Restore registers
        ret                     ; Return!
    

    It is redundant to preserve the general purpose registers in this code. The lgdt instruction doesn't change any of them.


    ; Create the stack(0x0000-0xFFFF).
    mov ax, 0x0000
    mov ss, ax        ; Point SS to 0x0000
    mov sp, 0xFFFE    ; Stack pointer at 0xFFFE
    
    mov ax, 0x9000    ; Stack begins at 0x9000-0xFFFF
    mov ss, ax
    mov sp, 0xFFFF    ; Stack pointer is 0xFFFF
    
    mov ax, 0x10      ; Setup data segments to 0x10(data selector)
    mov ds, ax
    mov ss, ax
    mov es, ax
    mov esp, 90000h   ; Stack begins from 90000h
    

    (1) Don't use an odd value for SP.
    (2) Don't put anything between loading SS and ESP.
    (3) Don't use the word "begin" both for the low end and the high end of the stack.

    You are setting up the stack 3 times and each time in a different place and size! The first time it runs from 0x00000000 to 0x0000FFFD, the second time it runs from 0x00090000 to 0x0009FFFE, and the third time it runs from 0x00000000 to 0x0008FFFF.
    I would advice to setup the stack such that the real mode addresses correspond to the protected mode addresses. At least for the top 64KB.
    In the first stage use:

    mov  ax, 0x8000
    mov  ss, ax
    xor  sp, sp           ; 0x00080000 - 0x0008FFFF (64KB)
    

    In the second stage use:

    mov  ax, DATA_DESC
    mov  ss, ax
    mov  esp, 0x00090000  ; 0x00000000 - 0x0008FFFF (576KB)
    

    The first stage

    bits 16
    org  0x7C00
    
    jmp  main
    
    ; args: SI
    print:
        lodsb
        or   al, al
        jz   donePrint
        mov  bx, 0007h
        mov  ah, 0Eh
        int  10h
        jmp  print
    donePrint:
        ret
    
    main:
        cli
        xor  ax, ax
        mov  ds, ax
        mov  es, ax
        mov  fs, ax
        mov  gs, ax
        mov  ax, 0x8000    ; Stack between 0x8000:0x0000
        mov  ss, ax        ;           and 0x8000:0xFFFF (64KB)
        xor  sp, sp
        sti
    
        mov  si, LOADING
        call print
    
    readSector:
        mov  dh, 0
        mov  cx, 0002h
        mov  bx, 0x0600     ; Sector buffer at 0x0000:0x0600
        mov  ax, 0201h
        int  13h
        jnc  good
        
    fail:
        mov  si, FAILURE_MSG
        call print
    end:
        cli
        hlt
        jmp  end
        
    good:
        mov  si, LOADOK 
        call print
        jmp  0x0000:0x0600  ; Start second stage
    
    LOADING     db 13, 10, "Booting loader...", 13, 10, 0
    FAILURE_MSG db 13, 10, "ERROR: Press any key to reboot.", 10, 0
    LOADOK      db 13, 10, "load ok", 10, 0
    
    TIMES 510 - ($-$$) DB 0
    DW 0xAA55
    

    The second stage

    bits 16
    org  0x0600
    
    jmp  main
    ; ---------------------------------------
    ; Includes
    ; ---------------------------------------
    %include "include/stdio.inc"
    %include "include/gdt.inc"
    %include "include/a20.inc"
    ; ---------------------------------------
    ; Data and strings
    ; ---------------------------------------
    stringhidden db "Not showing string.", 0x0D, 0x0A, 0x00
    stringhidden db "Not showing string.", 0x0D, 0x0A, 0x00
    ; ---------------------------------------
    ; main - 16-bit entry point
    ; Installing GDT, storing BIOS info, and enabling protected mode
    ; ---------------------------------------
    
    main:
        call installGDT
        call enableA20_KKbrd_Out
        mov  si, msg1
        call print16
        mov  si, msg2
        call print16
    
        ; Enter protected mode
        cli
        mov  eax, cr0
        or   eax, 1
        mov  cr0, eax
    
        jmp  CODE_DESC:main32 ; Far jump to fix CS
        
        ; We can't re-enable interrupts because that would triple-fault. This will be fixed in main32.
    
    
    bits 32                   ; We are now 32 bit!
    
    %include "include/stdio32.inc"
    
    main32:
        ; Set registers up
        mov  ax, DATA_DESC
        mov  ds, ax
        mov  es, ax
        mov  fs, ax
        mov  gs, ax
        mov  ss, ax           ; Stack between 0x00000000
        mov  esp, 0x00090000  ;           and 0x0008FFFF (576KB)
    
        call clear32          ; Clear screen
        mov  ebx, MSGHIDDEN
        call puts32           ; Call puts32 to print
    
        cli
        hlt
    
        ...
    

    [EDIT2]

    @Sep Roland Your changes mostly work, but unfortunately there might be something wrong with my video code as well.. The system appears to stop typing after a little bit(24 bytes) and just gives up. I posted video code, but if this seems a like too much to ask I can stop here. Thank you!

    I reviewed your stdio32.inc and have found a number of errors in it!

    • The char32 code mentions in a comment "; Mode 7 has 2 bytes per character..." If indeed you are working on the monochrome video mode 7 then VIDEO_MEMORY should be set to 0xB0000 instead of 0xB8000.
    • The clear32 code moves 2000 in CX but rep stosw will be using ECX. The garbage in the high word of ECX can do a lot of harm. Use mov ecx, 2000.
    • The movecursor code receives X in BH and Y in BL, which is opposite of what I would have expected, you are calculating X * 80 + Y which needs to be Y * 80 + X, the forementioned addition uses the wrong size, and you out twice the low byte instead of doing low byte then high byte.

    The improved stdio32.inc

    ; ==================================================
    ; stdio32.inc - Handles 32-bit graphics
    ; ==================================================
    
    %ifndef __GFX_INC_67343546FDCC56AAB872_INCLUDED__
    %define __GFX_INC_67343546FDCC56AAB872_INCLUDED__
    
    bits 32
    
    %define VIDEO_MEMORY 0xB8000  ; Video memory address
    %define COLS 80               ; Width of the screen
    %define LINES 25              ; Height of the screen
    %define ATTRIB 0x3F           ; WhiteOnCyan
    _CurrentXPos db 0
    _CurrentYPos db 0
    
    ; ---------------------------------------------------------
    ; char32 - Print a character to the screen(32-bit)
    ;   BL - Character to print
    ; ---------------------------------------------------------
    
    char32:
        cmp   bl, 10
        je    .row
        push  eax
        push  edi
        movzx edi, byte [_CurrentYPos]
        imul  edi, COLS*2
        movzx eax, byte [_CurrentXPos]
        lea   edi, [VIDEO_MEMORY + edi + eax * 2]
        mov   al, bl
        mov   ah, ATTRIB
        mov   [edi], ax
        pop   edi
        pop   eax
        inc   byte [_CurrentXPos]
        cmp   byte [_CurrentXPos], COLS               ; EOL?
        je    .row
        ret
    .row:
        mov   byte [_CurrentXPos], 0
        inc   byte [_CurrentYPos]
        ret
    
    ; ---------------------------------------------------------
    ; puts32 - print a null terminated string
    ;   EBX - String to print
    ; ---------------------------------------------------------
    
    puts32:
    
        push  ebx
        push  edi
        mov   edi, ebx
        jmp   .start
    .loop:
        call  char32
        inc   edi
    .start:
        mov   bl, [edi]
        test  bl, bl
        jnz   .loop
        movzx ebx, word [_CurrentXPos] ; Load XPos and YPos together!
        call  movecursor               ; Update hardware cursor
        pop   edi
        pop   ebx
        ret
    
    ; ---------------------------------------------------------
    ; movecursor - Move the cursor to an X and Y position
    ;   BL - X position
    ;   BH - Y position
    ;   BH and BL are relative to the current position on screen
    ; ---------------------------------------------------------
    
    movecursor:
        pushad
        movzx eax, bh             ; BH * COLS + BL
        imul  eax, COLS
        movzx ebx, bl
        add   ebx, eax
        ; Set low byte index to VGA register
        mov   al, 0x0F
        mov   dx, 0x03D4
        out   dx, al
        mov   al, bl
        inc   dx
        out   dx, al
        ; Do the same but for high byte
        mov   al, 0x0E
        dec   dx
        out   dx, al
        mov   al, bh
        inc   dx
        out   dx, al
        popad
        ret
    
    ; ---------------------------------------------------------
    ; clear32 - clearing the screen
    ; ---------------------------------------------------------
    
    clear32:
        pushad
        mov   edi, VIDEO_MEMORY
        mov   ecx, 1000            ; 2000 words
        mov   eax, ((ATTRIB * 256 + 32) * 256 + ATTRIB) * 256 + 32
        rep stosd
        mov   [_CurrentXPos], cx   ; Reset XPos and YPos together!
        popad
        ret
    
    ; ---------------------------------------------------------
    ; gotoxy - Set X and Y position
    ;   AL - X position
    ;   AH - Y position
    ; ---------------------------------------------------------
    
    gotoxy:
        mov   [_CurrentXPos], ax   ; Set XPos and YPos together!
        ret
    
    %endif ;__STDIO_INC_67343546FDCC56AAB872_INCLUDED__
    

    While optimizing the code, it has become much shorter.
    What I am thinking now is that your code had gotten longer than 512 bytes and that the single-sector load from the first stage did not bring all of it into memory. That could certainly explain some partial string output that you are experiencing.

    I never remember (and my manual doesn't mention it), but for NASM is pusha the same as pushad when bits 32 is active?
    I deal with it this way:

    • Always writing pushad in 32-bit code.
    • Avoiding pushad and preferring to push individual registers as this is faster.