What is the minimum X86 assembly needed for a spinlock

To implement a spinlock in assembly. Here I post a solution I came up with. Is it correct? Do you know a shorter one?

lock:

    mov ecx, 0
.loop:
    xchg [eax], ecx
    cmp ecx, 0
    je .loop

release:

    lock dec dword [eax]

eax is initialized to -1 (which means lock is free). This should work for many threads (not necessarily 2).

Solution

Shortest would probably be:

acquire:
    lock bts [eax],0
    jc acquire

release:
    mov [eax],0

For performance, it's best to use a "test, test and set" approach, and use pause, like this:

acquire:
    lock bts [eax],0    ;Optimistic first attempt
    jnc l2              ;Success if acquired
l1:
    pause
    test [eax],1        
    jne l1              ;Don't attempt again unless there's a chance

    lock bts [eax],0    ;Attempt to acquire
    jc l1               ;Wait again if failed

l2:

release:
    mov [eax],0

For debugging, you can add extra data to make it easier to detect problems, like this:

acquire:
    lock bts [eax],31         ;Optimistic first attempt
    jnc l2                    ;Success if acquired

    mov ebx,[CPUnumber]
    lea ebx,[ebx+0x80000000]
    cmp [eax],ebx             ;Is the lock acquired by this CPU?
    je .bad                   ; yes, deadlock
    lock inc dword [eax+4]    ;Increase "lock contention counter"
l1:
    pause
    test [eax],0x80000000        
    jne l1                    ;Don't attempt again unless there's a chance

    lock bts [eax],31         ;Attempt to acquire
    jc l1                     ;Wait again if failed

l2: mov [eax],ebx             ;Store CPU number

release:
    mov ebx,[CPUnumber]
    lea ebx,[ebx+0x80000000]
    cmp [eax],ebx             ;Is lock acquired, and is CPU same?
    jne .bad                  ; no, either not acquired or wrong CPU
    mov [eax],0