I have to develop a bubble sort algorithm with AVX instructions with single precision numbers in input. Can anyone help me to look for the best implementation?
I did a bubble sort version for SSE3:
global sort32
sort32: start
mov eax, [ebp+8] ; float* x
mov ebx, [ebp+12] ; int n
call sort
stop
; --------------------------------------------------
; Inserire qui il proprio algoritmo di ordinamento
; --------------------------------------------------
; eax = vector start address
; ebx = vector length
; --------------------------------------------------
sort:
mov edi, ebx ;contatore ciclo esterno
sub edi, 4
ciclo_esterno:
mov esi, 0 ;contatore ciclo interno
ciclo_interno:
movups xmm0, [eax+esi*4]
jmp verifica
; controllo se l'array da 4 non è già ordinato
verifica:
movaps xmm4, xmm0
shufps xmm4, xmm4, 10010000b
cmpleps xmm4, xmm0
movmskps edx, xmm4
cmp edx, 15
je incremento
movaps xmm1, xmm0
movhlps xmm1, xmm0
movaps xmm4, xmm0 ;confronto
minps xmm0, xmm1
maxps xmm1, xmm4
shufps xmm1, xmm1, 11100001b ;inverto i massimi e riconfronto
movaps xmm4, xmm0 ;confronto
minps xmm0, xmm1
maxps xmm1, xmm4
movaps xmm4, xmm0
shufps xmm4, xmm4, 11100001b ; confronto la coppia dei minimi
cmpltps xmm4, xmm0
movmskps edx, xmm4
cmp edx, 2
je cmp_max
shufps xmm0, xmm0, 11100001b ; non sono ordinati all'interno quindi scambio
cmp_max:
movaps xmm4, xmm1
shufps xmm4, xmm4, 11100001b ; confronto la coppia dei massimi
cmpltps xmm4, xmm1
movmskps edx, xmm4
cmp edx, 2
je unisci
shufps xmm1, xmm1, 11100001b ; non sono ordinati all'interno quindi scambio
unisci:
movlhps xmm0, xmm1
movups [eax+esi*4], xmm0
incremento:
inc esi
cmp esi, edi
jg aggiorna_edi
jmp ciclo_interno
aggiorna_edi:
dec edi
cmp edi, 0
jl endl
jmp ciclo_esterno
endl: ret
Most sorting algorithms do not generally lend themselves to SIMD implementation. You might want to consider using a network sort algorithm instead, which is relatively simple to implement with SIMD for small numbers of elements. For larger sorts you can use the network sort as the inner "kernel" of a larger non-SIMD sort algorithm.