I needed to sort some QWORDs in an X64 program and since there is a distinct lack of 64 bit sort routines out there I adapted one I had lying around to X64 mode.
// Syntax
SortArray DQ 1,7,2,5,3,8,4,6,9,0,0xFFFFFFFFFF
invoke RadixSortI64,offset SortArray,11
Pass64(Arg) macro
xor rax, rax
mov rcx, 256*8
:
and [rsp + rcx - 8], rax
and [rsp + rcx - 16], rax
and [rsp + rcx - 24], rax
and [rsp + rcx - 32], rax
sub rcx, 4*8
jnz <
mov rcx, rbx
:
movzx rax, B[rsi + rcx - 8 + Arg]
inc Q[rsp + rax*8]
sub rcx, 8
jnz <
xor rcx, rcx
xor rdx, rdx
:
mov rax, [rsp + rcx]
mov [rsp + rcx], rdx
add rdx, rax
add rcx, 8
cmp rcx, 256*8
jnz <
xor rcx, rcx
:
movzx rax, B[rsi + rcx + Arg]
mov rdx, [rsp + rax*8]
inc Q[rsp + rax*8]
mov rax, [rsi + rcx]
mov [rdi + rdx*8], rax
add rcx, 8
cmp rcx, rbx
jnz <
mov rax, rdi
mov rdi, rsi
mov rsi, rax
endm
RadixSortI64 FRAME pArray, dCount
uses rbx,rdi,rsi
mov rbx, [dCount]
mov rsi, [pArray]
shl rbx, 3
invoke VirtualAlloc, NULL, rbx, MEM_COMMIT or MEM_RESERVE, PAGE_READWRITE
test rax,rax
jz >>
mov rdi, rax
sub rsp, 256 * 8
Pass64(0)
Pass64(1)
Pass64(2)
Pass64(3)
Pass64(4)
Pass64(5)
Pass64(6)
Pass64(7)
add rsp, 256 * 8
invoke VirtualFree, rdi, 0, MEM_RELEASE
:
ret
endf
I think the original routine was written by Nan, I'm not completely sure though.
Edgar
and [rsp + rcx - 8], rax
and [rsp + rcx - 16], rax
and [rsp + rcx - 24], rax
and [rsp + rcx - 32], rax
wouldn't MOV be faster ?
Quote from: dedndave on January 28, 2012, 04:44:41 PM
and [rsp + rcx - 8], rax
and [rsp + rcx - 16], rax
and [rsp + rcx - 24], rax
and [rsp + rcx - 32], rax
wouldn't MOV be faster ?
Could be, I just adapted the algorithm from NaN's original. I didn't spend any time looking into optimizing it as it is a pretty minor part of my project and the array it needs to sort only has around 200 elements max. I'll have to try it both ways on a random array to be sure.
Edgar
This version should work in either x64 mode or x86 compatibility mode (/x86 on the GoAsm command line). For 32 bit builds change the S=8 to S=4.
S=8
Pass64(Arg) macro
xor rax, rax
mov rcx, 256*8
:
and [rsp + rcx - (S)], rax
and [rsp + rcx - (S*2)], rax
and [rsp + rcx - (S*3)], rax
and [rsp + rcx - (S*4)], rax
sub rcx, 4*S
jnz <
mov rcx, rbx
:
movzx rax, B[rsi + rcx - 8 + Arg]
inc S[rsp + rax*8]
sub rcx, 8
jnz <
xor rcx, rcx
xor rdx, rdx
:
mov rax, [rsp + rcx]
mov [rsp + rcx], rdx
add rdx, rax
add rcx, 8
cmp rcx, 256*8
jnz <
xor rcx, rcx
:
movzx rax, B[rsi + rcx + Arg]
mov rdx, [rsp + rax*8]
inc S[rsp + rax*8]
mov rax, [rsi + rcx]
mov [rdi + rdx*8], rax
add rcx, 8
cmp rcx, rbx
jnz <
mov rax, rdi
mov rdi, rsi
mov rsi, rax
endm
RadixSortI64 FRAME pArray, dCount
uses rbx,rdi,rsi
mov rbx, [dCount]
mov rsi, [pArray]
shl rbx, 3
invoke VirtualAlloc, NULL, rbx, MEM_COMMIT or MEM_RESERVE, PAGE_READWRITE
test rax,rax
jz >>
mov rdi, rax
sub rsp, 256 * 8
Pass64(0)
Pass64(1)
Pass64(2)
Pass64(3)
Pass64(4)
Pass64(5)
Pass64(6)
Pass64(7)
add rsp, 256 * 8
invoke VirtualFree, rdi, 0, MEM_RELEASE
:
ret
endf