The MASM Forum Archive 2004 to 2012

Project Support Forums => GoAsm Assembler and Tools => Topic started by: donkey on January 28, 2012, 04:32:04 PM

Title: x64 radix sort
Post by: donkey on January 28, 2012, 04:32:04 PM
I needed to sort some QWORDs in an X64 program and since there is a distinct lack of 64 bit sort routines out there I adapted one I had lying around to X64 mode.

// Syntax

SortArray DQ 1,7,2,5,3,8,4,6,9,0,0xFFFFFFFFFF

invoke RadixSortI64,offset SortArray,11


Pass64(Arg) macro
xor rax, rax
mov rcx, 256*8
:
and [rsp + rcx - 8], rax
and [rsp + rcx - 16], rax
and [rsp + rcx - 24], rax
and [rsp + rcx - 32], rax
sub rcx, 4*8
jnz <

mov rcx, rbx
:
movzx rax, B[rsi + rcx - 8 + Arg]
inc Q[rsp + rax*8]
sub rcx, 8
jnz <

xor rcx, rcx
xor rdx, rdx
:
mov rax, [rsp + rcx]
mov [rsp + rcx], rdx
add rdx, rax
add rcx, 8
cmp rcx, 256*8
jnz <

xor rcx, rcx
:
movzx rax, B[rsi + rcx + Arg]
mov rdx, [rsp + rax*8]
inc Q[rsp + rax*8]
mov rax, [rsi + rcx]
mov [rdi + rdx*8], rax
add rcx, 8
cmp rcx, rbx
jnz <

mov rax, rdi
mov rdi, rsi
mov rsi, rax

endm

RadixSortI64 FRAME pArray, dCount
uses rbx,rdi,rsi

mov rbx, [dCount]
mov rsi, [pArray]
shl rbx, 3
invoke VirtualAlloc, NULL, rbx, MEM_COMMIT or MEM_RESERVE, PAGE_READWRITE
test rax,rax
jz >>
mov rdi, rax
sub rsp, 256 * 8

Pass64(0)
Pass64(1)
Pass64(2)
Pass64(3)

Pass64(4)
Pass64(5)
Pass64(6)
Pass64(7)

add rsp, 256 * 8
invoke VirtualFree, rdi, 0, MEM_RELEASE
   :
ret
endf


I think the original routine was written by Nan, I'm not completely sure though.

Edgar
Title: Re: x64 radix sort
Post by: dedndave on January 28, 2012, 04:44:41 PM
and [rsp + rcx - 8], rax
and [rsp + rcx - 16], rax
and [rsp + rcx - 24], rax
and [rsp + rcx - 32], rax


wouldn't MOV be faster ?
Title: Re: x64 radix sort
Post by: donkey on January 28, 2012, 04:48:38 PM
Quote from: dedndave on January 28, 2012, 04:44:41 PM
and [rsp + rcx - 8], rax
and [rsp + rcx - 16], rax
and [rsp + rcx - 24], rax
and [rsp + rcx - 32], rax


wouldn't MOV be faster ?

Could be, I just adapted the algorithm from NaN's original. I didn't spend any time looking into optimizing it as it is a pretty minor part of my project and the array it needs to sort only has around 200 elements max. I'll have to try it both ways on a random array to be sure.

Edgar

This version should work in either x64 mode or x86 compatibility mode (/x86 on the GoAsm command line). For 32 bit builds change the S=8 to S=4.

S=8

Pass64(Arg) macro
xor rax, rax
mov rcx, 256*8
:
and [rsp + rcx - (S)], rax
and [rsp + rcx - (S*2)], rax
and [rsp + rcx - (S*3)], rax
and [rsp + rcx - (S*4)], rax
sub rcx, 4*S
jnz <

mov rcx, rbx
:
movzx rax, B[rsi + rcx - 8 + Arg]
inc S[rsp + rax*8]
sub rcx, 8
jnz <

xor rcx, rcx
xor rdx, rdx
:
mov rax, [rsp + rcx]
mov [rsp + rcx], rdx
add rdx, rax
add rcx, 8
cmp rcx, 256*8
jnz <

xor rcx, rcx
:
movzx rax, B[rsi + rcx + Arg]
mov rdx, [rsp + rax*8]
inc S[rsp + rax*8]
mov rax, [rsi + rcx]
mov [rdi + rdx*8], rax
add rcx, 8
cmp rcx, rbx
jnz <

mov rax, rdi
mov rdi, rsi
mov rsi, rax

endm

RadixSortI64 FRAME pArray, dCount
uses rbx,rdi,rsi

mov rbx, [dCount]
mov rsi, [pArray]
shl rbx, 3
invoke VirtualAlloc, NULL, rbx, MEM_COMMIT or MEM_RESERVE, PAGE_READWRITE
test rax,rax
jz >>
mov rdi, rax
sub rsp, 256 * 8

Pass64(0)
Pass64(1)
Pass64(2)
Pass64(3)

Pass64(4)
Pass64(5)
Pass64(6)
Pass64(7)

add rsp, 256 * 8
invoke VirtualFree, rdi, 0, MEM_RELEASE
   :
ret
endf