News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

x64 radix sort

Started by donkey, January 28, 2012, 04:32:04 PM

Previous topic - Next topic

donkey

I needed to sort some QWORDs in an X64 program and since there is a distinct lack of 64 bit sort routines out there I adapted one I had lying around to X64 mode.

// Syntax

SortArray DQ 1,7,2,5,3,8,4,6,9,0,0xFFFFFFFFFF

invoke RadixSortI64,offset SortArray,11


Pass64(Arg) macro
xor rax, rax
mov rcx, 256*8
:
and [rsp + rcx - 8], rax
and [rsp + rcx - 16], rax
and [rsp + rcx - 24], rax
and [rsp + rcx - 32], rax
sub rcx, 4*8
jnz <

mov rcx, rbx
:
movzx rax, B[rsi + rcx - 8 + Arg]
inc Q[rsp + rax*8]
sub rcx, 8
jnz <

xor rcx, rcx
xor rdx, rdx
:
mov rax, [rsp + rcx]
mov [rsp + rcx], rdx
add rdx, rax
add rcx, 8
cmp rcx, 256*8
jnz <

xor rcx, rcx
:
movzx rax, B[rsi + rcx + Arg]
mov rdx, [rsp + rax*8]
inc Q[rsp + rax*8]
mov rax, [rsi + rcx]
mov [rdi + rdx*8], rax
add rcx, 8
cmp rcx, rbx
jnz <

mov rax, rdi
mov rdi, rsi
mov rsi, rax

endm

RadixSortI64 FRAME pArray, dCount
uses rbx,rdi,rsi

mov rbx, [dCount]
mov rsi, [pArray]
shl rbx, 3
invoke VirtualAlloc, NULL, rbx, MEM_COMMIT or MEM_RESERVE, PAGE_READWRITE
test rax,rax
jz >>
mov rdi, rax
sub rsp, 256 * 8

Pass64(0)
Pass64(1)
Pass64(2)
Pass64(3)

Pass64(4)
Pass64(5)
Pass64(6)
Pass64(7)

add rsp, 256 * 8
invoke VirtualFree, rdi, 0, MEM_RELEASE
   :
ret
endf


I think the original routine was written by Nan, I'm not completely sure though.

Edgar
"Ahhh, what an awful dream. Ones and zeroes everywhere...[shudder] and I thought I saw a two." -- Bender
"It was just a dream, Bender. There's no such thing as two". -- Fry
-- Futurama

Donkey's Stable

dedndave

and [rsp + rcx - 8], rax
and [rsp + rcx - 16], rax
and [rsp + rcx - 24], rax
and [rsp + rcx - 32], rax


wouldn't MOV be faster ?

donkey

Quote from: dedndave on January 28, 2012, 04:44:41 PM
and [rsp + rcx - 8], rax
and [rsp + rcx - 16], rax
and [rsp + rcx - 24], rax
and [rsp + rcx - 32], rax


wouldn't MOV be faster ?

Could be, I just adapted the algorithm from NaN's original. I didn't spend any time looking into optimizing it as it is a pretty minor part of my project and the array it needs to sort only has around 200 elements max. I'll have to try it both ways on a random array to be sure.

Edgar

This version should work in either x64 mode or x86 compatibility mode (/x86 on the GoAsm command line). For 32 bit builds change the S=8 to S=4.

S=8

Pass64(Arg) macro
xor rax, rax
mov rcx, 256*8
:
and [rsp + rcx - (S)], rax
and [rsp + rcx - (S*2)], rax
and [rsp + rcx - (S*3)], rax
and [rsp + rcx - (S*4)], rax
sub rcx, 4*S
jnz <

mov rcx, rbx
:
movzx rax, B[rsi + rcx - 8 + Arg]
inc S[rsp + rax*8]
sub rcx, 8
jnz <

xor rcx, rcx
xor rdx, rdx
:
mov rax, [rsp + rcx]
mov [rsp + rcx], rdx
add rdx, rax
add rcx, 8
cmp rcx, 256*8
jnz <

xor rcx, rcx
:
movzx rax, B[rsi + rcx + Arg]
mov rdx, [rsp + rax*8]
inc S[rsp + rax*8]
mov rax, [rsi + rcx]
mov [rdi + rdx*8], rax
add rcx, 8
cmp rcx, rbx
jnz <

mov rax, rdi
mov rdi, rsi
mov rsi, rax

endm

RadixSortI64 FRAME pArray, dCount
uses rbx,rdi,rsi

mov rbx, [dCount]
mov rsi, [pArray]
shl rbx, 3
invoke VirtualAlloc, NULL, rbx, MEM_COMMIT or MEM_RESERVE, PAGE_READWRITE
test rax,rax
jz >>
mov rdi, rax
sub rsp, 256 * 8

Pass64(0)
Pass64(1)
Pass64(2)
Pass64(3)

Pass64(4)
Pass64(5)
Pass64(6)
Pass64(7)

add rsp, 256 * 8
invoke VirtualFree, rdi, 0, MEM_RELEASE
   :
ret
endf
"Ahhh, what an awful dream. Ones and zeroes everywhere...[shudder] and I thought I saw a two." -- Bender
"It was just a dream, Bender. There's no such thing as two". -- Fry
-- Futurama

Donkey's Stable