Sorting a REAL4 is straightforward - just treat it as a dword. That's what I thought. But it fails with negative numbers:
1999999 1.23456593368275280e+38
1999998 1.23456593368275280e+38
1999997 1.23456542662251271e+38
1999996 1.23456512238636865e+38
...
4 -1.25958426030349832e+32 <<<< that one is lower than #0
3 -5.56941002978919206e+31
2 -1.36275776159796797e+31
1 -5.94511516030574806e+30
0 -5.77247571756150055e+30
I assume it's by design :bg
Any known workaround compatible with a speedy innermost loop?
I think you have seen this link before.
Comparing floating point numbers (http://www.cygnus-software.com/papers/comparingfloats/Comparing%20floating%20point%20numbers.htm)
"A more general way of handling negative numbers is to adjust them so that they are lexicographically ordered as twos-complement integers instead of as signed magnitude integers. This is done by detecting negative numbers and subtracting them from 0x80000000."
Perfect, Michael, that is exactly what I was looking for :U
The method appears to work correctly. I tried to implement it in a version of the MASM32 ascending insertion string sort that I had previously modified to sort integers, but modifying it to handle REAL4 values proved to more work than I cared to do. So I used the CRT qsort function, and put the conversion code in the compare procedure.
;===================================================================================
include \masm32\include\masm32rt.inc
;===================================================================================
printf MACRO format:REQ, args:VARARG
IFNB <args>
invoke crt_printf, cfm$(format), args
ELSE
invoke crt_printf, cfm$(format)
ENDIF
EXITM <>
ENDM
;===================================================================================
.data
double REAL8 ?
.code
;===================================================================================
compare proc c p1:dword, p2:dword
mov ecx, p1
mov edx, p2
mov eax, [ecx]
.if sdword ptr eax < 0
mov eax, 80000000h
sub eax, [ecx]
.endif
mov ecx, [edx]
.if sdword ptr ecx < 0
mov ecx, 80000000h
sub ecx, [edx]
.endif
.if sdword ptr eax < ecx
mov eax, -1
.elseif sdword ptr eax > ecx
mov eax, 1
.else
xor eax, eax
.endif
ret
compare endp
;===================================================================================
start:
;===================================================================================
mov esi, alloc(400)
xor ebx, ebx
.while ebx < 100
invoke nrandom, 100
sub eax, 50
push eax
fild dword ptr [esp]
fldpi
fdiv
fstp dword ptr [esi+ebx*4]
pop eax
inc ebx
.endw
xor ebx, ebx
.while ebx < 100
fld dword ptr [esi+ebx*4]
fstp double
printf( "%f\n", double )
inc ebx
.endw
inkey "Press any key to continue..."
invoke crt_qsort, esi, 100, 4, addr compare
xor ebx, ebx
.while ebx < 100
fld dword ptr [esi+ebx*4]
fstp double
printf( "%f\n", double )
inc ebx
.endw
inkey "Press any key to exit..."
exit
;===================================================================================
end start
This version is my attempt to test over the full range of REAL4 values.
;===================================================================================
include \masm32\include\masm32rt.inc
;===================================================================================
printf MACRO format:REQ, args:VARARG
IFNB <args>
invoke crt_printf, cfm$(format), args
ELSE
invoke crt_printf, cfm$(format)
ENDIF
EXITM <>
ENDM
;===================================================================================
.data
double REAL8 ?
q dq 0
.code
;===================================================================================
compare proc c p1:dword, p2:dword
mov ecx, p1
mov edx, p2
mov eax, [ecx]
.if sdword ptr eax < 0
mov eax, 80000000h
sub eax, [ecx]
.endif
mov ecx, [edx]
.if sdword ptr ecx < 0
mov ecx, 80000000h
sub ecx, [edx]
.endif
.if sdword ptr eax < ecx
mov eax, -1
.elseif sdword ptr eax > ecx
mov eax, 1
.else
xor eax, eax
.endif
ret
compare endp
;===================================================================================
start:
;===================================================================================
N = 1000000
mov esi, alloc(N*4)
xor ebx, ebx
.while ebx < N-1
invoke nrandom, 0ffffffffh
;----------------------------------------------------------
; Clear the sign bit and set it on half of the iterations.
;----------------------------------------------------------
mov ecx, ebx
shl ecx, 31
and eax, 7fffffffh
or eax, ecx
;-------------------------------------------------------
; Load the result as a REAL4 and store it in the array.
;-------------------------------------------------------
push eax
fld dword ptr [esp]
fstp dword ptr [esi+ebx*4]
pop eax
inc ebx
.endw
;-----------------------------------------
; Reduce N here to force errors for test.
;-----------------------------------------
invoke crt_qsort, esi, N, 4, addr compare
inkey "Sorted, press any key to view samples..."
xor ebx, ebx
.while ebx < N
fld dword ptr [esi+ebx*4]
fstp double
printf( "%f\n", double)
add ebx, (N/200)
.endw
printf( "\n" )
inkey "Press any key to check sort order..."
xor ebx, ebx
.while ebx < N-1
fld dword ptr [esi+ebx*4]
fld dword ptr [esi+ebx*4+4]
fcompp
fstsw ax
fwait
sahf
jz @f ; ok if equal
ja @f ; ok if element at higher address larger
printf( "*" )
@@:
inc ebx
.endw
printf( "\n" )
inkey "Press any key to exit..."
exit
;===================================================================================
end start
I think some counter measurements (not countermeasures :lol) are in order.
compare_SSE1 proc C p1:PTR REAL4, p2:PTR REAL4
mov eax,p1
mov edx,p2
movss xmm0,real4 ptr [eax]
movss xmm1,real4 ptr [edx]
xor eax,eax
comiss xmm0,xmm1
setz al
sbb edx,edx
sbb edx,-1
dec eax
and eax,edx
ret
compare_SSE1 endp
compare_P6 proc C p1:PTR REAL4, p2:PTR REAL4
mov eax,p1
mov edx,p2
fld real4 ptr [eax]
fld real4 ptr [edx]
xor eax,eax
fcomip st(0),st(1)
ffree st
setz al
sbb edx,edx
sbb edx,-1
dec eax
and eax,edx
ret
compare_P6 endp
Actually, I found a better solution, which costs only a few cycles after the sort:
Since all negative numbers are sorted in the wrong order, I just find the first negative number, and swap the area between first negative and last item.
Timings are almost identical, and it allows to use a flag telling the algo "sort as dword integer" or "sort as REAL4".
I was expecting my brain-dead compare-as-integer procedure to do much worse than it does.
;==============================================================================
include \masm32\include\masm32rt.inc
.686
.MMX
.XMM
include \masm32\macros\timers.asm
;==============================================================================
.data
i1 dd 1
i2 dd 2
r1 REAL4 1.23
r2 REAL4 -2.34
.code
;==============================================================================
; I included this as a minimum (will not work reliably for negative values).
compare proc C p1:PTR DWORD, p2:PTR DWORD
mov ecx, p1
mov edx, p2
mov eax, [ecx]
sub eax, [edx]
ret
compare endp
;==============================================================================
compare_SSE1 proc C p1:PTR REAL4, p2:PTR REAL4
mov eax,p1
mov edx,p2
movss xmm0,real4 ptr [eax]
movss xmm1,real4 ptr [edx]
xor eax,eax
comiss xmm0,xmm1
setz al
sbb edx,edx
sbb edx,-1
dec eax
and eax,edx
ret
compare_SSE1 endp
;==============================================================================
compare_P6 proc C p1:PTR REAL4, p2:PTR REAL4
mov eax,p1
mov edx,p2
fld real4 ptr [eax]
fld real4 ptr [edx]
xor eax,eax
fcomip st(0),st(1)
ffree st
setz al
sbb edx,edx
sbb edx,-1
dec eax
and eax,edx
ret
compare_P6 endp
;==============================================================================
compare_asint proc C p1:PTR REAL4, p2:PTR REAL4
mov ecx, p1
mov edx, p2
mov eax, [ecx]
.if sdword ptr eax < 0
mov eax, 80000000h
sub eax, [ecx]
.endif
mov ecx, [edx]
.if sdword ptr ecx < 0
mov ecx, 80000000h
sub ecx, [edx]
.endif
.if sdword ptr eax < ecx
mov eax, -1
.elseif sdword ptr eax > ecx
mov eax, 1
.else
xor eax, eax
.endif
ret
compare_asint endp
;==============================================================================
;==============================================================================
start:
;==============================================================================
invoke Sleep, 3000
REPEAT 3
counter_begin 1000, HIGH_PRIORITY_CLASS
counter_end
print str$(eax)," cycles, empty",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke compare, addr i1, addr i2
counter_end
print str$(eax)," cycles, compare",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke compare_P6, addr r1, addr r2
counter_end
print str$(eax)," cycles, compare_P6",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke compare_SSE1, addr r1, addr r2
counter_end
print str$(eax)," cycles, compare_SSE1",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke compare_asint, addr r1, addr r2
counter_end
print str$(eax)," cycles, compare_asint",13,10,13,10
ENDM
inkey "Press any key to exit..."
exit
;==============================================================================
end start
Running on a P6:
0 cycles, empty
6 cycles, compare
13 cycles, compare_P6
12 cycles, compare_SSE1
13 cycles, compare_asint
0 cycles, empty
6 cycles, compare
13 cycles, compare_P6
12 cycles, compare_SSE1
13 cycles, compare_asint
0 cycles, empty
6 cycles, compare
13 cycles, compare_P6
12 cycles, compare_SSE1
13 cycles, compare_asint
Celeron M:
0 cycles, empty
3 cycles, compare
9 cycles, compare_P6
9 cycles, compare_SSE1
10 cycles, compare_asint
AMD Opteron 148
0 cycles, empty
3 cycles, compare
6 cycles, compare_P6
3 cycles, compare_SSE1
9 cycles, compare_asint
0 cycles, empty
3 cycles, compare
5 cycles, compare_P6
3 cycles, compare_SSE1
9 cycles, compare_asint
0 cycles, empty
3 cycles, compare
6 cycles, compare_P6
3 cycles, compare_SSE1
9 cycles, compare_asint
The problem with the negative numbers seems to be solved, see here (http://www.masm32.com/board/index.php?topic=12460.msg135639#msg135639). So far no bugs detected, and speed is also more than fine.
I put up a small testbed:
Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)
Initial loop count is 10; cpe=cycles per array element
** 160000 elements:
88450 kCycles, 566 cpe for nrQsortA
139969 kCycles, 896 cpe for CombSortA
129500 kCycles, 829 cpe for ArraySort
** 80000 elements:
40859 kCycles, 523 cpe for nrQsortA
64227 kCycles, 822 cpe for CombSortA
59053 kCycles, 756 cpe for ArraySort
** 40000 elements:
19181 kCycles, 491 cpe for nrQsortA
29722 kCycles, 761 cpe for CombSortA
27398 kCycles, 701 cpe for ArraySort
** 20000 elements:
9176 kCycles, 470 cpe for nrQsortA
13680 kCycles, 700 cpe for CombSortA
12904 kCycles, 661 cpe for ArraySort
** 10000 elements:
4324 kCycles, 443 cpe for nrQsortA
6280 kCycles, 643 cpe for CombSortA
5742 kCycles, 588 cpe for ArraySort
** 5000 elements:
2055 kCycles, 421 cpe for nrQsortA
2816 kCycles, 577 cpe for CombSortA
2549 kCycles, 522 cpe for ArraySort
** 2500 elements:
963 kCycles, 395 cpe for nrQsortA
1229 kCycles, 503 cpe for CombSortA
1161 kCycles, 476 cpe for ArraySort
** 1250 elements:
450 kCycles, 369 cpe for nrQsortA
568 kCycles, 466 cpe for CombSortA
508 kCycles, 416 cpe for ArraySort
prescott w/htt
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
** 160000 elements:
86915 kCycles, 556 cpe for nrQsortA
142185 kCycles, 910 cpe for CombSortA
133368 kCycles, 854 cpe for ArraySort
** 80000 elements:
41284 kCycles, 528 cpe for nrQsortA
65963 kCycles, 844 cpe for CombSortA
60480 kCycles, 774 cpe for ArraySort
** 40000 elements:
20086 kCycles, 514 cpe for nrQsortA
30694 kCycles, 786 cpe for CombSortA
27904 kCycles, 714 cpe for ArraySort
** 20000 elements:
9919 kCycles, 508 cpe for nrQsortA
13585 kCycles, 696 cpe for CombSortA
15085 kCycles, 772 cpe for ArraySort
** 10000 elements:
4351 kCycles, 446 cpe for nrQsortA
6662 kCycles, 682 cpe for CombSortA
6034 kCycles, 618 cpe for ArraySort
** 5000 elements:
2244 kCycles, 460 cpe for nrQsortA
2851 kCycles, 584 cpe for CombSortA
2841 kCycles, 582 cpe for ArraySort
** 2500 elements:
965 kCycles, 395 cpe for nrQsortA
1288 kCycles, 528 cpe for CombSortA
1212 kCycles, 497 cpe for ArraySort
** 1250 elements:
464 kCycles, 381 cpe for nrQsortA
614 kCycles, 503 cpe for CombSortA
513 kCycles, 421 cpe for ArraySort