News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Sorting a REAL4 as integer

Started by jj2007, April 02, 2011, 12:46:26 AM

Previous topic - Next topic

jj2007

Sorting a REAL4 is straightforward - just treat it as a dword. That's what I thought. But it fails with negative numbers:
1999999 1.23456593368275280e+38
1999998 1.23456593368275280e+38
1999997 1.23456542662251271e+38
1999996 1.23456512238636865e+38
...
4       -1.25958426030349832e+32   <<<< that one is lower than #0
3       -5.56941002978919206e+31
2       -1.36275776159796797e+31
1       -5.94511516030574806e+30
0       -5.77247571756150055e+30


I assume it's by design :bg
Any known workaround compatible with a speedy innermost loop?

MichaelW

I think you have seen this link before.

Comparing floating point numbers

"A more general way of handling negative numbers is to adjust them so that they are lexicographically ordered as twos-complement integers instead of as signed magnitude integers. This is done by detecting negative numbers and subtracting them from 0x80000000."
eschew obfuscation

jj2007

Perfect, Michael, that is exactly what I was looking for :U

MichaelW

The method appears to work correctly. I tried to implement it in a version of the MASM32 ascending insertion string sort that I had previously modified to sort integers, but modifying it to handle REAL4 values proved to more work than I cared to do. So I used the CRT qsort function, and put the conversion code in the compare procedure.

;===================================================================================
    include \masm32\include\masm32rt.inc
;===================================================================================

printf MACRO format:REQ, args:VARARG
    IFNB <args>
        invoke crt_printf, cfm$(format), args
    ELSE
        invoke crt_printf, cfm$(format)
    ENDIF
    EXITM <>
ENDM

;===================================================================================
    .data
        double REAL8 ?
    .code
;===================================================================================

compare proc c p1:dword, p2:dword
    mov ecx, p1
    mov edx, p2
    mov eax, [ecx]
    .if sdword ptr eax < 0
        mov eax, 80000000h
        sub eax, [ecx]
    .endif
    mov ecx, [edx]
    .if sdword ptr ecx < 0
        mov ecx, 80000000h
        sub ecx, [edx]
    .endif
    .if sdword ptr eax < ecx
        mov eax, -1
    .elseif sdword ptr eax > ecx
        mov eax, 1
    .else
        xor eax, eax
    .endif
    ret
compare endp

;===================================================================================
start:
;===================================================================================
    mov esi, alloc(400)

    xor ebx, ebx
    .while ebx < 100
        invoke nrandom, 100
        sub eax, 50
        push eax
        fild dword ptr [esp]
        fldpi
        fdiv
        fstp dword ptr [esi+ebx*4]
        pop eax
        inc ebx
    .endw

    xor ebx, ebx
    .while ebx < 100
        fld dword ptr [esi+ebx*4]
        fstp double
        printf( "%f\n", double )
        inc ebx
    .endw

    inkey "Press any key to continue..."

    invoke crt_qsort, esi, 100, 4, addr compare

    xor ebx, ebx
    .while ebx < 100
        fld dword ptr [esi+ebx*4]
        fstp double
        printf( "%f\n", double )
        inc ebx
    .endw

    inkey "Press any key to exit..."
    exit

;===================================================================================
end start
eschew obfuscation

MichaelW

This version is my attempt to test over the full range of REAL4 values.

;===================================================================================
    include \masm32\include\masm32rt.inc
;===================================================================================

printf MACRO format:REQ, args:VARARG
    IFNB <args>
        invoke crt_printf, cfm$(format), args
    ELSE
        invoke crt_printf, cfm$(format)
    ENDIF
    EXITM <>
ENDM

;===================================================================================
    .data
        double REAL8 ?
        q dq 0
    .code
;===================================================================================

compare proc c p1:dword, p2:dword
    mov ecx, p1
    mov edx, p2
    mov eax, [ecx]
    .if sdword ptr eax < 0
        mov eax, 80000000h
        sub eax, [ecx]
    .endif
    mov ecx, [edx]
    .if sdword ptr ecx < 0
        mov ecx, 80000000h
        sub ecx, [edx]
    .endif
    .if sdword ptr eax < ecx
        mov eax, -1
    .elseif sdword ptr eax > ecx
        mov eax, 1
    .else
        xor eax, eax
    .endif
    ret
compare endp

;===================================================================================
start:
;===================================================================================

    N = 1000000

    mov esi, alloc(N*4)

    xor ebx, ebx
    .while ebx < N-1

        invoke nrandom, 0ffffffffh

        ;----------------------------------------------------------
        ; Clear the sign bit and set it on half of the iterations.
        ;----------------------------------------------------------

        mov ecx, ebx
        shl ecx, 31
        and eax, 7fffffffh
        or  eax, ecx

        ;-------------------------------------------------------
        ; Load the result as a REAL4 and store it in the array.
        ;-------------------------------------------------------

        push eax
        fld dword ptr [esp]
        fstp dword ptr [esi+ebx*4]
        pop eax
        inc ebx

    .endw

    ;-----------------------------------------
    ; Reduce N here to force errors for test.
    ;-----------------------------------------

    invoke crt_qsort, esi, N, 4, addr compare

    inkey "Sorted, press any key to view samples..."

    xor ebx, ebx
    .while ebx < N
        fld dword ptr [esi+ebx*4]
        fstp double
        printf( "%f\n", double)
        add ebx, (N/200)
    .endw
    printf( "\n"  )

    inkey "Press any key to check sort order..."

    xor ebx, ebx
    .while ebx < N-1
        fld dword ptr [esi+ebx*4]
        fld dword ptr [esi+ebx*4+4]
        fcompp
        fstsw ax
        fwait
        sahf
        jz  @f    ; ok if equal
        ja  @f    ; ok if element at higher address larger
        printf( "*" )
      @@:
        inc ebx
    .endw
    printf( "\n" )

    inkey "Press any key to exit..."
    exit

;===================================================================================
end start

eschew obfuscation

drizz

I think some counter measurements (not countermeasures  :lol) are in order.


compare_SSE1 proc C p1:PTR REAL4, p2:PTR REAL4

mov eax,p1
mov edx,p2
movss xmm0,real4 ptr [eax]
movss xmm1,real4 ptr [edx]
xor eax,eax
comiss xmm0,xmm1
setz al
sbb edx,edx
sbb edx,-1
dec eax
and eax,edx
ret

compare_SSE1 endp


compare_P6 proc C p1:PTR REAL4, p2:PTR REAL4

mov eax,p1
mov edx,p2
fld real4 ptr [eax]
fld real4 ptr [edx]
xor eax,eax
fcomip st(0),st(1)
ffree st
setz al
sbb edx,edx
sbb edx,-1
dec eax
and eax,edx
ret

compare_P6 endp
The truth cannot be learned ... it can only be recognized.

jj2007

Actually, I found a better solution, which costs only a few cycles after the sort:
Since all negative numbers are sorted in the wrong order, I just find the first negative number, and swap the area between first negative and last item.
Timings are almost identical, and it allows to use a flag telling the algo "sort as dword integer" or "sort as REAL4".

MichaelW

I was expecting my brain-dead compare-as-integer procedure to do much worse than it does.

;==============================================================================
    include \masm32\include\masm32rt.inc
    .686
    .MMX
    .XMM
    include \masm32\macros\timers.asm
;==============================================================================
    .data
        i1 dd 1
        i2 dd 2
        r1 REAL4 1.23
        r2 REAL4 -2.34
    .code
;==============================================================================

; I included this as a minimum (will not work reliably for negative values).

compare proc C p1:PTR DWORD, p2:PTR DWORD

    mov ecx, p1
    mov edx, p2
    mov eax, [ecx]
    sub eax, [edx]
    ret

compare endp

;==============================================================================

compare_SSE1 proc C p1:PTR REAL4, p2:PTR REAL4

mov eax,p1
mov edx,p2
movss xmm0,real4 ptr [eax]
movss xmm1,real4 ptr [edx]
xor eax,eax
comiss xmm0,xmm1
setz al
sbb edx,edx
sbb edx,-1
dec eax
and eax,edx
ret

compare_SSE1 endp

;==============================================================================

compare_P6 proc C p1:PTR REAL4, p2:PTR REAL4

mov eax,p1
mov edx,p2
fld real4 ptr [eax]
fld real4 ptr [edx]
xor eax,eax
fcomip st(0),st(1)
ffree st
setz al
sbb edx,edx
sbb edx,-1
dec eax
and eax,edx
ret

compare_P6 endp

;==============================================================================

compare_asint proc C p1:PTR REAL4, p2:PTR REAL4
    mov ecx, p1
    mov edx, p2
    mov eax, [ecx]
    .if sdword ptr eax < 0
        mov eax, 80000000h
        sub eax, [ecx]
    .endif
    mov ecx, [edx]
    .if sdword ptr ecx < 0
        mov ecx, 80000000h
        sub ecx, [edx]
    .endif
    .if sdword ptr eax < ecx
        mov eax, -1
    .elseif sdword ptr eax > ecx
        mov eax, 1
    .else
        xor eax, eax
    .endif
    ret
compare_asint endp

;==============================================================================
;==============================================================================
start:
;==============================================================================

    invoke Sleep, 3000

    REPEAT 3

      counter_begin 1000, HIGH_PRIORITY_CLASS
      counter_end
      print str$(eax)," cycles, empty",13,10

      counter_begin 1000, HIGH_PRIORITY_CLASS
        invoke compare, addr i1, addr i2
      counter_end
      print str$(eax)," cycles, compare",13,10

      counter_begin 1000, HIGH_PRIORITY_CLASS
        invoke compare_P6, addr r1, addr r2
      counter_end
      print str$(eax)," cycles, compare_P6",13,10

      counter_begin 1000, HIGH_PRIORITY_CLASS
        invoke compare_SSE1, addr r1, addr r2
      counter_end
      print str$(eax)," cycles, compare_SSE1",13,10

      counter_begin 1000, HIGH_PRIORITY_CLASS
        invoke compare_asint, addr r1, addr r2
      counter_end
      print str$(eax)," cycles, compare_asint",13,10,13,10

    ENDM

    inkey "Press any key to exit..."
    exit
;==============================================================================
end start


Running on a P6:

0 cycles, empty
6 cycles, compare
13 cycles, compare_P6
12 cycles, compare_SSE1
13 cycles, compare_asint

0 cycles, empty
6 cycles, compare
13 cycles, compare_P6
12 cycles, compare_SSE1
13 cycles, compare_asint

0 cycles, empty
6 cycles, compare
13 cycles, compare_P6
12 cycles, compare_SSE1
13 cycles, compare_asint

eschew obfuscation

jj2007

Celeron M:

0 cycles, empty
3 cycles, compare
9 cycles, compare_P6
9 cycles, compare_SSE1
10 cycles, compare_asint

drizz

AMD Opteron 148
0 cycles, empty
3 cycles, compare
6 cycles, compare_P6
3 cycles, compare_SSE1
9 cycles, compare_asint

0 cycles, empty
3 cycles, compare
5 cycles, compare_P6
3 cycles, compare_SSE1
9 cycles, compare_asint

0 cycles, empty
3 cycles, compare
6 cycles, compare_P6
3 cycles, compare_SSE1
9 cycles, compare_asint
The truth cannot be learned ... it can only be recognized.

jj2007

The problem with the negative numbers seems to be solved, see here. So far no bugs detected, and speed is also more than fine.

jj2007

I put up a small testbed:

Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)
Initial loop count is 10; cpe=cycles per array element

** 160000 elements:
88450   kCycles,        566 cpe for nrQsortA
139969  kCycles,        896 cpe for CombSortA
129500  kCycles,        829 cpe for ArraySort

** 80000 elements:
40859   kCycles,        523 cpe for nrQsortA
64227   kCycles,        822 cpe for CombSortA
59053   kCycles,        756 cpe for ArraySort

** 40000 elements:
19181   kCycles,        491 cpe for nrQsortA
29722   kCycles,        761 cpe for CombSortA
27398   kCycles,        701 cpe for ArraySort

** 20000 elements:
9176    kCycles,        470 cpe for nrQsortA
13680   kCycles,        700 cpe for CombSortA
12904   kCycles,        661 cpe for ArraySort

** 10000 elements:
4324    kCycles,        443 cpe for nrQsortA
6280    kCycles,        643 cpe for CombSortA
5742    kCycles,        588 cpe for ArraySort

** 5000 elements:
2055    kCycles,        421 cpe for nrQsortA
2816    kCycles,        577 cpe for CombSortA
2549    kCycles,        522 cpe for ArraySort

** 2500 elements:
963     kCycles,        395 cpe for nrQsortA
1229    kCycles,        503 cpe for CombSortA
1161    kCycles,        476 cpe for ArraySort

** 1250 elements:
450     kCycles,        369 cpe for nrQsortA
568     kCycles,        466 cpe for CombSortA
508     kCycles,        416 cpe for ArraySort

dedndave

prescott w/htt
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)

** 160000 elements:
86915   kCycles,        556 cpe for nrQsortA
142185  kCycles,        910 cpe for CombSortA
133368  kCycles,        854 cpe for ArraySort

** 80000 elements:
41284   kCycles,        528 cpe for nrQsortA
65963   kCycles,        844 cpe for CombSortA
60480   kCycles,        774 cpe for ArraySort

** 40000 elements:
20086   kCycles,        514 cpe for nrQsortA
30694   kCycles,        786 cpe for CombSortA
27904   kCycles,        714 cpe for ArraySort

** 20000 elements:
9919    kCycles,        508 cpe for nrQsortA
13585   kCycles,        696 cpe for CombSortA
15085   kCycles,        772 cpe for ArraySort

** 10000 elements:
4351    kCycles,        446 cpe for nrQsortA
6662    kCycles,        682 cpe for CombSortA
6034    kCycles,        618 cpe for ArraySort

** 5000 elements:
2244    kCycles,        460 cpe for nrQsortA
2851    kCycles,        584 cpe for CombSortA
2841    kCycles,        582 cpe for ArraySort

** 2500 elements:
965     kCycles,        395 cpe for nrQsortA
1288    kCycles,        528 cpe for CombSortA
1212    kCycles,        497 cpe for ArraySort

** 1250 elements:
464     kCycles,        381 cpe for nrQsortA
614     kCycles,        503 cpe for CombSortA
513     kCycles,        421 cpe for ArraySort