The MASM Forum Archive 2004 to 2012

General Forums => The Workshop => Topic started by: jj2007 on April 02, 2011, 12:46:26 AM

Title: Sorting a REAL4 as integer
Post by: jj2007 on April 02, 2011, 12:46:26 AM
Sorting a REAL4 is straightforward - just treat it as a dword. That's what I thought. But it fails with negative numbers:
1999999 1.23456593368275280e+38
1999998 1.23456593368275280e+38
1999997 1.23456542662251271e+38
1999996 1.23456512238636865e+38
...
4       -1.25958426030349832e+32   <<<< that one is lower than #0
3       -5.56941002978919206e+31
2       -1.36275776159796797e+31
1       -5.94511516030574806e+30
0       -5.77247571756150055e+30


I assume it's by design :bg
Any known workaround compatible with a speedy innermost loop?
Title: Re: Sorting a REAL4 as integer
Post by: MichaelW on April 02, 2011, 01:36:15 AM
I think you have seen this link before.

Comparing floating point numbers (http://www.cygnus-software.com/papers/comparingfloats/Comparing%20floating%20point%20numbers.htm)

"A more general way of handling negative numbers is to adjust them so that they are lexicographically ordered as twos-complement integers instead of as signed magnitude integers. This is done by detecting negative numbers and subtracting them from 0x80000000."
Title: Re: Sorting a REAL4 as integer
Post by: jj2007 on April 02, 2011, 07:38:16 AM
Perfect, Michael, that is exactly what I was looking for :U
Title: Re: Sorting a REAL4 as integer
Post by: MichaelW on April 02, 2011, 07:48:26 AM
The method appears to work correctly. I tried to implement it in a version of the MASM32 ascending insertion string sort that I had previously modified to sort integers, but modifying it to handle REAL4 values proved to more work than I cared to do. So I used the CRT qsort function, and put the conversion code in the compare procedure.

;===================================================================================
    include \masm32\include\masm32rt.inc
;===================================================================================

printf MACRO format:REQ, args:VARARG
    IFNB <args>
        invoke crt_printf, cfm$(format), args
    ELSE
        invoke crt_printf, cfm$(format)
    ENDIF
    EXITM <>
ENDM

;===================================================================================
    .data
        double REAL8 ?
    .code
;===================================================================================

compare proc c p1:dword, p2:dword
    mov ecx, p1
    mov edx, p2
    mov eax, [ecx]
    .if sdword ptr eax < 0
        mov eax, 80000000h
        sub eax, [ecx]
    .endif
    mov ecx, [edx]
    .if sdword ptr ecx < 0
        mov ecx, 80000000h
        sub ecx, [edx]
    .endif
    .if sdword ptr eax < ecx
        mov eax, -1
    .elseif sdword ptr eax > ecx
        mov eax, 1
    .else
        xor eax, eax
    .endif
    ret
compare endp

;===================================================================================
start:
;===================================================================================
    mov esi, alloc(400)

    xor ebx, ebx
    .while ebx < 100
        invoke nrandom, 100
        sub eax, 50
        push eax
        fild dword ptr [esp]
        fldpi
        fdiv
        fstp dword ptr [esi+ebx*4]
        pop eax
        inc ebx
    .endw

    xor ebx, ebx
    .while ebx < 100
        fld dword ptr [esi+ebx*4]
        fstp double
        printf( "%f\n", double )
        inc ebx
    .endw

    inkey "Press any key to continue..."

    invoke crt_qsort, esi, 100, 4, addr compare

    xor ebx, ebx
    .while ebx < 100
        fld dword ptr [esi+ebx*4]
        fstp double
        printf( "%f\n", double )
        inc ebx
    .endw

    inkey "Press any key to exit..."
    exit

;===================================================================================
end start
Title: Re: Sorting a REAL4 as integer
Post by: MichaelW on April 02, 2011, 11:42:51 AM
This version is my attempt to test over the full range of REAL4 values.

;===================================================================================
    include \masm32\include\masm32rt.inc
;===================================================================================

printf MACRO format:REQ, args:VARARG
    IFNB <args>
        invoke crt_printf, cfm$(format), args
    ELSE
        invoke crt_printf, cfm$(format)
    ENDIF
    EXITM <>
ENDM

;===================================================================================
    .data
        double REAL8 ?
        q dq 0
    .code
;===================================================================================

compare proc c p1:dword, p2:dword
    mov ecx, p1
    mov edx, p2
    mov eax, [ecx]
    .if sdword ptr eax < 0
        mov eax, 80000000h
        sub eax, [ecx]
    .endif
    mov ecx, [edx]
    .if sdword ptr ecx < 0
        mov ecx, 80000000h
        sub ecx, [edx]
    .endif
    .if sdword ptr eax < ecx
        mov eax, -1
    .elseif sdword ptr eax > ecx
        mov eax, 1
    .else
        xor eax, eax
    .endif
    ret
compare endp

;===================================================================================
start:
;===================================================================================

    N = 1000000

    mov esi, alloc(N*4)

    xor ebx, ebx
    .while ebx < N-1

        invoke nrandom, 0ffffffffh

        ;----------------------------------------------------------
        ; Clear the sign bit and set it on half of the iterations.
        ;----------------------------------------------------------

        mov ecx, ebx
        shl ecx, 31
        and eax, 7fffffffh
        or  eax, ecx

        ;-------------------------------------------------------
        ; Load the result as a REAL4 and store it in the array.
        ;-------------------------------------------------------

        push eax
        fld dword ptr [esp]
        fstp dword ptr [esi+ebx*4]
        pop eax
        inc ebx

    .endw

    ;-----------------------------------------
    ; Reduce N here to force errors for test.
    ;-----------------------------------------

    invoke crt_qsort, esi, N, 4, addr compare

    inkey "Sorted, press any key to view samples..."

    xor ebx, ebx
    .while ebx < N
        fld dword ptr [esi+ebx*4]
        fstp double
        printf( "%f\n", double)
        add ebx, (N/200)
    .endw
    printf( "\n"  )

    inkey "Press any key to check sort order..."

    xor ebx, ebx
    .while ebx < N-1
        fld dword ptr [esi+ebx*4]
        fld dword ptr [esi+ebx*4+4]
        fcompp
        fstsw ax
        fwait
        sahf
        jz  @f    ; ok if equal
        ja  @f    ; ok if element at higher address larger
        printf( "*" )
      @@:
        inc ebx
    .endw
    printf( "\n" )

    inkey "Press any key to exit..."
    exit

;===================================================================================
end start

Title: Re: Sorting a REAL4 as integer
Post by: drizz on April 02, 2011, 01:54:18 PM
I think some counter measurements (not countermeasures  :lol) are in order.


compare_SSE1 proc C p1:PTR REAL4, p2:PTR REAL4

mov eax,p1
mov edx,p2
movss xmm0,real4 ptr [eax]
movss xmm1,real4 ptr [edx]
xor eax,eax
comiss xmm0,xmm1
setz al
sbb edx,edx
sbb edx,-1
dec eax
and eax,edx
ret

compare_SSE1 endp


compare_P6 proc C p1:PTR REAL4, p2:PTR REAL4

mov eax,p1
mov edx,p2
fld real4 ptr [eax]
fld real4 ptr [edx]
xor eax,eax
fcomip st(0),st(1)
ffree st
setz al
sbb edx,edx
sbb edx,-1
dec eax
and eax,edx
ret

compare_P6 endp
Title: Re: Sorting a REAL4 as integer
Post by: jj2007 on April 02, 2011, 04:35:29 PM
Actually, I found a better solution, which costs only a few cycles after the sort:
Since all negative numbers are sorted in the wrong order, I just find the first negative number, and swap the area between first negative and last item.
Timings are almost identical, and it allows to use a flag telling the algo "sort as dword integer" or "sort as REAL4".
Title: Re: Sorting a REAL4 as integer
Post by: MichaelW on April 03, 2011, 08:09:37 AM
I was expecting my brain-dead compare-as-integer procedure to do much worse than it does.

;==============================================================================
    include \masm32\include\masm32rt.inc
    .686
    .MMX
    .XMM
    include \masm32\macros\timers.asm
;==============================================================================
    .data
        i1 dd 1
        i2 dd 2
        r1 REAL4 1.23
        r2 REAL4 -2.34
    .code
;==============================================================================

; I included this as a minimum (will not work reliably for negative values).

compare proc C p1:PTR DWORD, p2:PTR DWORD

    mov ecx, p1
    mov edx, p2
    mov eax, [ecx]
    sub eax, [edx]
    ret

compare endp

;==============================================================================

compare_SSE1 proc C p1:PTR REAL4, p2:PTR REAL4

mov eax,p1
mov edx,p2
movss xmm0,real4 ptr [eax]
movss xmm1,real4 ptr [edx]
xor eax,eax
comiss xmm0,xmm1
setz al
sbb edx,edx
sbb edx,-1
dec eax
and eax,edx
ret

compare_SSE1 endp

;==============================================================================

compare_P6 proc C p1:PTR REAL4, p2:PTR REAL4

mov eax,p1
mov edx,p2
fld real4 ptr [eax]
fld real4 ptr [edx]
xor eax,eax
fcomip st(0),st(1)
ffree st
setz al
sbb edx,edx
sbb edx,-1
dec eax
and eax,edx
ret

compare_P6 endp

;==============================================================================

compare_asint proc C p1:PTR REAL4, p2:PTR REAL4
    mov ecx, p1
    mov edx, p2
    mov eax, [ecx]
    .if sdword ptr eax < 0
        mov eax, 80000000h
        sub eax, [ecx]
    .endif
    mov ecx, [edx]
    .if sdword ptr ecx < 0
        mov ecx, 80000000h
        sub ecx, [edx]
    .endif
    .if sdword ptr eax < ecx
        mov eax, -1
    .elseif sdword ptr eax > ecx
        mov eax, 1
    .else
        xor eax, eax
    .endif
    ret
compare_asint endp

;==============================================================================
;==============================================================================
start:
;==============================================================================

    invoke Sleep, 3000

    REPEAT 3

      counter_begin 1000, HIGH_PRIORITY_CLASS
      counter_end
      print str$(eax)," cycles, empty",13,10

      counter_begin 1000, HIGH_PRIORITY_CLASS
        invoke compare, addr i1, addr i2
      counter_end
      print str$(eax)," cycles, compare",13,10

      counter_begin 1000, HIGH_PRIORITY_CLASS
        invoke compare_P6, addr r1, addr r2
      counter_end
      print str$(eax)," cycles, compare_P6",13,10

      counter_begin 1000, HIGH_PRIORITY_CLASS
        invoke compare_SSE1, addr r1, addr r2
      counter_end
      print str$(eax)," cycles, compare_SSE1",13,10

      counter_begin 1000, HIGH_PRIORITY_CLASS
        invoke compare_asint, addr r1, addr r2
      counter_end
      print str$(eax)," cycles, compare_asint",13,10,13,10

    ENDM

    inkey "Press any key to exit..."
    exit
;==============================================================================
end start


Running on a P6:

0 cycles, empty
6 cycles, compare
13 cycles, compare_P6
12 cycles, compare_SSE1
13 cycles, compare_asint

0 cycles, empty
6 cycles, compare
13 cycles, compare_P6
12 cycles, compare_SSE1
13 cycles, compare_asint

0 cycles, empty
6 cycles, compare
13 cycles, compare_P6
12 cycles, compare_SSE1
13 cycles, compare_asint

Title: Re: Sorting a REAL4 as integer
Post by: jj2007 on April 03, 2011, 08:16:41 AM
Celeron M:

0 cycles, empty
3 cycles, compare
9 cycles, compare_P6
9 cycles, compare_SSE1
10 cycles, compare_asint
Title: Re: Sorting a REAL4 as integer
Post by: drizz on April 03, 2011, 11:45:59 AM
AMD Opteron 148
0 cycles, empty
3 cycles, compare
6 cycles, compare_P6
3 cycles, compare_SSE1
9 cycles, compare_asint

0 cycles, empty
3 cycles, compare
5 cycles, compare_P6
3 cycles, compare_SSE1
9 cycles, compare_asint

0 cycles, empty
3 cycles, compare
6 cycles, compare_P6
3 cycles, compare_SSE1
9 cycles, compare_asint
Title: Re: Sorting a REAL4 as integer
Post by: jj2007 on April 06, 2011, 12:34:00 AM
The problem with the negative numbers seems to be solved, see here (http://www.masm32.com/board/index.php?topic=12460.msg135639#msg135639). So far no bugs detected, and speed is also more than fine.
Title: Re: Sorting a REAL4 as integer
Post by: jj2007 on April 06, 2011, 09:52:10 AM
I put up a small testbed:

Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)
Initial loop count is 10; cpe=cycles per array element

** 160000 elements:
88450   kCycles,        566 cpe for nrQsortA
139969  kCycles,        896 cpe for CombSortA
129500  kCycles,        829 cpe for ArraySort

** 80000 elements:
40859   kCycles,        523 cpe for nrQsortA
64227   kCycles,        822 cpe for CombSortA
59053   kCycles,        756 cpe for ArraySort

** 40000 elements:
19181   kCycles,        491 cpe for nrQsortA
29722   kCycles,        761 cpe for CombSortA
27398   kCycles,        701 cpe for ArraySort

** 20000 elements:
9176    kCycles,        470 cpe for nrQsortA
13680   kCycles,        700 cpe for CombSortA
12904   kCycles,        661 cpe for ArraySort

** 10000 elements:
4324    kCycles,        443 cpe for nrQsortA
6280    kCycles,        643 cpe for CombSortA
5742    kCycles,        588 cpe for ArraySort

** 5000 elements:
2055    kCycles,        421 cpe for nrQsortA
2816    kCycles,        577 cpe for CombSortA
2549    kCycles,        522 cpe for ArraySort

** 2500 elements:
963     kCycles,        395 cpe for nrQsortA
1229    kCycles,        503 cpe for CombSortA
1161    kCycles,        476 cpe for ArraySort

** 1250 elements:
450     kCycles,        369 cpe for nrQsortA
568     kCycles,        466 cpe for CombSortA
508     kCycles,        416 cpe for ArraySort
Title: Re: Sorting a REAL4 as integer
Post by: dedndave on April 06, 2011, 11:43:13 AM
prescott w/htt
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)

** 160000 elements:
86915   kCycles,        556 cpe for nrQsortA
142185  kCycles,        910 cpe for CombSortA
133368  kCycles,        854 cpe for ArraySort

** 80000 elements:
41284   kCycles,        528 cpe for nrQsortA
65963   kCycles,        844 cpe for CombSortA
60480   kCycles,        774 cpe for ArraySort

** 40000 elements:
20086   kCycles,        514 cpe for nrQsortA
30694   kCycles,        786 cpe for CombSortA
27904   kCycles,        714 cpe for ArraySort

** 20000 elements:
9919    kCycles,        508 cpe for nrQsortA
13585   kCycles,        696 cpe for CombSortA
15085   kCycles,        772 cpe for ArraySort

** 10000 elements:
4351    kCycles,        446 cpe for nrQsortA
6662    kCycles,        682 cpe for CombSortA
6034    kCycles,        618 cpe for ArraySort

** 5000 elements:
2244    kCycles,        460 cpe for nrQsortA
2851    kCycles,        584 cpe for CombSortA
2841    kCycles,        582 cpe for ArraySort

** 2500 elements:
965     kCycles,        395 cpe for nrQsortA
1288    kCycles,        528 cpe for CombSortA
1212    kCycles,        497 cpe for ArraySort

** 1250 elements:
464     kCycles,        381 cpe for nrQsortA
614     kCycles,        503 cpe for CombSortA
513     kCycles,        421 cpe for ArraySort