News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Integer square root

Started by Farabi, October 06, 2008, 02:27:42 AM

Previous topic - Next topic

Farabi

I got this from the search by using google, I modify it as a 32-bit MASM version, I think it would be usefull, it was very fast, but if you can I want you to retest it on your computer about it speed, but im sure this function is faster than any FPU square root function.

Quote
;Put this on your WM_CREATE part.

.data
sqrt_tbl dword 0
.code

.ELSEIF uMsg==WM_CREATE

   invoke mAlloc,1024+100
   mov sqrt_tbl,eax
   
   ; table function 256.0 * sqrt( i/256.0 );
   mov ecx,256
   xor edx,edx
   @@:
   push ecx
      push edx
         fild dword ptr[esp]
         push 256
            fidiv dword ptr[esp]
            fsqrt
               push 256
                  fimul dword ptr[esp]
                  fistp dword ptr[esp]
               pop sqrt
         pop edx
      pop edx
      
   push sqrt
   pop ecx
   
   mov [eax+edx],cl
   
   pop ecx
   inc edx
   dec ecx
   jns @b
.endif

UMGetSqrt proc uses esi value:dword
   ; Gem writers: Arne Steinarson
   
      mov eax,value
      mov esi,sqrt_tbl
      
        cmp     eax,10000h
        jb      c_15_0
        cmp     eax,1000000h
        jb      c_23_16

; bit 31..24
        cmp     eax,10000000h
        jb      c_27_24
        cmp     eax,40000000h
        jb      c_29_28
        shr     eax,24
        mov     al, [esi+eax]
        shl     eax,8
        ret
c_29_28:
        shr     eax,22
        mov     al, [esi+eax]
        shl     eax,7
        ret
c_27_24:
        cmp     eax,4000000h
        jb      c_25_24
        shr     eax,20
        mov     al, [esi+eax]
        shl     eax,6
        ret
c_25_24:
        shr     eax,18
        mov     al, [esi+eax]
        shl     eax,5
        ret

; bit 23..16
c_23_16:
        cmp     eax,100000h
        jb      c_19_16
        cmp     eax,400000h
        jb      c_21_20
        shr     eax,16
        mov     al, [esi+eax]
        shl     eax,4
        ret
c_21_20:
        shr     eax,14
        mov     al, [esi+eax]
        shl     eax,3
        ret
c_19_16:
        cmp     eax,40000h
        jb      c_17_16
        shr     eax,12
        mov     al, [esi+eax]
        shl     eax,2
        ret
c_17_16:
        shr     eax,10
        mov     al, [esi+eax]
        shl     eax,1
        ret

c_15_0: cmp     eax,100h
        jb      c_7_0

; bit 15..8
        cmp     eax,1000h
        jb      c_11_8
        cmp     eax,4000h
        jb      c_13_12
        shr     eax,8
        mov     al, [esi+eax]
        ret
c_13_12:
        shr     eax,6
        mov     al, [esi+eax]
        shr     eax,1
        ret
c_11_8: cmp     eax,400h
        jb      c_9_8
        shr     eax,4
        mov     al, [esi+eax]
        shr     eax,2
        ret
c_9_8:  shr     eax,2
        mov     al, [esi+eax]
        shr     eax,3
        ret

;bit 7..0
c_7_0:  cmp     eax,10h
        jb      c_3_0
        cmp     eax,40h
        jb      c_5_4
        mov     al, [esi+eax]
        shr     eax,4
        ret
c_5_4:  shl     eax,2
        mov     al, [esi+eax]
        shr     eax,5
        ret
c_3_0:  cmp     eax,4h
        jb      c_1_0
        shl     eax,4
        mov     al, [esi+eax]
        shr     eax,6
        ret
c_1_0:  shl     eax,6
        mov     al, [esi+eax]
        shr     eax,7
        ret   
   ret
UMGetSqrt endp

If you have any idea to enhance it, let me know.

How to use:

invoke UMGetSqrt,100
Those who had universe knowledges can control the world by a micro processor.
http://www.wix.com/farabio/firstpage

"Etos siperi elegi"

raymond

It may be faster than using the FPU as long as you don't care about accuracy. It may provide the correct result for numbers below approx. 1000 (if you make the proper corrections to your posted algo). For larger numbers, it would provide only an approximate result. For example, the square root of 2086479684 (which is exactly 45678) would be returned as 45568.

As for your algo, you reserve 1024 bytes but fill only 256 of them. I think you must set the initial value of ECX to 1024. Even better, you don't need ECX  as a counter. You already use EDX as a counter: simply exit the loop when EDX exceeds 1024

Also, what is the purpose of dividing a number N by 256 before taking its square root which effectively results in sqrt(N)/16,  and later multiplying that by 256 on the FPU???
Forget the division and simply multiply sqrt(N) by 16!!! :clap:

I realize you simply translated some code but whoever designed the original code had little knowledge of basic maths. :eek

If anyone bothers to verify the speed, don't forget to also check it for accuracy.
When you assume something, you risk being wrong half the time
http://www.ray.masmcode.com

MichaelW

On my P3 UMGetSqrt, as posted, is significantly faster than an FPU version. I think now that the mean absolute error that I calculated should have been the maximum absolute error as a percentage, or perhaps both.

http://www.azillionmonkeys.com/qed/sqroot.html


[attachment deleted by admin]
eschew obfuscation

jj2007

Quote from: MichaelW on October 06, 2008, 06:42:00 AM
On my P3 UMGetSqrt, as posted, is significantly faster than an FPU version.

Celeron M aka Core Duo:

11 cycles, UMGetSqrt
63 cycles, fisqrt

Farabi

Quote from: raymond on October 06, 2008, 04:14:43 AM
It may be faster than using the FPU as long as you don't care about accuracy. It may provide the correct result for numbers below approx. 1000 (if you make the proper corrections to your posted algo). For larger numbers, it would provide only an approximate result. For example, the square root of 2086479684 (which is exactly 45678) would be returned as 45568.

As for your algo, you reserve 1024 bytes but fill only 256 of them. I think you must set the initial value of ECX to 1024. Even better, you don't need ECX  as a counter. You already use EDX as a counter: simply exit the loop when EDX exceeds 1024

Also, what is the purpose of dividing a number N by 256 before taking its square root which effectively results in sqrt(N)/16,  and later multiplying that by 256 on the FPU???
Forget the division and simply multiply sqrt(N) by 16!!! :clap:

I realize you simply translated some code but whoever designed the original code had little knowledge of basic maths. :eek

If anyone bothers to verify the speed, don't forget to also check it for accuracy.

Hi raymond, I didnot notice if it was not really accurate. To be honest, I dont really understand how it works, Im just thinking maybe it was usefull if I just translate it. Thanks for your answer.
Those who had universe knowledges can control the world by a micro processor.
http://www.wix.com/farabio/firstpage

"Etos siperi elegi"

dsouza123

An alternative unsigned 32 bit integer version.

MASM implementation of unsigned 32 bit C code with modifications to the algorithm.

This MASM version can vary the number of iterations
depending on the size of the number being evaluated.

The C code had a fixed number of iterations, this MASM version has 4 variations,
corresponding to 32 bit, 24 bit, 16 bit, 8 bit. 
It is possible to refine it further using increments of 4 bits instead of 8 bits.

It also optimizes the if code, eliminating a branch.


On an Athlon Thunderbird 1173 Mhz

109 cycles, uuddsqrt
27 cycles, fisqrt

Using the testbed by MichaelW
replacing the previous square rout routine with uddsqrt.

A heavily commented, slightly cleaned up version
of the routine in the testbed.


align 4
uddsqrt proc uses esi edi ebx ecx edx number: dword

  mov ebx, number  ; get initial copy of number
  xor edx, edx     ; mov rem, 0
  xor eax, eax     ; mov root, 0

;   finer grained than the original C code which always did 16 iterations
;   xor esi, esi     ; esi = 0
;   mov  ch, 16      ; mov i, 16           16 12  8  4  iterations  half the number of bits
;   mov  cl, 30      ; 2 less than         30 22 14  6  right shifts to get the top 2 bits
;   dec esi          ; esi = -1            32 24 16  8  bits set for ANDing

  .if ebx <= 255              ; fits in  8 bits
     mov  ch,  4   ; iterations for  8 bit
     mov  cl,  6   ; shifts needed
     mov esi, 0ffh
  .elseif ebx <= 65535            ; fits in 16 bits
     mov  ch,  8   ; iterations for 16 bit
     mov  cl, 14   ; shifts needed
     mov esi, 0ffffh
  .elseif ebx <= 65536*256 - 1    ; fits in 24 bits
     mov  ch, 12   ; iterations for 24 bit
     mov  cl, 22   ; shifts needed
     mov esi, 0ffffffh
  .else
     mov  ch, 16   ; iterations for 32 bit
     mov  cl, 30   ; shifts needed
     xor esi, esi
     dec esi
  .endif   

align 16
@@:
  add eax, eax    ; shl root, 1       shl eax, 1
  shl edx, 2      ; shl rem, 2

  mov edi, ebx    ; mov temp, number
  shr edi,  cl    ; shr temp, (one of 30, 22, 14, 6)

  shl ebx, 2      ; shl number, 2
  add edx, edi    ; add rem, temp
  and ebx, esi    ; keep number within bit fit limit

  .if eax < edx      ; .if root < rem
     inc eax         ;    inc root
     sub edx, eax    ;    sub rem, root
     inc eax         ;    inc root
  .endif             ; .endif

  dec  ch         ; dec i
  jnz @B
  shr eax, 1      ; shr root, 1
  ret
uddsqrt endp


The original fixed number of iterations, unsigned 32 bit C code, by Jack Crenshaw

listing 4
unsigned short sqrt(unsigned long a){
  unsigned long rem = 0;
  unsigned long root = 0;
  for(int i=0; i<16; i++){
    root <<= 1;
    rem = ((rem << 2) + (a >> 30));
    a <<= 2;
    root ++;
    if(root <= rem){
      rem -= root;
      root++;
    }
    else
      root--;    there was a bug in the posted code, it was missing the --
  }
  return (unsigned short)(root >> 1);
}

[attachment deleted by admin]

dsouza123

I've added the 4 bit increments, past 8 bits.

The timings vary depending on the size (bits) of the number tested.

So the 109 cycles in the previous post was using 24 bits,
it didn't have the 20 bit at that time, now it is 102 cycles,
for the test number 123456.

Contained in  8 bits   38 cycles
Contained in 32 bits 133 cycles

The test condition 123456 now

102 cycles, uddsqrt
27 cycles, fisqrt

This is for an early model 32 bit AMD Athlon.

I am curious about timings for other CPUs.

[attachment deleted by admin]

jj2007

86 cycles, uddsqrt
63 cycles, fisqrt

Celeron M, Core 2

MichaelW

This is on my P3:

129 cycles, uddsqrt
62 cycles, fisqrt


And these were my previous results:

23 cycles, UMGetSqrt
62 cycles, fisqrt

eschew obfuscation

Mark Jones

Quote from: AMD Athlon X2 x64 4000, WinXP SP3 x32
mean absolute error 0-999: 0.472000
mean absolute error 0-999999: 0.499500
mean absolute error 0-9999999: 0.499754
mean absolute error 0-99999999: 0.499950

95 cycles, uddsqrt
26 cycles, fisqrt

Great work, can it get even faster?
"To deny our impulses... foolish; to revel in them, chaos." MCJ 2003.08

drizz

Quote from: dsouza123 on October 12, 2008, 03:28:49 AM
The original fixed number of iterations, unsigned 32 bit C code, by Jack Crenshaw

sqrt proc a:DWORD
mov edx,a
xor eax,eax;root
xor ecx,ecx;rem
repeat 16
shld ecx,edx,2;; rem = ((rem << 2) + (a >> 30));
shl edx,2;;a <<= 2;
add eax,eax;; root <<= 1;
.if eax<ecx
add eax,1
sub ecx,eax;;rem -= root;
add eax,1;;root++;
.endif
endm
shr eax,1
ret
sqrt endp
The truth cannot be learned ... it can only be recognized.

dsouza123

A couple of observations.

Drizz's implementation unrolled was 92 cycles, as a loop was 146.
Minimizing the size of the unrolled/looped to 15 bytes
using cl for 2 and inc eax instead of add eax,1 didn't improve the timings.

It was supprising that there was a one to one correspondence
C statement to assembly instruction.

The lowest the cycle count was 87 cycles when testing values below 16.

On a rare cccasion the same executable would have a dramatically
larger cycle count.
--------------------------------------------------------
The lowest cycle count I've found using a variation of this algorithm is
25 cycles when testing values that fit in 4 bits. 
27 cycles for fisqrt.

.if ebx < 16
  mov ch,  2
  mov cl,  2
  mov esi, 15
.elseif

It accomplishes it by only having 2 interations instead of 16.

The special cases of 0, 1 can have 11 cycles with the short circuit
.if ebx < 2
  mov eax, ebx
  ret
.endif

NightWare

hmm... maybe some of you remember this hack from video games :
mov eax,Value
; convert to real4
push eax
fild DWORD PTR [esp]
fstp DWORD PTR [esp]
pop eax
; sqrt hack
sub eax,3f800000h
; add eax,1 ; (optionnal, for rounding)
sar eax,1
add eax,3f800000h
; convert to integer
push eax
fld DWORD PTR [esp]
fistp DWORD PTR [esp]
pop eax