Unwinding Loops with REPEAT and indexed entry into loop

Started by Phil, June 06, 2005, 06:04:56 AM

Previous topic - Next topic

Phil

I have a general question about coding for entry into an unwound loop. My application is unpacking multiple 8-digit packed bcd values in a manner that suppresses all leading zeros storing only a single zero when there are no other significant digits. The following code is an abbreviated version of my original long-hand unwound loop.

;****************************************************************************
; Unpack BCD and build numeric ASCII term for display
;****************************************************************************
BuildString:
        mov     esi,[ebp-4]
        mov     edi,pFirst
        mov     ecx,wksize
        sub     ecx,1
        mov     eax,0F0000000h
skiplz: mov     ebx,[esi+4*ecx]
        test    ebx,eax
        jnz     bld8
        rol     ebx,4
        test    ebx,eax
        jnz     bld7
        rol     ebx,4
        test    ebx,eax
        jnz     bldn ...

        rol     ebx,4
        test    ebx,eax
        jnz     bld1
        sub     ecx,1
        js      bld1
        jmp     skiplz

        align   4

bld8:   rol     ebx,4
        mov     al,0Fh
        and     al,bl
        add     al,'0'
        mov     [edi],al
        add     edi,1
bld7:   rol     ebx,4
        mov     al,0Fh
        and     al,bl
        add     al,'0'
        mov     [edi],al
        add     edi,1
bldn: ...

bld1:   rol     ebx,4
        mov     al,0Fh
        and     al,bl
        add     al,'0'
        mov     [edi],al
        add     edi,1
        sub     ecx,1
        mov     ebx,[esi+4*ecx]
        jns     bld8


I have managed to get the following code working properly with REPEAT but I am asking if there is a cleaner way to do it. I had expected it to be slightly faster than the previous code but it actually runs a little slower. Can anyone suggest a better way to do what I am trying to accomplish?
BuildString:
        mov     esi,[ebp-4]            ; esi -> multiple precision packed bcd source operand
        mov     edi,pFirst               ; edi -> first free byte in ASCII string
        mov     ecx,wksize             ; loop for wksize dwords
        xor     ebx,ebx
skiplz: sub     ecx,1
        js      bld8+7*bldlen            ; display a single 0 when no other significant digits
        mov     ebx,[esi+4*ecx]
        bsr     eax,ebx                    ; locate first significant bit
        jz      skiplz
        push    ecx
        mov     ecx,01Ch                ; move max shift value of 28 and mask to ecx
        and     eax,ecx                   ; eax = 28, 24, 20, 16, ... 4, or 0
        sub     ecx,eax                   ; ecx = 0, 4, 8 , 16, ... 24, or 28
        rol     ebx,cl                       ; shift first significant digit into bits 31-28
        shr     ecx,2                       ; ecx = 0, 1, 2, 3, ... 6, or 7
        mov     eax,bldlen               ; get length of one REPEAT code block
        mul     ecx                         ; get REPEAT entry offset
        pop     ecx
        add     eax,offset bld8         ; add to base for all 8 digits
        jmp     eax                         ; dispatch

        align   4

bld8:   REPEAT 8
        rol     ebx,4
        mov     al,0Fh
        and     al,bl
        add     al,'0'
        mov     [edi],al
        add     edi,1
        ENDM
bldlen equ (($ - bld8)/8)

        sub     ecx,1
        mov     ebx,[esi+4*ecx]
        jns     bld8


hutch--

Phil,

Depending on the hardware you run it on, you have a stall in the shift from EBX to using BL in the loop code. You could try XOR EBX, EBX or SUB EBX, EBX either before the ROL or alternatively before the loop code as this will slow the code down on some hardware.

Something I have found with unrolls is that you can do too many and make it slower, usually the cutoff point is an unroll by 4. Usually for this type of unroll you have 2 code blocks, the first that jumps out and the last that jumps back so if you do an unroll using REPEAT, do a REPEAT 3 and leave the last block unmodified so you have 4 altegether.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

Phil

Thanks Steve! I'll give those ideas a whirl and see what I can do. I'm not sure, but I think that unrolling this loop by 8 will be optimum since there are 8 digits packed in ebx. I'll try smaller unrolls and see if that makes a difference. Meanwhile, I've finally figured out a crude way to build an indexed jump table and I got rid of the MUL ... big win! Now, with the following code I'm writing 150000 terms to nul in 12.61 seconds on the 996 MHz P3. That's the same speed as the long-hand unwind. The earlier code slowed things down to 13.65 seconds. Anyway, I thought I'd post this and ask for comments about more standard ways to do it.
;****************************************************************************
; Unpack BCD and build numeric ASCII term for display
;****************************************************************************
        align   4

BuildString:
        mov     esi,[ebp-4]             
        mov     edi,pFirst
        mov     edx,wksize
        xor     ebx,ebx
skiplz: sub     edx,1
        js      bld8+7*bldlen
        mov     ebx,[esi+4*edx]
        bsr     eax,ebx
        jz      skiplz
        mov     ecx,01Ch
        and     eax,ecx
        sub     ecx,eax
        rol     ebx,cl
        shr     ecx,2
        jmp     [table+4*ecx]

        align   4

table   dd bld8+0*bldlen
        dd bld8+1*bldlen
        dd bld8+2*bldlen
        dd bld8+3*bldlen
        dd bld8+4*bldlen
        dd bld8+5*bldlen
        dd bld8+6*bldlen
        dd bld8+7*bldlen

bld8:   REPEAT 8
        rol     ebx,4
        mov     al,0Fh
        and     al,bl
        add     al,'0'
        mov     [edi],al
        add     edi,1
        ENDM

bldlen equ (($ - bld8)/8)

        sub     edx,1
        mov     ebx,[esi+4*edx]
        jns     bld8


hutch--

Phil,

On a PIII you need to fix the stall as they are sensitive to the problem of using a BYTE size register after a DWORD sized operation.


bld8:   REPEAT 8
        rol     ebx,4        ; DWORD operation on EBX
        mov     al,0Fh
        and     al,bl        ; BYTE operation on BL
        add     al,'0'
        mov     [edi],al
        add     edi,1
        ENDM


Try XOR EBX, EBX before the ROL or even before the REPEAT block to see if this changes the timing.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

Phil

Hutch,
  I can't do an xor ebx, ebx because it contains my 8 bcd digits that I'm unpacking! Also, I need to grab the posts on saving flags and the timing routines that MichaelW and MazeGen were talking about. At first glance I thought I only needed to be concerned about stalls if I had made a deposit into a partial register. I'll check it out again and see if I can figure out what it all means. Obviously, the "and al,bl" needs to wait for the "rol ebx,4" to complete but I have the "mov al,0Fh" between and I thought that would prevent the problem. I'll check it out more thoroughly and get back to you.

Mirno

Alternatively, if your buffer that [edi] is pointing to is 3 bytes longer than needed:

  mov eax, ebx
  and eax, 0Fh
  add eax, '0'
  mov [edi], eax
  add edi, 1


This'll eliminate the partial register stall, but will add 3 trailing zeros to the buffer.

The alternative is to unroll these by hand using different registers, then doing a mass store at the end. This should have allowed the results to have completed and allow the re-order buffer to unify the partial registers.

Mirno

Mirno


        REPEAT  2
        mov     eax, ebx
        and     eax, 0F0Fh
        add     eax, "00"

        mov     ecx, ebx
        shr     ecx, 4
        and     ecx, 0F0Fh
        add     ecx, "00"

        add     edi, 4
        shr     ebx, 16

        mov     [edi - 4], al
        mov     [edi - 3], cl
        mov     [edi - 2], ah
        mov     [edi - 1], al
        ENDM


Mirno

Phil

Thanks Mirno and Hutch. Your suggestions and idea above were a *real* Win! Replacing my code with the following brought me down to 9.6 seconds from 12.6 for 150000 terms written to device nul. It was necessary to extend Mirno's idea to all 8-digits because of the byte ordering. Also, I included a simplified loop at the beginning to store the initial digits if not all 8 are significant. After that the loop at bld8 flys thru the remaining digits 8 at a time. I've reordered the instructions a bit so the next instruction never depends on the previous result so it's a little less clear ... but, I am working to learn how to do things better and faster and it certainly seems that I've come to the right place!
;****************************************************************************
; Unpack BCD and build numeric ASCII term for display
;****************************************************************************
        align   4

BuildString:
        mov     esi,[ebp-4]             ; convert source operand
        mov     edi,pFirst              ; store beginning at first free char
        mov     edx,wksize
skiplz: sub     edx,1                   ; skip leading zeros
        mov     ebx,[esi+4*edx]
        bsr     eax,ebx                 ; find first one bit (nn = 31 to 0)
        jz      skiplz
        mov     ecx,01Ch                ; maximum shift count and mask is 28
        and     eax,ecx
        sub     ecx,eax                 ; shift 28-(nn & 11100) multiple of 4
        jz      bld8
        rol     ebx,cl                  ; rotate significant digits to bit 31
        mov     ecx,eax
        shr     ecx,2
@@:     rol     ebx,4                   ; convert first 1-7 digits
        mov     al,0Fh
        and     al,bl
        add     al,30h
        mov     [edi],al                ; store ASCII digit and bump pointer
        add     edi,1
        sub     ecx,1
        jns     @B
        jmp     bld0                    ; do remaining 8-digits multiples

        align   4

bld8:   mov     eax, ebx                ; quick algorithm sugessted by Mirno
        mov     ecx, ebx
        and     eax, 0F0F0F0Fh
        shr     ecx, 4
        add     eax, 30303030h
        and     ecx, 0F0F0F0Fh
        add     edi, 8
        add     ecx, 30303030h
        mov     [edi - 1], al
        mov     [edi - 2], cl
        mov     [edi - 3], ah
        mov     [edi - 4], ch
        shr     eax, 16
        shr     ecx, 16
        mov     [edi - 5], al
        mov     [edi - 6], cl
        mov     [edi - 7], ah
        mov     [edi - 8], ch
bld0:   sub     edx,1
        mov     ebx,[esi+4*edx]
        jns     bld8



The updated program is attached if anyone is interested.

[attachment deleted by admin]