News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

masm.lib functions dw2ah and a2dw

Started by Larry Hammick, November 30, 2007, 11:28:34 AM

Previous topic - Next topic

Larry Hammick

These look to me like improvements:
dw2ah proc public uses ebx  dwValue:DWORD, lpBuffer:DWORD

    mov ebx,lpBuffer
    mov ecx,8      ;cheaper but slower is push 8 \ pop ecx.
    mov word ptr[ebx+ecx],"H"  ;puts "H" in the low byte and zero in the high.
    mov edx,dwValue    ;get the value from the stack onto the CPU just once.
@@: mov al,dl
    ror edx,4
    and al,0Fh
    cmp al,0Ah  ;A trick using binary coded decimal in AL.
    sbb al,69h
    das      ;das and other b.c.d. opcodes use the auxiliary carry flag AF.
    dec ecx
    mov [ebx+ecx], al
    jnz short @B
    ret

dw2ah endp

a2dw proc uses ebx edi ecx edx String:DWORD
      ;----------------------------------------
      ; Convert decimal string into dword value
      ; return value in eax
      ;----------------------------------------
      ; Does not detect overflow of 32 bits, nor any invalid digit
      ;----------------------------------------
      mov ebx, 10
      xor ecx, ecx    ;ecx will accumulate the number
      mov edi, String
@@:   mov al,[edi]
      and eax,0FFh
      jz short @F
      sub al,"0"
      inc edi
      xchg eax, ecx
      mul ebx
      add ecx, eax
      jmp short @B
@@:   xchg eax, ecx
      ret

a2dw endp

Here's a variation of a2dw for reading decimal quadwords:
a2qw:   ;read asciiz string to quadword edx:eax
    push ebp
    mov ebp,esp
    push esi
    mov esi,[ebp+8]
    push ebx
    push edi
    xor ebx,ebx   ;ebx:edi will accumulate the number
    xor edi,edi
    mov ecx,10
@@: xor eax,eax
    cdq
    lodsb
    or al,al
    jz short @F
    sub al,"0"
    xchg eax,ebx
    mul ecx
    ;jc Overflowed
    xchg eax,ebx
    xchg eax,edi
    mul ecx
    add edi,eax
    adc ebx,edx
    ;jc Overflowed
    jmp short @B
@@: mov edx,ebx
    xchg eax,edi
    pop edi
    pop ebx
    pop esi
    pop ebp
    ret 4

Larry Hammick

ltoa can also be slightly improved:

ltoa proc lValue:DWORD, lpBuffer:DWORD

comment * -------------------------------------------------------
        convert signed 32 bit integer "lValue" to zero terminated
        string and store string at address in "lpBuffer"
        ------------------------------------------------------- *

    push lValue
    call @F  ;a trick for pushing the address of a constant string or other read-only data
    db "%ld",0
  @@:
    push lpBuffer
    call wsprintf
    cmp eax, 3
    jge @F
    xor eax, eax    ; zero EAX on fail
  @@:               ; else EAX contains count of bytes written
    ret

ltoa endp

Vortex

Hi Larry,

You need to balance the stack after calling wsprintf :

.
.
push lpBuffer
call wsprintf
add esp,3*4  ;  three parameters are passed to wsprintf
.
.


wsprintf is a C function.

Larry Hammick

Quote from: Vortex on December 01, 2007, 10:06:53 AM
Hi Larry,

You need to balance the stack after calling wsprintf :

.
.
push lpBuffer
call wsprintf
add esp,3*4  ;  three parameters are passed to wsprintf
.
.


wsprintf is a C function.
True, wsprintf doesn't clean the stack, but the "proc" thingee looks after it, by using "leave". Here's the original disassembled:
0040106E  /. 55             PUSH EBP
0040106F  |. 8BEC           MOV EBP,ESP
00401071  |. EB 04          JMP SHORT c.00401077
00401073  |. 25 6C 64 00    ASCII "%ld",0
00401077  |> FF75 08        PUSH DWORD PTR SS:[EBP+8]
0040107A  |. 68 73104000    PUSH c.00401073
0040107F  |. FF75 0C        PUSH DWORD PTR SS:[EBP+C]
00401082  |. E8 9F010000    CALL <JMP.&user32.wsprintfA>
00401087  |. 83C4 0C        ADD ESP,0C
0040108A  |. 83F8 03        CMP EAX,3
0040108D  |. 7D 02          JGE SHORT c.00401091
0040108F  |. 33C0           XOR EAX,EAX
00401091  |> C9             LEAVE
00401092  \. C2 0800        RETN 8

Vortex

Interesting. Could you provide the full source code? This is my testing :

test.asm :

.386
.model flat, stdcall
option casemap:none

include \masm32\include\user32.inc

.code

ltoa proc lValue:DWORD, lpBuffer:DWORD

comment * -------------------------------------------------------
        convert signed 32 bit integer "lValue" to zero terminated
        string and store string at address in "lpBuffer"
        ------------------------------------------------------- *

    push lValue
    call @F  ;a trick for pushing the address of a constant string or other read-only data
    db "%ld",0
  @@:
    push lpBuffer
    call wsprintf
    cmp eax, 3
    jge @F
    xor eax, eax    ; zero EAX on fail
  @@:               ; else EAX contains count of bytes written
    ret

ltoa endp

end


Disassembling with Agner Fog's tool objconv.exe :

objconv.exe -fasm test.obj disasm.asm


; Disassembly of file: test.obj
; Sun Dec 02 12:34:01 2007

; Mode: 32 bits
; Syntax: MASM/ML
; Instruction set: 80386

.386
option dotname
.model flat

public _ltoa@8

extern _wsprintfA: near


_text   SEGMENT DWORD PUBLIC 'CODE'                     ; section number 1

_ltoa@8 PROC NEAR
        push    ebp                                     ; 0000 _ 55
        mov     ebp, esp                                ; 0001 _ 8B. EC
        push    dword ptr [ebp + 08H]                   ; 0003 _ FF. 75, 08
        call    ?_002                                   ; 0006 _ E8, 00000004
?_001:
; Error: Instruction out of phase with next label
;       and     eax, 0FF00646CH                         ; 000B _ 25, FF00646C
        db 25H, 6CH, 64H, 00H

?_002   LABEL NEAR
        push    dword ptr [ebp + 0CH]                   ; 000F _ FF. 75, 0C
        call    _wsprintfA                              ; 0012 _ E8, 00000000(rel)
        cmp     eax, 3                                  ; 0017 _ 83. F8, 03
        jge     ?_003                                   ; 001A _ 7D, 02
        xor     eax, eax                                ; 001C _ 33. C0
?_003:  leave                                           ; 001E _ C9
        ret     8                                       ; 001F _ C2, 0008
_ltoa@8 ENDP
_text   ENDS

_data   SEGMENT DWORD PUBLIC 'DATA'                     ; section number 2

        db      34 dup (?)                              ; 0000 _
_data   ENDS

END

MichaelW

In my tests on a P3, the “improved� version is ~17 cycles slower than the original. The cycle count for wsprintf is some 33 times larger than the cycle count for the other code, so there is little benefit to be had from optimization.

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
    include \masm32\include\masm32rt.inc
    .686
    include \masm32\macros\timers.asm
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
    .data
      buffer db 16 dup(0)
    .code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

align 4
_ltoa proc lValue:DWORD, lpBuffer:DWORD

comment * -------------------------------------------------------
        convert signed 32 bit integer "lValue" to zero terminated
        string and store string at address in "lpBuffer"
        ------------------------------------------------------- *

    push lValue
    call @F  ;a trick for pushing the address of a constant string or other read-only data
    db "%ld",0
  @@:
    push lpBuffer
    call wsprintf
    ;add esp, 12

    cmp eax, 3
    jge @F
    xor eax, eax    ; zero EAX on fail
  @@:               ; else EAX contains count of bytes written
    ret

_ltoa endp

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
    invoke _ltoa, 12345678, ADDR buffer
    print ADDR buffer,13,10

    invoke Sleep, 3000

    counter_begin 100000,HIGH_PRIORITY_CLASS
      invoke _ltoa, 12345678, ADDR buffer
    counter_end
    print ustr$(eax)," cycles",13,10

    counter_begin 100000,HIGH_PRIORITY_CLASS
      invoke ltoa, 12345678, ADDR buffer
    counter_end
    print ustr$(eax)," cycles",13,10

    counter_begin 100000,HIGH_PRIORITY_CLASS
      invoke _ltoa, 12345678, ADDR buffer
    counter_end
    print ustr$(eax)," cycles",13,10

    inkey "Press any key to exit..."
    exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start

eschew obfuscation

Larry Hammick

Quote from: MichaelW on December 02, 2007, 07:16:55 PM
In my tests on a P3, the “improved� version is ~17 cycles slower than the original. The cycle count for wsprintf is some 33 times larger than the cycle count for the other code, so there is little benefit to be had from optimization.
Surprising. I guess the "call" instruction costs mips, whereas the original uses a jump instead, and then loads a constant address from the code section. And no doubt the wsprintf swamps the rest of the code anyhow.
Here are Olly's disassembly of the new and old ltoa.
00401000 55             PUSH EBP
00401001 8BEC           MOV EBP,ESP
00401003 FF75 08        PUSH DWORD PTR SS:[EBP+8]
00401006 E8 04000000    CALL c.0040100F
0040100B 25 6C 64 00    ASCII "%ld",0
0040100F FF75 0C        PUSH DWORD PTR SS:[EBP+C]
00401012 E8 07020000    CALL <JMP.&user32.wsprintfA>
00401017 83F8 03        CMP EAX,3
0040101A 7D 02          JGE SHORT c.0040101E
0040101C 33C0           XOR EAX,EAX
0040101E C9             LEAVE
0040101F C2 0800        RETN 8
;;;;;;;;
00401022 55             PUSH EBP
00401023 8BEC           MOV EBP,ESP
00401025 EB 04          JMP SHORT c.0040102B
00401027 25 6C 64 00    ASCII "%ld",0
0040102B FF75 08        PUSH DWORD PTR SS:[EBP+8]
0040102E 68 27104000    PUSH c.00401027
00401033 FF75 0C        PUSH DWORD PTR SS:[EBP+C]
00401036 E8 E3010000    CALL <JMP.&user32.wsprintfA>
0040103B 83C4 0C        ADD ESP,0C
     ;add esp,0Ch does not appear in the original asm file, but
     ;got inserted by "invoke", apparently
0040103E 83F8 03        CMP EAX,3
00401041 7D 02          JGE SHORT c.00401045
00401043 33C0           XOR EAX,EAX
00401045 C9             LEAVE
00401046 C2 0800        RETN 8


Vortex, objconv tried to interpret the format string as code rather than embedded data, but the revised version runs okay. (Ollydbg -- a remarkably smart program -- figured out that it was ascii.) The new version is five bytes smaller, if that counts for anything. Three of those bytes could be conserved in the old version by using CALL instead of INVOKE for wsprintf.

Vortex

Yes, it's true that objconv interprets the ASCII byte sequence as code but it's a powerfull tool to disassemble object code. It appears that leave does the stack balancing job. The Visual C++ compiler is also applying this trick to optimize code. ( it does not insert automaticaly the add esp,XX statement if it can find a case to do this optimization. )

Mark Jones

If someone felt like making a small testbed app looping through these routines for a few seconds, I could feed it through CodeAnalyst and report exactly what was slowing it down, from cache misses, to branch mispredictions, to register stalls.

I fed it a simple console prime-number-factorization routine and found some interesting and unusual caveats, like a XOR in my code that was 15x slower than a DIV.
"To deny our impulses... foolish; to revel in them, chaos." MCJ 2003.08

jj2007

Quote from: Larry Hammick on December 02, 2007, 09:03:30 PM

0040102B FF75 08        PUSH DWORD PTR SS:[EBP+8]
0040102E 68 27104000    PUSH c.00401027
00401033 FF75 0C        PUSH DWORD PTR SS:[EBP+C]
00401036 E8 E3010000    CALL <JMP.&user32.wsprintfA>
0040103B 83C4 0C        ADD ESP,0C
     ;add esp,0Ch does not appear in the original asm file, but
     ;got inserted by "invoke", apparently
Balancing with LEAVE is ok if this happens in a subroutine without loops. Imagine what happens if you do the call 10,000 times before the code runs into LEAVE... :red
Stupid question: If I use

invoke GetProcAddress,hWindowsDLL, chr$('AnyAPI_Export')
push 123
push NULL ; some params
call eax
add esp,8 ; needed or not?

Do Windows APIs cleanup the stack? Do MASM macros behave the same?

Larry Hammick

I'm not a great master of Windows API, but as far as I know, all the functions clean the stack except wsprintf, no doubt because that one takes a variable number of parameters.
While we're at it, here's a revision of getcl.asm in masm32.lib. Replace
    xor ecx, ecx            ; zero ecx & use as counter
    mov esi, lpCmdLine
   
    @@:
      lodsb
      cmp al, 0
      je @F
      cmp al, 34            ; [ " ] character
      jne @B
      inc ecx               ; increment counter
      jmp @B
    @@:

    push ecx                ; save count

    shr ecx, 1              ; integer divide ecx by 2
    shl ecx, 1              ; multiply ecx by 2 to get dividend

    pop eax                 ; put count in eax
    cmp eax, ecx            ; check if they are the same
    je @F

with
    xor ecx,ecx      ;cl is the number of " symbols mod 2
    mov esi, lpCmdLine
   
    @@:
      lodsb
      cmp al, 0     ;variant: cmp al,ch
      je @F
      cmp al,'"'
      jne @B
      not cl         ; flip the counter (mod 2)
      jmp @B
    @@:
    jecxz @F     ;compact, but not the fastest possible way

or
    xor ecx,ecx      ;ecx counts the number of " symbols
    mov esi, lpCmdLine
   
    @@:
      lodsb
      cmp al, 0
      je @F
      cmp al,'"'
      jne @B
      inc ecx
      jmp @B
    @@:
    shr ecx,1
    jnc @F