These look to me like improvements:
dw2ah proc public uses ebx dwValue:DWORD, lpBuffer:DWORD
  ;------------------------------------------------------------
  ; Write dwValue at lpBuffer as eight uppercase hex digits,
  ; followed by the letter "H" and a zero byte (10 bytes in all).
  ; Uses al, ecx, edx and the flags as scratch; ebx is saved and
  ; restored through the "uses" clause.
  ;------------------------------------------------------------
  mov edx, dwValue              ; edx = value, rotated one nibble per pass
  mov ebx, lpBuffer             ; ebx = base of the output buffer
  mov ecx, 8                    ; ecx = digits left to store; doubles as store index
  mov word ptr [ebx+ecx], "H"   ; "H" at offset 8, zero byte at offset 9
next_nibble:
  mov al, dl
  and al, 0Fh                   ; al = current low nibble (0..15)
  ror edx, 4                    ; bring the next nibble down into dl
  cmp al, 0Ah                   ; CF = 1 for 0..9, CF = 0 for 10..15
  sbb al, 69h                   ; with the das below: 0..9 -> "0".."9", 10..15 -> "A".."F"
  das                           ; decimal adjust; consumes CF and AF left by the sbb
  dec ecx                       ; sets ZF for the jnz; the mov in between leaves flags alone
  mov [ebx+ecx], al             ; digits are stored from offset 7 down to offset 0
  jnz short next_nibble
  ret
dw2ah endp
a2dw proc uses ebx edi ecx edx String:DWORD
   ;----------------------------------------
   ; Convert decimal string into dword value
   ; In:  String = address of a zero-terminated string of digits
   ; Out: eax = value (0 for an empty string)
   ; ebx, edi, ecx and edx are saved and restored by "uses"
   ;----------------------------------------
   ; Does not detect overflow of 32 bits, nor any invalid digit
   ;----------------------------------------
   mov ebx, 10
   xor ecx, ecx  ;ecx will accumulate the number
   mov edi, String
@@: mov al,[edi]
   and eax,0FFh  ;zero-extend the character; ZF set on the terminator
   jz short @F
   sub al,"0"    ;eax = digit value
   inc edi
   xchg eax, ecx ;eax = running total, ecx = new digit
   mul ebx       ;edx:eax = total * 10 (the high half in edx is discarded)
   add ecx, eax  ;ecx = total * 10 + digit
   jmp short @B
@@: xchg eax, ecx ;return the total in eax
   ret
a2dw endp
Here's a variation of a2dw for reading decimal quadwords:
a2qw:  ;read asciiz string to quadword edx:eax
  ;------------------------------------------------------------
  ; In:  one dword argument on the stack = address of a
  ;      zero-terminated string of decimal digits
  ; Out: edx:eax = value (edx = high dword, eax = low dword)
  ; The argument is removed on return (ret 4).
  ; ebp, esi, ebx and edi are saved and restored; ecx is not.
  ; Overflow past 64 bits and invalid digits are not detected
  ; (the overflow checks are left commented out).
  ; lodsb walks the string forward, so the direction flag is
  ; assumed to be clear on entry.
  ;------------------------------------------------------------
  push ebp
  mov ebp,esp
  push esi
  mov esi,[ebp+8]
  push ebx
  push edi
  xor ebx,ebx  ;ebx:edi will accumulate the number
  xor edi,edi
  mov ecx,10
@@: xor eax,eax  ;clear eax so the digit loaded into al is zero-extended
  lodsb
  or al,al
  jz short @F
  sub al,"0"
  xchg eax,ebx  ;eax = high dword of the total, ebx = digit
  mul ecx       ;eax = high * 10 (edx = overflow, ignored)
  ;jc Overflowed
  xchg eax,ebx  ;ebx = high * 10, eax = digit
  xchg eax,edi  ;eax = low dword of the total, edi = digit
  mul ecx       ;edx:eax = low * 10
  add edi,eax   ;edi = low dword of (low * 10 + digit)
  adc ebx,edx   ;fold the carries into the high dword
  ;jc Overflowed
  jmp short @B
@@: mov edx,ebx
  xchg eax,edi
  pop edi
  pop ebx
  pop esi
  pop ebp
  ret 4
ltoa can also be slightly improved:
ltoa proc lValue:DWORD, lpBuffer:DWORD
comment * -------------------------------------------------------
    convert signed 32 bit integer "lValue" to zero terminated
    string and store string at address in "lpBuffer"

    returns in EAX the number of characters wsprintf stored
    (terminator not counted), or zero if nothing was stored
    ------------------------------------------------------- *
  ; wsprintf takes its arguments C style: pushed last one first and
  ; left on the stack for the caller to remove.
  push lValue
  call @F ;a trick for pushing the address of a constant string or other read-only data
  db "%ld",0
 @@:
  push lpBuffer
  call wsprintf
  add esp, 3*4  ; remove the three arguments here instead of leaving them for the epilogue
  test eax, eax ; a one or two character result such as "5" or "42" is a success too
  jg @F
  xor eax, eax  ; zero EAX on fail
 @@:        ; else EAX contains count of bytes written
  ret
ltoa endp
Hi Larry,
You need to balance the stack after calling wsprintf :
.
.
push lpBuffer
call wsprintf
add esp,3*4 ; three parameters are passed to wsprintf
.
.
wsprintf is a C function.
Quote from: Vortex on December 01, 2007, 10:06:53 AM
Hi Larry,
You need to balance the stack after calling wsprintf :
.
.
push lpBuffer
call wsprintf
add esp,3*4 ; three parameters are passed to wsprintf
.
.
wsprintf is a C function.
True, wsprintf doesn't clean the stack, but the "proc" thingee looks after it, by using "leave". Here's the original disassembled:
0040106E  /. 55             PUSH EBP
0040106F  |. 8BEC           MOV EBP,ESP
00401071  |. EB 04          JMP SHORT c.00401077
00401073  |. 25 6C 64 00    ASCII "%ld",0
00401077  |> FF75 08        PUSH DWORD PTR SS:[EBP+8]
0040107A  |. 68 73104000    PUSH c.00401073
0040107F  |. FF75 0C        PUSH DWORD PTR SS:[EBP+C]
00401082  |. E8 9F010000    CALL <JMP.&user32.wsprintfA>
00401087  |. 83C4 0C        ADD ESP,0C
0040108A  |. 83F8 03        CMP EAX,3
0040108D  |. 7D 02          JGE SHORT c.00401091
0040108F  |. 33C0           XOR EAX,EAX
00401091  |> C9             LEAVE
00401092  \. C2 0800        RETN 8
Interesting. Could you provide the full source code? This is my testing :
test.asm :
.386
.model flat, stdcall
option casemap:none
include \masm32\include\user32.inc
.code
ltoa proc lValue:DWORD, lpBuffer:DWORD
comment * -------------------------------------------------------
convert signed 32 bit integer "lValue" to zero terminated
string and store string at address in "lpBuffer"
------------------------------------------------------- *
; The three wsprintf arguments are pushed by hand, last one first:
; lValue, then the address of the format string, then lpBuffer.
push lValue
call @F ;a trick for pushing the address of a constant string or other read-only data: the CALL pushes the address of the bytes that follow it
db "%ld",0
@@:
push lpBuffer
call wsprintf
; NOTE(review): no "add esp, 12" follows the call, so the three pushed
; dwords are still on the stack here; this relies on the procedure
; epilogue restoring ESP - confirm that every build generates one.
cmp eax, 3
jge @F
; NOTE(review): any result below 3 is turned into 0 here, which presumably
; also covers one and two character outputs - verify against the
; documented return value of wsprintf.
xor eax, eax ; zero EAX on fail
@@: ; else EAX contains count of bytes written
ret
ltoa endp
end
Disassembling with Agner Fog's tool objconv.exe :
objconv.exe -fasm test.obj disasm.asm
; Disassembly of file: test.obj
; Sun Dec 02 12:34:01 2007
; Mode: 32 bits
; Syntax: MASM/ML
; Instruction set: 80386
.386
option dotname
.model flat
public _ltoa@8
extern _wsprintfA: near
_text SEGMENT DWORD PUBLIC 'CODE' ; section number 1
_ltoa@8 PROC NEAR
push ebp ; 0000 _ 55
mov ebp, esp ; 0001 _ 8B. EC
push dword ptr [ebp + 08H] ; 0003 _ FF. 75, 08
call ?_002 ; 0006 _ E8, 00000004
?_001:
; Error: Instruction out of phase with next label
; and eax, 0FF00646CH ; 000B _ 25, FF00646C
db 25H, 6CH, 64H, 00H
?_002 LABEL NEAR
push dword ptr [ebp + 0CH] ; 000F _ FF. 75, 0C
call _wsprintfA ; 0012 _ E8, 00000000(rel)
cmp eax, 3 ; 0017 _ 83. F8, 03
jge ?_003 ; 001A _ 7D, 02
xor eax, eax ; 001C _ 33. C0
?_003: leave ; 001E _ C9
ret 8 ; 001F _ C2, 0008
_ltoa@8 ENDP
_text ENDS
_data SEGMENT DWORD PUBLIC 'DATA' ; section number 2
db 34 dup (?) ; 0000 _
_data ENDS
END
In my tests on a P3, the "improved" version is ~17 cycles slower than the original. The cycle count for wsprintf is some 33 times larger than the cycle count for the other code, so there is little benefit to be had from optimization.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
  include \masm32\include\masm32rt.inc
  .686
  include \masm32\macros\timers.asm
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
  .data
   buffer db 16 dup(0)
  .code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 4
_ltoa proc lValue:DWORD, lpBuffer:DWORD
comment * -------------------------------------------------------
    convert signed 32 bit integer "lValue" to zero terminated
    string and store string at address in "lpBuffer"
    ------------------------------------------------------- *
  ; The three wsprintf arguments are pushed by hand, last one first:
  ; lValue, then the address of the format string, then lpBuffer.
  push lValue
  call @F ;a trick for pushing the address of a constant string or other read-only data: the CALL pushes the address of the bytes that follow it
  db "%ld",0
 @@:
  push lpBuffer
  call wsprintf
  ; The explicit cleanup of the three arguments is left disabled; the
  ; pushed dwords stay on the stack past this point.
  ; NOTE(review): this relies on the procedure epilogue restoring ESP - confirm.
  ;add esp, 12
  cmp eax, 3
  jge @F
  ; NOTE(review): any result below 3 is turned into 0 here, which presumably
  ; also covers one and two character outputs - verify against the
  ; documented return value of wsprintf.
  xor eax, eax  ; zero EAX on fail
 @@:        ; else EAX contains count of bytes written
  ret
_ltoa endp
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
  ; Program entry: one visible conversion, then three timed runs.
  ; Functional check: convert 12345678 into "buffer" and print it.
  invoke _ltoa, 12345678, ADDR buffer
  print ADDR buffer,13,10
  ; NOTE(review): presumably a pause to let the system settle before timing - confirm.
  invoke Sleep, 3000
  ; counter_begin / counter_end are macros from timers.asm.
  ; NOTE(review): presumably they repeat the enclosed code 100000 times at
  ; high priority and leave a cycle count in eax, which is what gets
  ; printed after each run - confirm against the macro file.
  ; Run 1: the local _ltoa defined in this file.
  counter_begin 100000,HIGH_PRIORITY_CLASS
   invoke _ltoa, 12345678, ADDR buffer
  counter_end
  print ustr$(eax)," cycles",13,10
  ; Run 2: "ltoa" without the underscore is not defined in this listing.
  ; NOTE(review): presumably the library routine made available through masm32rt.inc - confirm.
  counter_begin 100000,HIGH_PRIORITY_CLASS
   invoke ltoa, 12345678, ADDR buffer
  counter_end
  print ustr$(eax)," cycles",13,10
  ; Run 3: the local _ltoa again, as a repeat measurement.
  counter_begin 100000,HIGH_PRIORITY_CLASS
   invoke _ltoa, 12345678, ADDR buffer
  counter_end
  print ustr$(eax)," cycles",13,10
  inkey "Press any key to exit..."
  exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
Quote from: MichaelW on December 02, 2007, 07:16:55 PM
In my tests on a P3, the "improved" version is ~17 cycles slower than the original. The cycle count for wsprintf is some 33 times larger than the cycle count for the other code, so there is little benefit to be had from optimization.
Surprising. I guess the "call" instruction costs mips, whereas the original uses a jump instead, and then loads a constant address from the code section. And no doubt the wsprintf swamps the rest of the code anyhow.
Here are Olly's disassembly of the new and old ltoa.
00401000 55             PUSH EBP
00401001 8BEC           MOV EBP,ESP
00401003 FF75 08        PUSH DWORD PTR SS:[EBP+8]
00401006 E8 04000000    CALL c.0040100F
0040100B 25 6C 64 00    ASCII "%ld",0
0040100F FF75 0C        PUSH DWORD PTR SS:[EBP+C]
00401012 E8 07020000    CALL <JMP.&user32.wsprintfA>
00401017 83F8 03        CMP EAX,3
0040101A 7D 02          JGE SHORT c.0040101E
0040101C 33C0           XOR EAX,EAX
0040101E C9             LEAVE
0040101F C2 0800        RETN 8
;;;;;;;;
00401022 55             PUSH EBP
00401023 8BEC           MOV EBP,ESP
00401025 EB 04          JMP SHORT c.0040102B
00401027 25 6C 64 00    ASCII "%ld",0
0040102B FF75 08        PUSH DWORD PTR SS:[EBP+8]
0040102E 68 27104000    PUSH c.00401027
00401033 FF75 0C        PUSH DWORD PTR SS:[EBP+C]
00401036 E8 E3010000    CALL <JMP.&user32.wsprintfA>
0040103B 83C4 0C        ADD ESP,0C
   ;add esp,0Ch does not appear in the original asm file, but
   ;got inserted by "invoke", apparently
0040103E 83F8 03        CMP EAX,3
00401041 7D 02          JGE SHORT c.00401045
00401043 33C0           XOR EAX,EAX
00401045 C9             LEAVE
00401046 C2 0800        RETN 8
Vortex, objconv tried to interpret the format string as code rather than embedded data, but the revised version runs okay. (Ollydbg -- a remarkably smart program -- figured out that it was ascii.) The new version is five bytes smaller, if that counts for anything. Three of those bytes could be conserved in the old version by using CALL instead of INVOKE for wsprintf.
Yes, it's true that objconv interprets the ASCII byte sequence as code, but it's a powerful tool for disassembling object code. It appears that leave does the stack balancing job. The Visual C++ compiler also applies this trick to optimize code (it does not automatically insert the add esp,XX statement if it can find a case for this optimization).
If someone felt like making a small testbed app looping through these routines for a few seconds, I could feed it through CodeAnalyst and report exactly what was slowing it down, from cache misses, to branch mispredictions, to register stalls.
I fed it a simple console prime-number-factorization routine and found some interesting and unusual caveats, like a XOR in my code that was 15x slower than a DIV.
Quote from: Larry Hammick on December 02, 2007, 09:03:30 PM
0040102B FF75 08 PUSH DWORD PTR SS:[EBP+8]
0040102E 68 27104000 PUSH c.00401027
00401033 FF75 0C PUSH DWORD PTR SS:[EBP+C]
00401036 E8 E3010000 CALL <JMP.&user32.wsprintfA>
0040103B 83C4 0C ADD ESP,0C
;add esp,0Ch does not appear in the original asm file, but
;got inserted by "invoke", apparently
Balancing with LEAVE is ok if this happens in a subroutine without loops. Imagine what happens if you do the call 10,000 times before the code runs into LEAVE... :red
Stupid question: If I use
invoke GetProcAddress,hWindowsDLL, chr$('AnyAPI_Export')
push 123
push NULL ; some params
call eax
add esp,8 ; needed or not?
Do Windows APIs cleanup the stack? Do MASM macros behave the same?
I'm not a great master of Windows API, but as far as I know, all the functions clean the stack except wsprintf, no doubt because that one takes a variable number of parameters.
While we're at it, here's a revision of getcl.asm in masm32.lib. Replace
  xor ecx, ecx      ; zero ecx & use as counter
  mov esi, lpCmdLine
 Â
  @@:
   lodsb
   cmp al, 0
   je @F
   cmp al, 34      ; [ " ] character
   jne @B
   inc ecx        ; increment counter
   jmp @B
  @@:
  push ecx        ; save count
  shr ecx, 1       ; integer divide ecx by 2
  shl ecx, 1       ; multiply ecx by 2 to get dividend
  pop eax         ; put count in eax
  cmp eax, ecx      ; check if they are the same
  je @F
with
  xor ecx,ecx   ;cl is the number of " symbols mod 2
  mov esi, lpCmdLine
 Â
  @@:
   lodsb
   cmp al, 0   ;variant: cmp al,ch
   je @F
   cmp al,'"'
   jne @B
   not cl     ; flip the counter (mod 2)
   jmp @B
  @@:
  jecxz @F   ;compact, but not the fastest possible way
or
  xor ecx,ecx   ;ecx counts the number of " symbols
  mov esi, lpCmdLine
 Â
  @@:
   lodsb
   cmp al, 0
   je @F
   cmp al,'"'
   jne @B
   inc ecx
   jmp @B
  @@:
  shr ecx,1
  jnc @F