Here's the version I've got:
; ---------------------------------------------------------------------
; szRtrim - copy a zero-terminated string from src to dst and cut off
;           its trailing blanks.
;
; In:    src = address of the zero-terminated source string
;        dst = address of the destination buffer. The whole source
;              string, terminator included, is copied before the cut is
;              made, so dst has to be able to hold all of it.
; Out:   eax = length of the trimmed string; 0 when src is empty or
;              contains no byte with a value of 33 or above
;        ecx = dst on the copy path (left untouched on the early-out
;              path taken for empty / space-and-tab-only strings)
; Saves: esi, edi.  Uses eax, ecx, edx and the flags.
;
; A byte counts as "blank" in the copy loop when its value is below 33,
; so trailing control characters (CR, LF, ...) are trimmed as well.
; ---------------------------------------------------------------------
align 4
szRtrim proc src:DWORD,dst:DWORD
  push esi
  push edi
  mov esi, src
  mov edi, dst
  sub esi, 1        ; step back one byte: the scan loop increments first
 @@:
  add esi, 1        ; skip leading spaces and tabs, looking for content
  cmp BYTE PTR [esi], 32
  je @B
  cmp BYTE PTR [esi], 9
  je @B
  cmp BYTE PTR [esi], 0
  jne @F            ; something other than space/tab/terminator: go copy
  xor eax, eax      ; return zero on empty string
  mov BYTE PTR [edi], 0  ; set string length to zero
  jmp szLout
 @@:
  mov esi, src      ; restart at the beginning: leading blanks are kept
  xor edx, edx      ; EDX = trimmed length so far. Must start at zero:
                    ; a string holding only bytes below 33 (e.g. CR/LF)
                    ; never reaches "mov edx, ecx" in the loop below.
  xor ecx, ecx      ; ECX as index and location counter
 @@:
  mov al, [esi+ecx] ; copy bytes from src to dst
  mov [edi+ecx], al
  add ecx, 1        ; ECX = number of bytes copied so far
  test al, al
  je @F             ; exit on zero
  cmp al, 33
  jb @B             ; blank or control byte: do not advance the cut point
  mov edx, ecx      ; store count if asc 33 or greater
  jmp @B
 @@:
  mov BYTE PTR [edi+edx], 0  ; terminate right after the last byte >= 33
  mov eax, edx      ; return length of trimmed string
  mov ecx, dst
 szLout:
  pop edi
  pop esi
  ret
szRtrim endp
The instruction "xor ecx, edx" should be omitted.
"mov BYTE PTR [edi], 0" may be replaced by "mov [edi],al", and
"mov BYTE PTR [edi+edx], 0" by "mov [edi+edx],al"
because AL is zero in both cases and the AL variations are shorter.
A different style, equally fast according to some testing here:
; ---------------------------------------------------------------------
; szRtrim {lpsrc, lpdest}
; Copies the zero-terminated string at lpsrc to lpdest and cuts the copy
; right after the last byte that is neither a space nor a tab.
;
; Stack at entry: [esp+4] = lpsrc, [esp+8] = lpdest
;                 (read as [esp+8] / [esp+12] once EBX has been pushed)
; Out:   eax = new string length
;        ecx = lpdest
; Saves: ebx.  Uses eax, ecx, edx and the flags.
; The routine removes both arguments from the stack itself (ret 8).
; The whole source string, terminator included, is written to lpdest
; before the cut is made.
; ---------------------------------------------------------------------
szRtrim: ;{lpsrc,lpdest}
  push ebx
  mov ebx,[esp+12]  ;dest
  mov ecx,[esp+8]  ;src
  mov edx,ebx  ;edx maintains 1+address of the last nonspace in the destination
  sub ecx,ebx  ;ecx = src-dest, so [ecx+ebx] reads the source byte matching [ebx]
;align 8   ;for speed, this can make an appreciable difference
@@: mov al,[ecx+ebx]
  mov [ebx],al
  add ebx,1  ;on the P4 this is massively faster than INC EBX or LEA EBX,[EBX+1]
  test al,al   ;the two-byte INC EBX (db 0FFh,0C3h) is not great either
  jz short @F  ;terminator copied: leave the loop with al = 0
  cmp al," "
  je short @B  ;space: keep the cut point where it is
  cmp al,9     ;tab
  cmovne edx,ebx  ;slower but equivalent is "JE @B / MOV EDX,EBX"
  jmp short @B
@@: mov [edx],al   ;zero
  mov ecx,[esp+12]    ;if desired
  mov eax,edx
  sub eax,ecx   ;return new string length
  pop ebx
  ret 8
Larry,
On late model Intel PIV I would do these mods to try and get the speed up a little.
@@:
movzx eax, BYTE PTR [esi+ecx] ; copy bytes from src to dst
mov [edi+ecx], al
add ecx, 1
test eax, eax
je @F ; exit on zero
cmp al, 33
jb @B
mov edx, ecx ; store count if asc 33 or greater
jmp @B