News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

help with Asc2utf

Started by ecube, May 06, 2007, 02:35:31 AM

Previous topic - Next topic

ecube

Below is Delphi code that converts an ASCII string to UTF-8. I tried porting it to MASM but failed miserably :( I don't know Delphi, but this snippet is pretty easy to understand. The Delphi function ord "Provides the Ordinal value of an integer, character or enum", so if you call it on the letter A, for instance, it returns 65 (its decimal value). Anyway, if anyone can help me fix my code I'd really appreciate it.


Delphi code that converts ASCII to UTF

{ Converts a string of 8-bit characters to a UTF-8 style byte sequence.
  Characters below 128 are copied unchanged; every other character is
  emitted as two characters: 192 + (code div 64) followed by
  128 + (code and 63). }
function asc2utf(s:string):string;
var
  encoded: string;
  idx, code: integer;
begin
  encoded := '';
  idx := 1;
  while idx <= length(s) do
  begin
    code := ord(s[idx]);
    if code >= 128 then
    begin
      { lead character carries the upper bits, follower the low six bits }
      encoded := encoded + chr(192 + (code div 64));
      encoded := encoded + chr(128 + (code and 63));
    end
    else
      encoded := encoded + s[idx];
    idx := idx + 1;
  end;
  asc2utf := encoded;
end;


my attempt at converting it to masm(doesn't work)

.586
.model flat, stdcall
option casemap:none
include \masm32\include\windows.inc
include \masm32\include\kernel32.inc
include \masm32\include\user32.inc
includelib \masm32\lib\kernel32.lib
includelib \masm32\lib\user32.lib

Asc2utf proto :DWORD,:DWORD

     CTEXT MACRO text:VARARG
            local TxtName
              .data
               TxtName BYTE text,0
              .code
            EXITM <ADDR TxtName>
     ENDM

.data?
mybuf db 1024 dup(?)

.code
; Program entry: convert the literal "testing" into mybuf, display the
; result as both the text and the caption of a message box, then exit.
start:
    invoke Asc2utf, CTEXT("testing"), ADDR mybuf
    invoke MessageBox, 0, ADDR mybuf, ADDR mybuf, MB_OK
    invoke ExitProcess, 0

; ---------------------------------------------------------------------------
; Asc2utf - convert a zero-terminated 8-bit string to UTF-8.
;
;   iString : pointer to the zero-terminated source string
;   oString : pointer to the destination buffer; in the worst case it must
;             hold 2 * length(source) + 1 bytes (the caller guarantees this)
;
;   Returns : eax = oString
;   Clobbers: eax, ecx, flags (esi and edi are preserved through USES)
;
; Bytes below 80h are copied unchanged. A byte of the form 1abcdefg is
; written as the two bytes 1100001a 10bcdefg, i.e. 0C0h or (byte shr 6)
; followed by 80h or (byte and 3Fh).
; ---------------------------------------------------------------------------
Asc2utf proc uses esi edi iString:DWORD,oString:DWORD
    mov esi,iString                 ; esi -> next source byte
    mov edi,oString                 ; edi -> next free destination byte

next_char:
    movzx eax,byte ptr [esi]        ; zero-extend so the shifts below are clean
    inc esi
    test al,al
    jz finished                     ; terminator reached
    test al,80h                     ; bit test avoids a signed byte compare
    jnz two_bytes

    mov [edi],al                    ; 7-bit character: copy as-is
    inc edi
    jmp next_char

two_bytes:
    mov ecx,eax
    shr eax,6                       ; upper two bits  -> lead byte payload
    and ecx,3Fh                     ; lower six bits  -> follower payload
    or al,0C0h                      ; 110xxxxx marks a two-byte lead
    or cl,80h                       ; 10xxxxxx marks a continuation byte
    mov [edi],al
    mov [edi+1],cl
    add edi,2
    jmp next_char

finished:
    mov byte ptr [edi],0            ; zero-terminate the output
    mov eax,oString
    ret
Asc2utf endp
end start



converting back(i didn't even attempt this :(  )

{ Reverses asc2utf: converts a two-byte-per-high-character sequence back to
  8-bit characters.
  Characters below 128 are copied unchanged. Any other character is taken as
  a lead byte and combined with the following character:
  ((lead and 3) * 64) + (follower and 63). If the string ends on a lead byte,
  the missing follower contributes 0 instead of being read past the end of
  the string. A resulting #0 character is replaced by #255. }
function utf2asc(s:string):string;
var
  t:string;
  i, follower:integer;
begin
  i:=1;
  t:='';
  while i<=length(s) do begin
    if ord(s[i])<128 then t:=t+s[i]
    else begin
      { only read s[i+1] when it actually exists }
      if i<length(s) then follower:=ord(s[i+1]) and 63
      else follower:=0;
      t:=t+chr(((ord(s[i]) and 3)*64)+follower);
      inc(i);  { skip the follower that was just consumed }
    end;
    { keep the result free of embedded #0 characters }
    if t[length(t)]=#0 then t[length(t)]:=#255;
    inc(i);
  end;
  utf2asc:=t;
end;

hutch--

cube,

I am not familiar with this type of conversion, but I wonder whether you can do it with the normal ANSI to UNICODE APIs?
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

MichaelW

#2
This appears to work in one direction, but I'm out of time to work on it for now.

Edit: Now both directions.


; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
    include \masm32\include\masm32rt.inc

    CP_UTF8 equ 65001   ;; UTF-8 translation

    HexAsciiDump PROTO :DWORD, :DWORD
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
    .data
      ascii       db "abcd",128,129,130,0,8 dup(0)
      utf8        db 100 dup(0)
      wide        dw 100 dup(0)
    .code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
    ; Round trip: ascii -> wide -> utf8 -> wide -> ascii.
    ; After every API call the return value (eax) is printed, followed by a
    ; one-row dump of the buffer that was just written.

    invoke HexAsciiDump,ADDR ascii,1
    print chr$(13,10)

    ; ANSI code page -> UTF-16. The capacity argument is counted in
    ; characters of the destination buffer, hence LENGTHOF wide.
    print "rv ascii to wide: "
    invoke MultiByteToWideChar,CP_ACP,
                               MB_PRECOMPOSED,
                               ADDR ascii,
                               -1,
                               ADDR wide,
                               LENGTHOF wide
    print ustr$(eax),13,10

    invoke HexAsciiDump,ADDR wide,1

    ; UTF-16 -> UTF-8. The capacity argument is counted in bytes of the
    ; destination buffer, hence SIZEOF utf8.
    print "rv wide to utf8: "
    invoke WideCharToMultiByte,CP_UTF8,
                               0,
                               ADDR wide,
                               -1,
                               ADDR utf8,
                               SIZEOF utf8,
                               NULL,
                               NULL
    print ustr$(eax),13,10

    invoke HexAsciiDump,ADDR utf8,1

    print chr$(13,10)

    ; UTF-8 -> UTF-16, again sized by the wide buffer.
    print "rv utf8 to wide: "
    invoke MultiByteToWideChar,CP_UTF8,
                               0,
                               ADDR utf8,
                               -1,
                               ADDR wide,
                               LENGTHOF wide
    print ustr$(eax),13,10

    invoke HexAsciiDump,ADDR wide,1

    ; UTF-16 -> ANSI code page, written back over the original buffer.
    print "rv wide to ascii: "

    invoke WideCharToMultiByte,CP_ACP,
                               0,
                               ADDR wide,
                               -1,
                               ADDR ascii,
                               LENGTHOF ascii,
                               NULL,
                               NULL
    print ustr$(eax),13,10

    invoke HexAsciiDump, ADDR ascii, 1

    print chr$(13,10)

    inkey "Press any key to exit..."
    exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

; ---------------------------------------------------------------------------
; HexAsciiDump - print a hex + character dump of memory, 16 bytes per row.
;
;   address     : address of the first byte to dump
;   nparagraphs : number of 16-byte rows to print; 0 prints nothing
;
; Each row shows the row address (uhex$), eight hex bytes, " - ", eight more
; hex bytes, and then the same 16 bytes as characters, where every byte that
; ascii_table does not map to itself is shown as ".".
; ebx, esi and edi are preserved through the USES clause.
; ---------------------------------------------------------------------------
HexAsciiDump proc uses ebx esi edi address:DWORD, nparagraphs:DWORD

    mov esi, address
    mov ecx, nparagraphs
    test ecx, ecx
    jnz  rowLoop
    ; nothing to print when the row count is zero
    ret

    ; The two lookup tables below sit between the RET above and rowLoop,
    ; so execution never falls into them.
    align 16

  ; hex_table: two hex-digit characters per byte value, indexed by value*2
  hex_table:

    db "000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F"
    db "202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F"
    db "404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F"
    db "606162636465666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F"
    db "808182838485868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9F"
    db "A0A1A2A3A4A5A6A7A8A9AAABACADAEAFB0B1B2B3B4B5B6B7B8B9BABBBCBDBEBF"
    db "C0C1C2C3C4C5C6C7C8C9CACBCCCDCECFD0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF"
    db "E0E1E2E3E4E5E6E7E8E9EAEBECEDEEEFF0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF"

  ; ascii_table: one display character per byte value; values 20h-7Eh map
  ; to themselves, everything else maps to "."
  ascii_table:

    db ".",".",".",".",".",".",".",".",".",".",".",".",".",".",".","."
    db ".",".",".",".",".",".",".",".",".",".",".",".",".",".",".","."
    db " ","!",'"',"#","$","%","&","'","(",")","*","+",",","-",".","/"
    db "0","1","2","3","4","5","6","7","8","9",":",";","<","=",">","?"
    db "@","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O"
    db "P","Q","R","S","T","U","V","W","X","Y","Z","[","\","]","^","_"
    db "`","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o"
    db "p","q","r","s","t","u","v","w","x","y","z","{","|","}","~","."
    db ".",".",".",".",".",".",".",".",".",".",".",".",".",".",".","."
    db ".",".",".",".",".",".",".",".",".",".",".",".",".",".",".","."
    db ".",".",".",".",".",".",".",".",".",".",".",".",".",".",".","."
    db ".",".",".",".",".",".",".",".",".",".",".",".",".",".",".","."
    db ".",".",".",".",".",".",".",".",".",".",".",".",".",".",".","."
    db ".",".",".",".",".",".",".",".",".",".",".",".",".",".",".","."
    db ".",".",".",".",".",".",".",".",".",".",".",".",".",".",".","."
    db ".",".",".",".",".",".",".",".",".",".",".",".",".",".",".","."

  ; One iteration per 16-byte row; ecx holds the number of rows left.
  rowLoop:

    ; ecx is saved around the row body and restored before the DEC below
    ; (presumably because the print/crt__putch calls do not preserve it)
    push ecx
    print uhex$(esi)," "
    lea edi, hex_table
    ; remember the row start so the character column can re-read the bytes
    push esi
    ; first eight bytes: a space, then the two hex digits of each byte
    REPEAT 8
      print " "
      movzx ebx, BYTE PTR [esi]
      movzx eax, BYTE PTR [edi+ebx*2]
      invoke crt__putch, eax
      movzx eax, BYTE PTR [edi+ebx*2+1]
      invoke crt__putch, eax
      inc esi
    ENDM
    print " - "
    ; last eight bytes: the two hex digits of each byte, then a space
    REPEAT 8
      movzx ebx, BYTE PTR [esi]
      movzx eax, BYTE PTR [edi+ebx*2]
      invoke crt__putch, eax
      movzx eax, BYTE PTR [edi+ebx*2+1]
      invoke crt__putch, eax
      print " "
      inc esi
    ENDM
    ; back to the start of the row for the character column
    pop esi

    lea edi, ascii_table
    print "  "
    ; sixteen display characters; esi ends up at the start of the next row
    REPEAT 16
      movzx ebx, BYTE PTR [esi]
      movzx eax, BYTE PTR [edi+ebx]
      invoke crt__putch, eax
      inc esi
    ENDM
    print chr$(13,10)

    pop ecx
    dec ecx
    jnz rowLoop

    ret

HexAsciiDump endp

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start


00403000  61 62 63 64 80 81 82 00 - 00 00 00 00 00 00 00 00   abcd............

rv ascii to wide: 8
00403074  61 00 62 00 63 00 64 00 - AC 20 81 00 1A 20 00 00   a.b.c.d.. ... ..
rv wide to utf8: 13
00403010  61 62 63 64 E2 82 AC C2 - 81 E2 80 9A 00 00 00 00   abcd............

rv utf8 to wide: 8
00403074  61 00 62 00 63 00 64 00 - AC 20 81 00 1A 20 00 00   a.b.c.d.. ... ..
rv wide to ascii: 8
00403000  61 62 63 64 80 81 82 00 - 00 00 00 00 00 00 00 00   abcd............

eschew obfuscation

Tedd

#3
This is a quick conversion of the Delphi one, but fixed for null-terminated strings (Delphi uses Pascal strings, which are length-prefixed and have no terminator), and it doesn't require the messing around converting data types :bg


; ---------------------------------------------------------------------------
; utf8_to_asc - NOTE: despite its name, this routine converts a
; zero-terminated 8-bit string INTO UTF-8 (the name is kept for callers).
;
;   pSrc  : pointer to the zero-terminated input string
;   pDest : pointer to the output buffer (worst case 2 * input length + 1)
;
;   Returns : eax = pDest
;   Clobbers: eax, ecx, flags (esi and edi are preserved through USES)
;
;   0abcdefg -> 0abcdefg
;   1abcdefg -> 1100001a 10bcdefg
; ---------------------------------------------------------------------------
utf8_to_asc proc uses esi edi pSrc:DWORD,pDest:DWORD
    mov esi,pSrc            ;pointer to input string
    mov edi,pDest           ;pointer to output string
  @@:
    movzx eax,BYTE PTR [esi] ;get a character, upper bits of eax cleared
    add esi,1
    test al,al
    jz @out                 ;jump out if at end of input string
    test al,80h             ;'big' means the top bit is set
    jnz @bigchar
    mov [edi],al            ;7-bit character is copied as-is
    add edi,1               ;next position in output string
    jmp @B
  @bigchar:
    mov ecx,eax             ;one copy per output byte
    shr al,6                ;upper two bits
    and cl,3Fh              ;lower six bits
    or al,0C0h              ;mark as leading byte
    or cl,80h               ;mark as follower byte
    mov [edi],al            ;store leader
    mov [edi+1],cl          ;store follower (after it)
    add edi,2               ;two bytes were written
    jmp @B
  @out:
    mov BYTE PTR [edi],0    ;null-terminate the output
    mov eax,pDest           ;return pointer to output
    ret
utf8_to_asc endp


The unicode conversion is a little naive, but it should be fine in most cases (8-bit ascii, latin-1 codepage.)
For full usage I'd go with the conversion with the multibyte functions (as MichaelW's used.)

Basically all it's doing is..
if (top-most bit == 1):
    ascii           utf8
    1abcdefg   ->   1100001a 10bcdefg
else
    0abcdefg   ->   0abcdefg


The other function is actually straightforward (despite the mess it looks in delphi - due to type-coercion) it's just the reverse process.


[Edit: added comments, and have now tested the code - it works!, plus explanation.]
No snowflake in an avalanche feels responsible.

ecube

Very nice job guys!!! Very useful code  :bg The level of your skills is like... prolific  :green