Below is Delphi code that converts an ASCII string to UTF-8. I tried porting it to MASM but failed miserably :( I don't know Delphi, but this snippet is pretty easy to understand. The Delphi function ord "provides the ordinal value of an integer, character or enum", so if you called it on the letter A, for instance, it would return 65 (its decimal value). Anyway, if anyone can help me fix my code I'd really appreciate it.
Delphi code that converts ASCII to UTF-8:
{ asc2utf: re-encodes a string of single-byte characters as UTF-8.
  Characters with an ordinal below 128 are copied unchanged; every other
  character is expanded to two characters: a lead of 192 + (code div 64)
  followed by a continuation of 128 + (code and 63). }
function asc2utf(s:string):string;
var
  acc:string;
  idx,code:integer;
begin
  acc:='';
  for idx:=1 to length(s) do begin
    code:=ord(s[idx]);
    if code>=128 then begin
      { two-unit form: top bits go into the lead, low six bits into the trail }
      acc:=acc+chr(192+(code div 64));
      acc:=acc+chr(128+(code and 63));
    end
    else
      acc:=acc+s[idx];
  end;
  asc2utf:=acc;
end;
My attempt at converting it to MASM (doesn't work):
.586
.model flat, stdcall
option casemap:none
include \masm32\include\windows.inc
include \masm32\include\kernel32.inc
include \masm32\include\user32.inc
includelib \masm32\lib\kernel32.lib
includelib \masm32\lib\user32.lib
Asc2utf proto :DWORD,:DWORD
; CTEXT - macro function that places its argument(s), followed by a zero
; byte, in the .data section under a unique label, switches back to .code,
; and expands to "ADDR <label>" so it can be used directly as an invoke
; argument.
CTEXT MACRO text:VARARG
    LOCAL strLabel              ; unique name generated per expansion
    .data
    strLabel db text, 0
    .code
    EXITM <ADDR strLabel>
ENDM
.data?
mybuf BYTE 1024 dup(?)          ; output buffer handed to Asc2utf
.code
; Program entry: convert a literal string into mybuf, show the buffer in a
; message box (as both text and caption), then exit with code 0.
start:
    invoke Asc2utf, CTEXT("testing"), ADDR mybuf
    invoke MessageBox, NULL, ADDR mybuf, ADDR mybuf, MB_OK
    invoke ExitProcess, 0
;-----------------------------------------------------------------------
; Asc2utf - encode a null-terminated string of 8-bit characters as UTF-8.
;
; ABI:   32-bit stdcall (MASM proc/invoke)
; In:    iString = pointer to the null-terminated source string
;        oString = pointer to the destination buffer; worst case every
;                  source byte expands to two, so the caller must supply
;                  at least 2 * length + 1 bytes
; Out:   eax     = oString; the destination is always null-terminated
; Clobb: ecx, flags (esi/edi are preserved via USES)
;
; Mapping (same as the Delphi asc2utf):
;        0abcdefg -> 0abcdefg
;        1abcdefg -> 1100001a 10bcdefg
;-----------------------------------------------------------------------
Asc2utf proc uses esi edi iString:DWORD,oString:DWORD
    mov     esi, iString            ; esi = read cursor
    mov     edi, oString            ; edi = write cursor

nextChar:
    movzx   eax, byte ptr [esi]     ; zero-extend so the byte is treated as unsigned
    inc     esi
    test    al, al
    jz      finished                ; terminator reached
    test    al, 80h                 ; top bit clear -> plain 7-bit character
    jnz     twoBytes
    mov     [edi], al               ; copy unchanged
    inc     edi
    jmp     nextChar

twoBytes:
    mov     ecx, eax                ; keep a copy for the continuation byte
    shr     eax, 6                  ; al = upper two bits (code div 64)
    and     ecx, 3Fh                ; cl = lower six bits (code and 63)
    or      al, 0C0h                ; lead byte         = 192 + (code div 64)
    or      cl, 80h                 ; continuation byte = 128 + (code and 63)
    mov     [edi], al
    mov     [edi+1], cl
    add     edi, 2
    jmp     nextChar

finished:
    mov     byte ptr [edi], 0       ; null-terminate the output
    mov     eax, oString
    ret
Asc2utf endp
end start
Converting back (I didn't even attempt this :( ):
{ utf2asc: reverses asc2utf. A character with an ordinal below 128 is
  taken as-is; otherwise it is treated as the lead of a two-character
  sequence and combined with the following character into one character
  ((lead and 3) * 64 + (trail and 63)). Any resulting #0 is replaced by
  #255 before it is appended to the output. }
function utf2asc(s:string):string;
var
  acc:string;
  idx:integer;
  c:char;
begin
  acc:='';
  idx:=1;
  while idx<=length(s) do begin
    if ord(s[idx])>=128 then begin
      { two-character sequence: consume the lead and the trail together }
      c:=chr(((ord(s[idx]) and 3)*64)+(ord(s[idx+1]) and 63));
      idx:=idx+2;
    end
    else begin
      c:=s[idx];
      idx:=idx+1;
    end;
    if c=#0 then c:=#255;
    acc:=acc+c;
  end;
  utf2asc:=acc;
end;
cube,
I am not familiar with this type of conversion, but I wonder whether you could do it with the normal ANSI-to-Unicode APIs?
This appears to work in one direction, but I'm out of time to work on it for now.
Edit: Now both directions.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
include \masm32\include\masm32rt.inc
CP_UTF8 equ 65001 ;; UTF-8 translation
HexAsciiDump PROTO :DWORD, :DWORD
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
; Test string: four 7-bit characters followed by three bytes with the top
; bit set, a terminator, and padding so that one 16-byte dump row stays
; inside the buffer.
ascii db "abcd",128,129,130,0,8 dup(0)
utf8 db 100 dup(0)
wide dw 100 dup(0)
.code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; Demo entry point: round-trips the test string
;   ANSI -> UTF-16 -> UTF-8 -> UTF-16 -> ANSI
; printing each API return value and a hex dump of the buffer it filled.
; Every call passes the capacity of its own destination buffer
; (LENGTHOF wide / utf8 / ascii) rather than the size of a different
; buffer or a literal.
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
invoke HexAsciiDump,ADDR ascii,1
print chr$(13,10)
print "rv ascii to wide: "
invoke MultiByteToWideChar,CP_ACP,
MB_PRECOMPOSED,
ADDR ascii,
-1,
ADDR wide,
LENGTHOF wide
print ustr$(eax),13,10
invoke HexAsciiDump,ADDR wide,1
print "rv wide to utf8: "
invoke WideCharToMultiByte,CP_UTF8,
0,
ADDR wide,
-1,
ADDR utf8,
LENGTHOF utf8,
NULL,
NULL
print ustr$(eax),13,10
invoke HexAsciiDump,ADDR utf8,1
print chr$(13,10)
print "rv utf8 to wide: "
invoke MultiByteToWideChar,CP_UTF8,
0,
ADDR utf8,
-1,
ADDR wide,
LENGTHOF wide
print ustr$(eax),13,10
invoke HexAsciiDump,ADDR wide,1
print "rv wide to ascii: "
invoke WideCharToMultiByte,CP_ACP,
0,
ADDR wide,
-1,
ADDR ascii,
LENGTHOF ascii,
NULL,
NULL
print ustr$(eax),13,10
invoke HexAsciiDump, ADDR ascii, 1
print chr$(13,10)
inkey "Press any key to exit..."
exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
;-----------------------------------------------------------------------
; HexAsciiDump - print nparagraphs rows of 16 bytes starting at address.
; Each row shows the row address in hex, the 16 bytes as two groups of
; eight hex pairs separated by " - ", and then the same 16 bytes as
; characters, with anything outside 20h..7Eh shown as ".".
; Returns immediately when nparagraphs is zero.
;
; Register roles: esi = read cursor, edi = base of the active lookup
; table, ebx = current byte value. All three are preserved via USES.
;-----------------------------------------------------------------------
HexAsciiDump proc uses ebx esi edi address:DWORD, nparagraphs:DWORD
    mov     esi, address
    cmp     nparagraphs, 0
    jne     dumpRow
    ret

    ; Lookup tables live in the code stream, after the early return and
    ; before the row loop, so execution never falls into them.
    align 16
hexPairs:                               ; two hex digit characters per byte value
    db "000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F"
    db "202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F"
    db "404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F"
    db "606162636465666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F"
    db "808182838485868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9F"
    db "A0A1A2A3A4A5A6A7A8A9AAABACADAEAFB0B1B2B3B4B5B6B7B8B9BABBBCBDBEBF"
    db "C0C1C2C3C4C5C6C7C8C9CACBCCCDCECFD0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF"
    db "E0E1E2E3E4E5E6E7E8E9EAEBECEDEEEFF0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF"
printable:                              ; one display character per byte value
    db 32 dup(".")                      ; 00h..1Fh
    db " !", '"', "#$%&'()*+,-./"       ; 20h..2Fh
    db "0123456789:;<=>?"               ; 30h..3Fh
    db "@ABCDEFGHIJKLMNO"               ; 40h..4Fh
    db "PQRSTUVWXYZ[\]^_"               ; 50h..5Fh
    db "`abcdefghijklmno"               ; 60h..6Fh
    db "pqrstuvwxyz{|}~."               ; 70h..7Fh
    db 128 dup(".")                     ; 80h..FFh

dumpRow:
    print uhex$(esi)," "
    mov     edi, OFFSET hexPairs
    push    esi                         ; remember row start for the character column

    ; first eight bytes: separator, then the hex pair
    REPEAT 8
    print " "
    movzx   ebx, BYTE PTR [esi]
    movzx   eax, BYTE PTR [edi+ebx*2]
    invoke  crt__putch, eax
    movzx   eax, BYTE PTR [edi+ebx*2+1]
    invoke  crt__putch, eax
    inc     esi
    ENDM

    print " - "

    ; last eight bytes: hex pair, then the separator
    REPEAT 8
    movzx   ebx, BYTE PTR [esi]
    movzx   eax, BYTE PTR [edi+ebx*2]
    invoke  crt__putch, eax
    movzx   eax, BYTE PTR [edi+ebx*2+1]
    invoke  crt__putch, eax
    print " "
    inc     esi
    ENDM

    pop     esi                         ; back to the row start
    mov     edi, OFFSET printable
    print " "

    ; character column; leaves esi at the start of the next row
    REPEAT 16
    movzx   ebx, BYTE PTR [esi]
    movzx   eax, BYTE PTR [edi+ebx]
    invoke  crt__putch, eax
    inc     esi
    ENDM

    print chr$(13,10)
    dec     nparagraphs                 ; the stack copy of the argument is the row counter
    jnz     dumpRow
    ret
HexAsciiDump endp
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
00403000 61 62 63 64 80 81 82 00 - 00 00 00 00 00 00 00 00 abcd............
rv ascii to wide: 8
00403074 61 00 62 00 63 00 64 00 - AC 20 81 00 1A 20 00 00 a.b.c.d.. ... ..
rv wide to utf8: 13
00403010 61 62 63 64 E2 82 AC C2 - 81 E2 80 9A 00 00 00 00 abcd............
rv utf8 to wide: 8
00403074 61 00 62 00 63 00 64 00 - AC 20 81 00 1A 20 00 00 a.b.c.d.. ... ..
rv wide to ascii: 8
00403000 61 62 63 64 80 81 82 00 - 00 00 00 00 00 00 00 00 abcd............
This is a quick conversion of the Delphi one, but fixed for null-terminated strings (Delphi uses Pascal strings, which are length-prefixed and have no terminator), and it doesn't require messing around converting data types :bg
utf8_to_asc proto pSrc:DWORD,pDest:DWORD

;-----------------------------------------------------------------------
; utf8_to_asc - NOTE: despite its name, this routine converts 8-bit text
; TO UTF-8 (the same direction as the Delphi asc2utf).
;
; ABI:   32-bit stdcall (MASM proc/invoke)
; In:    pSrc  = pointer to the null-terminated 8-bit input string
;        pDest = pointer to the output buffer; a byte with the top bit
;                set becomes two bytes, so allow 2 * length + 1 bytes
; Out:   eax   = pDest; output is null-terminated
; Clobb: ecx, flags (esi/edi are saved and restored)
;
;        0abcdefg -> 0abcdefg
;        1abcdefg -> 1100001a 10bcdefg
;-----------------------------------------------------------------------
utf8_to_asc proc pSrc:DWORD,pDest:DWORD
    push    esi
    push    edi
    mov     esi, pSrc               ; pointer to input string
    mov     edi, pDest              ; pointer to output string
    xor     eax, eax                ; upper 24 bits stay zero; only al is loaded below
@@:
    mov     al, [esi]               ; get a character
    add     esi, 1
    test    al, al
    jz      @out                    ; end of input string
    test    al, 80h                 ; top bit set means the char needs two bytes
    jnz     @bigchar
    mov     [edi], al               ; 7-bit character: copy as-is
    add     edi, 1
    jmp     @B
@bigchar:
    mov     ecx, eax                ; work on a copy so both bytes can be built at once
    shr     al, 6                   ; upper two bits
    and     cl, 3Fh                 ; lower six bits
    or      al, 0C0h                ; mark as lead byte
    or      cl, 80h                 ; mark as continuation byte
    mov     [edi], al
    mov     [edi+1], cl
    add     edi, 2
    jmp     @B
@out:
    mov     BYTE PTR [edi], 0       ; null-terminate the output
    mov     eax, pDest              ; return pointer to output
    pop     edi
    pop     esi
    ret
utf8_to_asc endp
The unicode conversion is a little naive, but it should be fine in most cases (8-bit ascii, latin-1 codepage.)
For full usage I'd go with the conversion with the multibyte functions (as MichaelW's used.)
Basically all it's doing is..
if (top-most bit == 1):
ascii utf8
1abcdefg -> 1100001a 10bcdefg
else
0abcdefg -> 0abcdefg
The other function is actually straightforward (despite the mess it looks in delphi - due to type-coercion) it's just the reverse process.
[Edit: added comments, and have now tested the code - it works!, plus explanation.]
Very nice job guys!!! very useful code :bg the level of your guys skills is like...prolific :green