what's an effective way to differentiate an ansi string from a unicode?

ecube · July 21, 2010, 06:59:07 PM

There was a slight bug in CharAbove that I put in on purpose to see if this guy was even testing the functions; clearly he was not. Anyway, here's the fixed version.

Code Select


;Demo driver (GoAsm syntax): run IsUniOrAscii on one narrow and one wide
;test string, then terminate the process.
data section
mybuff db 256 dup ?     ;output buffer; IsUniOrAscii passes 256 as its size to its helpers
unistr  dw "A",0        ;word-sized "A" plus a word-sized terminator (the wide test input)

CODE SECTION
start:
invoke IsUniOrAscii,'A',addr mybuff
;eax= 1
invoke IsUniOrAscii,addr unistr,addr mybuff
;eax= 2
invoke ExitProcess,0    ;ExitProcess takes one argument (uExitCode); GoAsm's invoke does not
                        ;check argument counts, so omitting it passed stack garbage as the exit code

;------------------------------------------------------------------------
;IsUniOrAscii - heuristic guess at whether a1 points to an ANSI string or
;               a wide (16-bit) string.
;
;In:   a1    = pointer to the zero-terminated string to classify
;      obuff = pointer to a caller-supplied output buffer; every helper
;              below is told it is OBUFF_SIZE (256) bytes, so it must be
;              at least that large
;Out:  eax = 1  string treated as ANSI;    obuff holds a copy of a1
;      eax = 2  string treated as Unicode; obuff holds whatever
;               WideCharToMultiByte produced for it
;
;Heuristic, in order:
;  1. Any byte above 127 in the narrow view       -> Unicode (2).
;  2. Narrow length of 2 or more, all bytes <=127 -> ANSI (1).
;  3. Otherwise (narrow length 0 or 1) convert a1 as wide text:
;     a non-empty result with fewer than three '?' characters -> Unicode (2),
;     anything else -> ANSI (1).
;------------------------------------------------------------------------
OBUFF_SIZE EQU 256

IsUniOrAscii FRAME a1,obuff
LOCAL notascii:D
    mov D[notascii],0           ;was left uninitialised on the short-string path below
    ;bounded copy: a1 may be longer than the output buffer
    invoke lstrcpyn,[obuff],[a1],OBUFF_SIZE
    invoke CharAbove,[obuff],OBUFF_SIZE,127
    test eax,eax
    jz >
    mov D[notascii],1           ;at least one byte above 127
    jmp > @unicheck
:
    invoke lstrlen,[obuff]
    cmp eax,1
    jle > @unicheck             ;0 or 1 narrow chars: could be wide text whose 2nd byte is 0
    mov eax,1
    ret
@unicheck:
    invoke UniToAscii,[a1],[obuff],OBUFF_SIZE
    cmp D[notascii],1
    jne >
    mov eax,2
    ret
:
    invoke lstrlen,[obuff]
    test eax,eax
    jz >                        ;conversion produced nothing -> treat as ANSI
    invoke FindChar,[obuff],OBUFF_SIZE,63   ;63 = '?'
    cmp eax,3
    jge >                       ;three or more '?' -> treat the conversion as junk
    mov eax,2
    ret
:
    invoke lstrcpyn,[obuff],[a1],OBUFF_SIZE ;restore the narrow copy for the caller
    mov eax,1
    RET
ENDF

;------------------------------------------------------------------------
;UniToAscii - forward to WideCharToMultiByte using code page CP_ACP.
;
;NOTE: the parameter names are misleading. They are kept because they
;are the block's interface:
;  szAcii         = the wide-character SOURCE string (passed with length -1)
;  szUnicodeBuffx = the DESTINATION byte buffer
;  bufsizex       = size of the destination buffer
;Out: eax = whatever WideCharToMultiByte returns.
;------------------------------------------------------------------------
UniToAscii FRAME szAcii,szUnicodeBuffx,bufsizex
    invoke WideCharToMultiByte,CP_ACP,0,[szAcii],-1,[szUnicodeBuffx],[bufsizex],NULL,NULL
    ret
ENDF

;------------------------------------------------------------------------
;FindChar - count occurrences of a byte value in a buffer.
;
;In:   iBuf  = pointer to the bytes to scan
;      iLen  = maximum number of bytes to examine
;      iChar = byte value to count (only the low byte is compared)
;Out:  eax = number of matching bytes found before the first zero byte
;            or before iLen bytes were examined, whichever comes first
;Clobbers: ecx, edx, flags.  edi is preserved through USES.
;------------------------------------------------------------------------
FindChar FRAME iBuf,iLen,iChar
USES edi                        ;edi is callee-saved under the Win32 stdcall convention
    mov edi,[iBuf]              ;edi = cursor
    mov ecx,[iLen]              ;ecx = bytes still allowed
    xor eax,eax
    xor edx,edx                 ;edx = match count
    test ecx,ecx
    jz > @done                  ;iLen = 0: examine nothing (previously wrapped to 4G)
:
    mov al,B[edi]
    test al,al
    jz > @done                  ;zero terminator ends the scan
    cmp al,[iChar]
    jne > @notchar
    inc edx
@notchar:
    dec ecx                     ;dec sets ZF itself; no extra test needed
    jz > @done
    inc edi
    jmp <

@done:
    mov eax,edx
    RET
ENDF


;------------------------------------------------------------------------
;CharAbove - report whether a buffer contains a byte greater than iChar.
;
;In:   iBuf  = pointer to the bytes to scan
;      iLen  = maximum number of bytes to examine
;      iChar = threshold (only the low byte is compared, unsigned)
;Out:  eax = 1 if a byte above iChar is found before the first zero byte
;            and within iLen bytes, otherwise eax = 0
;Clobbers: ecx, edx, flags.  Only caller-saved registers are used, so the
;          callee-saved edi is no longer destroyed.
;------------------------------------------------------------------------
CharAbove FRAME iBuf,iLen,iChar
    mov edx,[iBuf]              ;edx = cursor
    mov ecx,[iLen]              ;ecx = bytes still allowed
    test ecx,ecx
    jz > @done2                 ;iLen = 0: examine nothing (previously wrapped to 4G)
:
    mov al,B[edx]
    test al,al
    jz > @done2                 ;zero terminator ends the scan
    cmp al,[iChar]
    ja > @above2                ;unsigned compare: 80h..FFh count as "above 127"
    inc edx
    dec ecx
    jnz <

@done2:
    xor eax,eax
    RET

@above2:
    mov eax,1
    RET
ENDF

And here's a complete MASM version. In these tests it doesn't convert the strings correctly, I guess because they aren't UTF-16, but it does detect them correctly, which is my main goal. You can play with the WideCharToMultiByte arguments below to do different conversions anyway.

Code Select


include \masm32\include\masm32rt.inc

; Prototypes for the procedures defined later in this file.
FindChar     proto :DWORD,:DWORD,:BYTE
CharAbove    proto :DWORD,:DWORD,:BYTE
IsUniOrAscii proto :DWORD,:DWORD
UniToAscii   proto :DWORD,:DWORD,:DWORD

; ReportString - classify the string at strLabel with IsUniOrAscii and show
; the contents of mybuff in a message box whose caption is "ascii" when the
; result is 1 and "unicode" otherwise.
ReportString macro strLabel
    invoke IsUniOrAscii,addr strLabel,addr mybuff
    .if eax==1
        invoke MessageBox,0,addr mybuff,addr ascii,MB_ICONINFORMATION
    .else
        invoke MessageBox,0,addr mybuff,addr uni,MB_ICONINFORMATION
    .endif
endm

.data
; Message-box captions; "ascii" also doubles as the third test input.
ascii db "ascii",0
uni db "unicode",0

; Test inputs containing bytes above 127, each ending in two zero bytes.
thestring db 0C4h, 0B0h, 0A7h, 0E2h, 0C3h, 96h, 94h, 9Ch, 0C3h, 9Ch, 00h, 00h
thestring2 db 0C5h, 9Eh, 0C4h, 9Eh, 00h, 00h

.data?
mybuff db 256 dup (?)           ; output buffer handed to IsUniOrAscii

.code
start:
    ReportString thestring
    ReportString thestring2
    ReportString ascii

    invoke ExitProcess,0

;------------------------------------------------------------------------
; FindChar - count occurrences of a byte value in a buffer.
;
; In:   iBuf  = pointer to the bytes to scan
;       iLen  = maximum number of bytes to examine
;       iChar = byte value to count
; Out:  eax = number of matching bytes found before the first zero byte
;             or before iLen bytes were examined, whichever comes first
; Clobbers: ecx, edx, flags.  edi is preserved through USES.
;------------------------------------------------------------------------
FindChar proc uses edi iBuf:DWORD,iLen:DWORD,iChar:BYTE
    mov     edi,iBuf                ; edi = cursor (callee-saved, hence USES)
    mov     ecx,iLen                ; ecx = bytes still allowed
    xor     eax,eax
    xor     edx,edx                 ; edx = match count
    test    ecx,ecx
    jz      @done                   ; iLen = 0: examine nothing (previously wrapped to 4G)
@@:
    mov     al,byte ptr [edi]
    test    al,al
    jz      @done                   ; zero terminator ends the scan
    cmp     al,iChar
    jne     @notchar
    inc     edx
@notchar:
    dec     ecx                     ; dec sets ZF itself; no extra test needed
    jz      @done
    inc     edi
    jmp     @B

@done:
    mov     eax,edx
    ret
FindChar endp

;------------------------------------------------------------------------
; CharAbove - report whether a buffer contains a byte greater than iChar.
;
; In:   iBuf  = pointer to the bytes to scan
;       iLen  = maximum number of bytes to examine
;       iChar = threshold byte (compared unsigned)
; Out:  eax = 1 if a byte above iChar is found before the first zero byte
;             and within iLen bytes, otherwise eax = 0
; Clobbers: ecx, edx, flags.  Only caller-saved registers are used, so the
;           callee-saved edi is no longer destroyed.
;------------------------------------------------------------------------
CharAbove proc iBuf:DWORD,iLen:DWORD,iChar:BYTE
    mov     edx,iBuf                ; edx = cursor
    mov     ecx,iLen                ; ecx = bytes still allowed
    test    ecx,ecx
    jz      @done2                  ; iLen = 0: examine nothing (previously wrapped to 4G)
@@:
    mov     al,byte ptr [edx]
    test    al,al
    jz      @done2                  ; zero terminator ends the scan
    cmp     al,iChar
    ja      @above2                 ; unsigned compare: 80h..FFh count as "above 127"
    inc     edx
    dec     ecx
    jnz     @B

@done2:
    xor     eax,eax
    ret

@above2:
    mov     eax,1
    ret
CharAbove endp

;------------------------------------------------------------------------
; IsUniOrAscii - heuristic guess at whether a1 points to an ANSI string
;                or a wide (16-bit) string.
;
; In:   a1    = pointer to the zero-terminated string to classify
;       obuff = pointer to a caller-supplied output buffer; every helper
;               below is told it is OBUFF_SIZE (256) bytes, so it must be
;               at least that large
; Out:  eax = 1  string treated as ANSI;    obuff holds a copy of a1
;       eax = 2  string treated as Unicode; obuff holds whatever
;                WideCharToMultiByte produced for it
;
; Heuristic, in order:
;   1. Any byte above 127 in the narrow view       -> Unicode (2).
;   2. Narrow length of 2 or more, all bytes <=127 -> ANSI (1).
;   3. Otherwise (narrow length 0 or 1) convert a1 as wide text:
;      a non-empty result with fewer than three '?' characters -> Unicode (2),
;      anything else -> ANSI (1).
;------------------------------------------------------------------------
OBUFF_SIZE equ 256

IsUniOrAscii proc a1,obuff
    LOCAL notascii:dword

    mov     notascii,0                      ; was left uninitialised on the short-string path below
    ; bounded copy: a1 may be longer than the output buffer
    invoke  lstrcpyn,obuff,a1,OBUFF_SIZE
    invoke  CharAbove,obuff,OBUFF_SIZE,127
    test    eax,eax
    jz      @F
    mov     notascii,1                      ; at least one byte above 127
    jmp     @unicheck
@@:
    invoke  lstrlen,obuff
    cmp     eax,1
    jle     @unicheck                       ; 0 or 1 narrow chars: could be wide text whose 2nd byte is 0
    mov     eax,1
    ret
@unicheck:
    invoke  UniToAscii,a1,obuff,OBUFF_SIZE
    cmp     notascii,1
    jne     @F
    mov     eax,2
    ret
@@:
    invoke  lstrlen,obuff
    test    eax,eax
    jz      @F                              ; conversion produced nothing -> treat as ANSI
    invoke  FindChar,obuff,OBUFF_SIZE,63    ; 63 = '?'
    cmp     eax,3
    jge     @F                              ; three or more '?' -> treat the conversion as junk
    mov     eax,2
    ret
@@:
    invoke  lstrcpyn,obuff,a1,OBUFF_SIZE    ; restore the narrow copy for the caller
    mov     eax,1
    ret
IsUniOrAscii endp

;------------------------------------------------------------------------
; UniToAscii - forward to WideCharToMultiByte using code page CP_ACP.
;
; NOTE: the parameter names are misleading. They are kept because they
; are the block's interface:
;   szAcii         = the wide-character SOURCE string (passed with length -1)
;   szUnicodeBuffx = the DESTINATION byte buffer
;   bufsizex       = size of the destination buffer
; Out: eax = whatever WideCharToMultiByte returns.
;------------------------------------------------------------------------
UniToAscii proc szAcii,szUnicodeBuffx,bufsizex
    invoke  WideCharToMultiByte, CP_ACP, 0, \
            szAcii, -1, \
            szUnicodeBuffx, bufsizex, \
            NULL, NULL
    ret
UniToAscii endp
End start

ecube · July 21, 2010, 07:28:45 PM

Quote from: Geryon on July 21, 2010, 06:47:54 PM
Quote from: E^cube on July 21, 2010, 06:11:23 PM
whatever guy, you come on here posting in all caps, for no reason, insulting my code and making invalid claims, none of which you've backed up, and now you're name calling and insulting the forum.
I strongly recommend reading the messages.
Quote from: E^cube on July 21, 2010, 06:11:23 PM
By the way, you might have registered here a few years ago, but you've posted a total of what, 5 or 6 threads/comments in all these years? How about you post some useful code instead of Google links?
I was registered here when the win32asm-board was still alive. It's around 5-10 years ago.
Everybody who is old enough remembers me. I don't have to prove myself to you.

On the other hand, there is no logical connection between when I registered, or how long I have been using asm, and the validity of my claims. But it's obvious that trying to help you is futile.
If you say 2 + 2 = 5, I completely agree, no matter what.

here are the facts
1)IsTextUnicode has failed to identify unicode strings in over 6 easy examples
2)my code so far has identified them ALL correctly
3) you have a total of < 15 posts between the time you registered here and now (2004 is the earliest I see your nick)
4) I (not surprisingly) can't even find you on the other forum...
5) you've failed to provide any code whatsoever, or any proof of your ridiculous claims

anyway i'm done wasting my time on you, and IMO just because you registered a nick here, that doesn't make you part of the community. Only when you contribute, will that change.

BogdanOntanu · July 21, 2010, 08:36:07 PM

Fair warning to both of you: the attitude on Campus is supposed to be friendly... behave and stop calling each other names.
Start using logical arguments.

ecube · July 21, 2010, 08:46:08 PM

I don't need your warning BogdanOntanu...what you need to do is read his initial post and his use of caps and combative language.

BogdanOntanu · July 21, 2010, 09:34:06 PM

Quote from: E^cube on July 21, 2010, 08:46:08 PM
I don't need your warning BogdanOntanu...what you need to do is read his initial post and his use of caps and combative language.

BOTH of you have some valid points and some mistakes in your claims and statements from a logical point of view. You could learn by understanding the other's "point of view" on this issue.

More exactly, Unicode can be encoded in so many ways (as was pointed out to you here) that you can NOT reliably deduce whether some data stream is Unicode or not without other external information or hints.

However, you could write some empirical functions based on a partial understanding of Unicode that would apparently work for some common cases. This is hardly correct or exact given the complexity of Unicode and its various encodings, but it might work in acceptable ways "for you".

As a last hint... may I point out that some byte combinations are invalid in UTF-8 encoding (the same goes for UTF-16, etc.), and because of this a string can NOT be considered to be "Unicode" simply because it has some byte values above 127 in it. Empirically this might be a hint, and yes, it might work in many examples, but it is incorrect by logic and by the standards.

Also, some extended ASCII sets and code pages use the upper 128-255 byte values to encode specific Eastern European special characters in text modes (Romanian, for example... but also Hungarian, Slovakian, etc.). Because of this, finding a character above 127 inside a string is not a correct or certain way to detect a Unicode string, when in fact it could be an extended ASCII string with special characters or ASCII art included.

The best way to decide is to study the Unicode standards and "code points", and specific code point encodings like UTF-8 and UTF-16, as was hinted to you in this thread. Another way is to check for a Unicode BOM (when present).

Yes, your interlocutor was slightly offensive at the start, but he also provided correct hints on such Unicode matters. AND a single word in CAPS does not justify your later reactions, and to be honest your reactions do not justify his later reactions...

Even if he is correct in his logical statements... still, this is the Campus, and beginners posting here are expected to have difficulty understanding or to hold incorrect personal points of view.

Also asking for code or rejecting truth based on posts count is not exactly nice either.

You could have maintained your correct attitude and left it to the moderators... and at the same time learned from his hints without having to agree with his attitude.

Then I would have warned only him to "behave" and respect the Campus "standard behaviour"...

Unfortunately you have chosen to escalate the conflict... and I was forced by the rules to warn you both... and even more unfortunately you have also chosen to argue against a moderator acting as a moderator... and this is not acceptable.

Under these circumstances I "have to" close your thread. Sorry...

News:

what's an effective way to differentiate an ansi string from a unicode?

ecube

ecube

BogdanOntanu

ecube

BogdanOntanu