Here is the Unicode version of hNullString. It is slower than expected, but better than nothing.
Dependency:
MASM32 Lib
Low Level Function
; ucFile - descriptor of one loaded UTF-16 text file.
; Some routines in this file address the fields by raw offset
; ([esi+8], [esi+12], [esi+16], [esi+24]), so the field order and sizes
; must not change.
ucFile STRUCT
    nHeader     DWORD 0     ; +0  not referenced by the routines in this file
    lpData      DWORD 0     ; +4  pointer to the file contents
    lpLineTable DWORD 0     ; +8  array of DWORD pointers, one per line start
    nSize       DWORD 0     ; +12 size of the file contents in bytes
    nLineCount  DWORD 0     ; +16 number of lines, as returned by fucLineCount
    lpReserved  DWORD 0     ; +20 not referenced by the routines in this file
    lpReserved2 DWORD 0     ; +24 array of per-line word tables (fucBuildWordTable)
    lpWordCount DWORD 0     ; +28 not referenced by the routines in this file
ucFile ENDS
; fWordS - three-DWORD word descriptor.
; None of the routines visible in this file reference it.
fWordS STRUCT
    nHeader   DWORD 0       ; +0
    lpAddress DWORD 0       ; +4
    nSize     DWORD 0       ; +8
fWordS ENDS
.code
;-----------------------------------------------------------------------
; fucGetOffsetTable - fetch entry nPos (1-based) from a DWORD table.
; In:    nPos    = 1-based index
;        lpTbl   = address of the DWORD table
;        nMaxPos = exclusive upper bound for nPos
; Out:   eax = lpTbl[nPos-1], or -1 when nPos <= 0 or nPos >= nMaxPos
; Clobb: ecx, flags
;-----------------------------------------------------------------------
fucGetOffsetTable proc uses esi edi nPos:dword,lpTbl:dword,nMaxPos:dword
    mov     ecx,nPos
    cmp     ecx,nMaxPos
    jae     out_of_range            ; unsigned: nPos must stay below nMaxPos
    test    ecx,ecx
    jle     out_of_range            ; signed: index 0 (or negative) is invalid
    mov     esi,lpTbl
    mov     eax,[esi+ecx*4-4]       ; entry nPos-1
    ret
out_of_range:
    or      eax,-1
    ret
fucGetOffsetTable endp
;-----------------------------------------------------------------------
; fucGetLineLen - length in bytes of line dwInd (1-based).
; In:    lpsf  = pointer to a ucFile whose lpLineTable, nLineCount,
;                lpData and nSize fields are filled in
;        dwInd = 1-based line number
; Out:   eax = for every line but the last: (next line start) -
;              (this line start) - 2;
;              for the last line: (lpData + nSize) - (this line start);
;              0 when dwInd is 0 or greater than nLineCount.
; Clobb: ecx, edx, flags
;
; The line table is read directly, so the ucFile is never modified and
; no entry beyond nLineCount is touched. esi is preserved for the caller.
;-----------------------------------------------------------------------
fucGetLineLen proc uses esi lpsf:dword,dwInd:dword
    mov     esi,lpsf
    mov     ecx,dwInd
    test    ecx,ecx
    jz      bad_index                       ; line numbers start at 1
    cmp     ecx,[esi].ucFile.nLineCount
    ja      bad_index                       ; past the last line

    mov     edx,[esi].ucFile.lpLineTable
    mov     eax,[edx+ecx*4-4]               ; eax = start of line dwInd
    cmp     ecx,[esi].ucFile.nLineCount
    je      last_line                       ; there is no "next" entry to read

    mov     edx,[edx+ecx*4]                 ; edx = start of line dwInd+1
    sub     edx,eax
    lea     eax,[edx-2]                     ; drop one UTF-16 code unit
    ret

last_line:
    ; The last line runs up to the end of the file data.
    mov     edx,[esi].ucFile.lpData
    add     edx,[esi].ucFile.nSize
    sub     edx,eax
    mov     eax,edx
    ret

bad_index:
    xor     eax,eax
    ret
fucGetLineLen endp
;-----------------------------------------------------------------------
; fucWordCount - count space-separated words in a UTF-16 buffer.
; In:    lpWord = address of the first 16-bit character
;        nWord  = length of the buffer in bytes
; Out:   eax = number of runs of characters other than 32 (space)
; Clobb: ecx, edx, flags
;
; Only the value 32 separates words; every other 16-bit value counts as
; part of a word. The loop tests its bound at the bottom, so the first
; character is always examined, even when nWord <= 0.
;-----------------------------------------------------------------------
fucWordCount proc uses esi edi lpWord:dword,nWord:dword
    mov     esi,lpWord
    xor     ecx,ecx                     ; ecx = byte offset into the buffer
    xor     eax,eax                     ; eax = words seen so far
    xor     edx,edx                     ; edx = 1 while inside a word
next_char:
    cmp     word ptr [esi+ecx],32
    jne     word_char
    xor     edx,edx                     ; a space ends the current word
    jmp     advance
word_char:
    test    edx,edx
    jnz     advance                     ; still inside the same word
    inc     edx                         ; first character of a new word
    inc     eax
advance:
    add     ecx,2
    cmp     ecx,nWord
    jl      next_char
    ret
fucWordCount endp
;-----------------------------------------------------------------------
; fucGetWord - address of word `ind` on line `ln` (both 1-based).
; In:    lpucFile = pointer to a ucFile whose line table and word tables
;                   (lpReserved2, see fucBuildWordTable) are built
;        ln       = 1-based line number
;        ind      = 1-based word number within that line
; Out:   eax = pointer stored in the line's word table for that word,
;              or -1 when the line or word does not exist
; Clobb: ecx, edx, flags
;
; All indices are validated before any table is dereferenced: a failed
; line lookup returns -1 immediately instead of being handed to
; fucWordCount as a pointer, and index 0 is rejected instead of reading
; the DWORD in front of a table.
;-----------------------------------------------------------------------
fucGetWord proc uses esi edi lpucFile:dword,ln:dword,ind:dword
    LOCAL nOffs,nLine:dword
    mov     esi,lpucFile
    cmp     ind,0
    je      no_word                         ; word numbers start at 1

    invoke  fucGetOffsetTable,ln,[esi].ucFile.lpLineTable,[esi].ucFile.nLineCount
    cmp     eax,-1
    je      no_word                         ; line number out of range
    mov     nOffs,eax
    invoke  fucGetLineLen,esi,ln
    mov     nLine,eax
    invoke  fucWordCount,nOffs,nLine        ; eax = words on this line
    cmp     ind,eax
    ja      no_word                         ; also covers a line with 0 words

    mov     edx,[esi].ucFile.lpReserved2    ; table of per-line word tables
    test    edx,edx
    jz      no_word                         ; word tables were never built
    mov     ecx,ln
    mov     edx,[edx+ecx*4-4]               ; edx = word table of line ln
    test    edx,edx
    jz      no_word
    mov     ecx,ind
    mov     eax,[edx+ecx*4-4]               ; eax = address of word ind
    ret

no_word:
    or      eax,-1
    ret
fucGetWord endp
;-----------------------------------------------------------------------
; fucWordScaner - record the start address of every word in a buffer.
; In:    lpWord  = address of the first 16-bit character
;        nWord   = length of the buffer in bytes
;        lpTable = DWORD table that receives one pointer per word
; Out:   eax = number of words that were followed by a space; a final
;              word that runs to the end of the buffer is stored in the
;              table but not included in this count
; Clobb: ecx, edx, flags
;
; Word boundaries are detected exactly as in fucWordCount (only the
; value 32 separates words). No bound is applied to the table writes.
; The loop tests its bound at the bottom, so the first character is
; always examined.
;-----------------------------------------------------------------------
fucWordScaner proc uses esi edi lpWord:dword,nWord:dword,lpTable:dword
    mov     esi,lpWord                  ; esi walks the characters
    mov     edi,lpTable
    xor     ecx,ecx                     ; ecx = byte offset, for the bound test
    xor     eax,eax                     ; eax = finished words = next table slot
    xor     edx,edx                     ; edx = 1 while inside a word
scan_char:
    cmp     word ptr [esi],32
    jne     word_char
    test    edx,edx
    jz      advance                     ; space outside a word: nothing to do
    xor     edx,edx                     ; space closes the current word
    inc     eax
    jmp     advance
word_char:
    test    edx,edx
    jnz     advance                     ; already inside this word
    inc     edx
    mov     [edi+eax*4],esi             ; remember where the word starts
advance:
    add     esi,2
    add     ecx,2
    cmp     ecx,nWord
    jl      scan_char
    ret
fucWordScaner endp
;-----------------------------------------------------------------------
; fucWordLen - length in bytes of word nIndex on line ln (both 1-based).
; In:    lpucFile = pointer to a ucFile with line and word tables built
;        ln       = 1-based line number
;        nIndex   = 1-based word number within that line
; Out:   eax = - the whole line length when the line holds exactly one word
;              - (start of next word) - (start of this word) when a next
;                word exists (the separating spaces are included)
;              - (end of line) - (start of this word) for the last word
;              - 0 when the line or the word does not exist
; Clobb: ecx, edx, flags
;
; The line is validated before fucGetWord is called, the line length is
; fetched once, and the last word is measured against the end of its
; own line.
;-----------------------------------------------------------------------
fucWordLen proc uses esi edi lpucFile:dword,ln:dword,nIndex:dword
    LOCAL lpos1:dword
    LOCAL nOffs,nLine:dword
    mov     esi,lpucFile
    invoke  fucGetOffsetTable,ln,[esi].ucFile.lpLineTable,[esi].ucFile.nLineCount
    cmp     eax,-1
    je      no_len                          ; no such line
    mov     nOffs,eax
    invoke  fucGetLineLen,esi,ln
    mov     nLine,eax

    invoke  fucWordCount,nOffs,nLine
    cmp     eax,1
    jne     @F
    mov     eax,nLine                       ; single word: report the whole line
    ret
@@:
    invoke  fucGetWord,esi,ln,nIndex
    cmp     eax,-1
    je      no_len                          ; no such word
    mov     lpos1,eax

    inc     nIndex
    invoke  fucGetWord,esi,ln,nIndex        ; start of the following word
    cmp     eax,-1
    je      last_word
    sub     eax,lpos1
    ret

last_word:
    mov     eax,nOffs
    add     eax,nLine                       ; eax = end of the line
    sub     eax,lpos1
    ret

no_len:
    xor     eax,eax
    ret
fucWordLen endp
;-----------------------------------------------------------------------
; fucBuildWordTable - build one word-pointer table per line.
; In:    lpucFile = pointer to a ucFile whose lpLineTable and nLineCount
;                   are already filled in
; Out:   [lpucFile].lpReserved2 = array of nLineCount DWORDs; entry k-1
;        points to the word table of line k for k = 1 .. nLineCount-1.
;        The entry for the last line is not written: fucGetOffsetTable
;        rejects index nLineCount, so that line cannot be looked up here.
; Clobb: eax, ecx, edx, flags
;
; The loop bound is tested before the first iteration, so a file with a
; single line no longer feeds a failed (-1) lookup to fucWordCount.
; NOTE(review): the null checks assume mAlloc returns 0 on failure -
; confirm against the mAlloc implementation.
;-----------------------------------------------------------------------
fucBuildWordTable proc uses esi edi lpucFile:dword
    LOCAL nIndex,nOffs,nLine,nlpTbl:dword
    mov     esi,lpucFile
    mov     ecx,[esi].ucFile.nLineCount
    shl     ecx,2                           ; one DWORD per line
    invoke  mAlloc,ecx
    mov     [esi].ucFile.lpReserved2,eax
    test    eax,eax
    jz      done

    mov     nIndex,1
    jmp     check_line

build_line:
    invoke  fucGetOffsetTable,nIndex,[esi].ucFile.lpLineTable,[esi].ucFile.nLineCount
    cmp     eax,-1
    je      next_line                       ; lookup failed: leave the slot alone
    mov     nOffs,eax
    invoke  fucGetLineLen,esi,nIndex
    mov     nLine,eax

    invoke  fucWordCount,nOffs,nLine
    shl     eax,2                           ; one DWORD per word
    invoke  mAlloc,eax
    mov     nlpTbl,eax
    mov     ecx,nIndex
    mov     edx,[esi].ucFile.lpReserved2
    mov     [edx+ecx*4-4],eax               ; slot nIndex-1 = this line's table
    test    eax,eax
    jz      next_line
    invoke  fucWordScaner,nOffs,nLine,nlpTbl

next_line:
    inc     nIndex
check_line:
    mov     ecx,nIndex
    cmp     ecx,[esi].ucFile.nLineCount
    jl      build_line
done:
    ret
fucBuildWordTable endp
;-----------------------------------------------------------------------
; fucCreateOffsetTable - fill the line table of a UTF-16 buffer.
; In:    lpSou    = address of the text
;        lpTbl    = DWORD table with room for lnCnt entries
;        FileSize = size of the text in bytes
;        lnCnt    = capacity of lpTbl in entries
; Out:   lpTbl[0] = lpSou; every following entry is the address of a
;        line-feed character (value 10), in file order.
;        eax = number of entries written (never more than lnCnt)
; Clobb: ecx, edx, flags
;
; Scanning stops as soon as the table is full, so a text with more line
; feeds than lnCnt-1 cannot write past the table. A trailing odd byte is
; ignored and an empty buffer is not read at all.
;-----------------------------------------------------------------------
fucCreateOffsetTable proc uses esi edi lpSou:dword,lpTbl:dword,FileSize:dword,lnCnt:dword
    mov     esi,lpSou                   ; esi walks the characters
    mov     edi,lpTbl
    xor     eax,eax                     ; eax = entries written so far
    cmp     lnCnt,eax
    je      done                        ; no room for even the first entry
    mov     [edi],esi                   ; line 1 starts at the buffer start
    inc     eax

    mov     edx,FileSize
    and     edx,-2                      ; whole 16-bit characters only
    xor     ecx,ecx                     ; ecx = byte offset
    cmp     ecx,edx
    jae     done
scan_char:
    cmp     word ptr [esi],10
    jne     advance
    cmp     eax,lnCnt
    jae     done                        ; table full: stop instead of overflowing
    mov     [edi+eax*4],esi
    inc     eax
advance:
    add     esi,2
    add     ecx,2
    cmp     ecx,edx
    jb      scan_char
done:
    ret
fucCreateOffsetTable endp
;-----------------------------------------------------------------------
; fucLineCount - number of lines in a UTF-16 buffer.
; In:    lpTheFile = address of the text
;        FileSize  = size of the text in bytes
; Out:   eax = 1 + number of line-feed characters (value 10)
; Clobb: ecx, edx, flags
;
; Line feeds are counted because fucCreateOffsetTable stores one table
; entry per line feed; counting the same character keeps a table
; allocated from this result large enough. For CR/LF text the result is
; the same as counting carriage returns. A trailing odd byte is ignored
; and an empty buffer is not read at all.
;-----------------------------------------------------------------------
fucLineCount proc uses esi edi lpTheFile:dword,FileSize:dword
    mov     esi,lpTheFile
    mov     edx,FileSize
    and     edx,-2                      ; whole 16-bit characters only
    mov     eax,1                       ; text without a line feed is one line
    xor     ecx,ecx                     ; ecx = byte offset
    jmp     check
count_char:
    cmp     word ptr [esi+ecx],10
    jne     @F
    inc     eax
@@:
    add     ecx,2
check:
    cmp     ecx,edx
    jb      count_char
    ret
fucLineCount endp
;-----------------------------------------------------------------------
; fucLoadFile - load a UTF-16 text file and build its line/word tables.
; In:    lpFileName = address of the zero-terminated file name
;        lpucFile   = pointer to the ucFile to fill in
; Out:   eax = 1 on success, 0 when the file is missing or cannot be read
;        On success lpData, nSize, nLineCount, lpLineTable and
;        lpReserved2 of the ucFile are set.
; Clobb: ecx, edx, flags
;
; MASM32's `exist` reports a missing file with 0, and `read_disk_file`
; allocates the buffer itself and stores the pointer and byte count
; through its 2nd and 3rd arguments, so no buffer is allocated here.
; nSize is taken from the byte count actually read.
;-----------------------------------------------------------------------
fucLoadFile proc uses esi edi lpFileName:dword,lpucFile:dword
    LOCAL nFile:dword
    mov     esi,lpucFile                    ; esi = ucFile, survives the calls

    invoke  exist,lpFileName
    .if eax==0
        invoke  MessageBox,0,CADD("File not Found"),0,0
        xor     eax,eax
        ret
    .endif

    lea     edx,[esi].ucFile.lpData
    invoke  read_disk_file,lpFileName,edx,addr nFile
    test    eax,eax
    jz      load_failed                     ; eax is already 0

    mov     eax,nFile
    mov     [esi].ucFile.nSize,eax

    invoke  fucLineCount,[esi].ucFile.lpData,nFile
    mov     [esi].ucFile.nLineCount,eax
    shl     eax,2                           ; one DWORD per line
    invoke  mAlloc,eax
    mov     [esi].ucFile.lpLineTable,eax

    invoke  fucCreateOffsetTable,[esi].ucFile.lpData,[esi].ucFile.lpLineTable,nFile,[esi].ucFile.nLineCount
    invoke  fucBuildWordTable,esi
    mov     eax,1
load_failed:
    ret
fucLoadFile endp
Example of how to use it:
;-----------------------------------------------------------------------
; ScanUnique - write the distinct words of a loaded file to "Result.txt".
; In:    lpFile = pointer to a ucFile filled in by fucLoadFile
; Out:   "Result.txt" holds a 0FEFFh byte-order mark followed by each
;        word that ucFind did not already locate in the output, each
;        followed by a carriage return.
; Clobb: eax, ecx, edx, flags
;
; Lines 1 .. nLineCount-1 and, on each line, words 1 .. nWC-1 are
; visited (both loops test their bound at the bottom).
; A word is staged in a 1024-byte stack buffer; its length is clamped so
; the copy cannot overrun the buffer and a zero terminator always
; remains. Lines and words whose lookup fails (-1) are skipped.
;-----------------------------------------------------------------------
ScanUnique proc uses esi edi lpFile:dword
    LOCAL rFile,fInd,nOffs,nLine:dword
    LOCAL nWC,nOW,wInd,wLen,rOff:dword
    LOCAL buff[256]:dword
    mov     esi,lpFile

    ; Output buffer: the input size plus slack for the byte-order mark
    ; and the 4-byte terminator written after every word.
    mov     ecx,[esi].ucFile.nSize
    add     ecx,8
    invoke  mAlloc,ecx
    mov     word ptr [eax],0FEFFh           ; byte-order mark
    mov     word ptr [eax+2],0              ; word list starts out empty
    add     eax,2
    mov     rFile,eax                       ; rFile = start of the word list
    mov     rOff,eax                        ; rOff  = where the next word goes

    mov     fInd,1
each_line:
    invoke  fucGetOffsetTable,fInd,[esi].ucFile.lpLineTable,[esi].ucFile.nLineCount
    cmp     eax,-1
    je      next_line                       ; no such line
    mov     nOffs,eax
    invoke  fucGetLineLen,esi,fInd
    mov     nLine,eax
    invoke  fucWordCount,nOffs,nLine
    mov     nWC,eax

    mov     wInd,1
each_word:
    .if nWC==0 ; If word count is 0, it means there is no word.
        invoke  dw2a,nWC,addr buff
        invoke  MessageBox,0,addr buff,0,0
        jmp     next_word
    .endif
    invoke  fucGetWord,esi,fInd,wInd
    cmp     eax,-1
    je      next_word                       ; no such word
    mov     nOW,eax
    invoke  fucWordLen,esi,fInd,wInd
    .if eax > 1022                          ; unsigned: also catches negatives
        mov     eax,1022                    ; keep the final 16-bit zero intact
    .endif
    mov     wLen,eax

    invoke  memfill,addr buff,1024,0
    invoke  MemCopy,nOW,addr buff,wLen
    invoke  ucRtrim,addr buff,addr buff
    invoke  ucFind,1,rFile,addr buff
    .if eax==0
        ; Not in the output yet: append the word and a CR terminator.
        invoke  ucLen,addr buff
        shl     eax,1                       ; characters -> bytes
        mov     wLen,eax
        invoke  MemCopy,addr buff,rOff,wLen
        mov     eax,wLen
        add     rOff,eax
        invoke  MemCopy,CADD(0dh,0h,00h,00),rOff,4
        add     rOff,2                      ; keep the CR, overwrite the zero next time
    .endif
next_word:
    inc     wInd
    mov     ecx,wInd
    cmp     ecx,nWC
    jl      each_word

next_line:
    inc     fInd
    mov     ecx,fInd
    cmp     ecx,[esi].ucFile.nLineCount
    jl      each_line

    sub     rFile,2                         ; include the byte-order mark again
    mov     ecx,rOff
    sub     ecx,rFile                       ; ecx = bytes to write
    invoke  write_disk_file,CADD("Result.txt"),rFile,ecx
    ret
ScanUnique endp
What you should put in your code:
.data
fq ucFile <>                    ; descriptor of the loaded file, all fields 0
.code
; Load the file, then write its distinct words to Result.txt.
invoke fucLoadFile,CADD("YourUnicodeFile.txt"),addr fq
invoke ScanUnique,addr fq
This is a draft; I can make it faster, but not right now. Please tell me if it does not work on your machine or if it causes a crash. It works on my XP machine.
[edit] The last version only allowed you to use an Arabic font; this one can use any Unicode text.