News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

How to assign variables in My code ?(.code section)

Started by Rainstorm, April 04, 2007, 09:32:30 PM

Previous topic - Next topic

Rainstorm

hi,

In this sample code I've put an array of pointers to memory addresses of words,..into memory & want to use them to access the associated words. - what are the ways this is usually done ?
As you can see in the code, printing from one of the stored addresses also outputs all the words that follow it.
how can i access just that word ?   
thanks.


    .data

search_string  db "hello there all of you",0
count_         dd 0
ptokenarray    dd 0

.code
start:
   mov esi, offset search_string                ; memory addresss
   
   invoke StrLen, esi
   mov ebx, eax
   add ebx, ebx
   print "total characters - "
   print ustr$(ebx),13,10

   mov ptokenarray, alloc(512)
   mov edi, ptokenarray

   mov [edi], esi
   add edi, 4
   
   sub esi,1

   seperate_words:
       add esi, 1
       cmp byte ptr [esi], 0        ; check for zero terminator
       je preexit_
       cmp byte ptr [esi], 32      ; look for space
       jne seperate_words

; -·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-
Store the addresses in memory
; -·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-
       add count_, 1                  ; keep count of the seperators(spaces)
       add esi, 1
       mov [edi], esi
       add edi, 4
       sub esi, 1
       jmp seperate_words
; -·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-
Displays the results       
; -·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-·-
   preexit_:     
       mov ecx, count_
       
      exit_:
       mov edx, ptokenarray
       mov ebx, [edx+ecx*4]
    push ecx
       print "string - "
       print ebx,13,10
    pop ecx
       sub ecx, 1
       jnc exit_

      print "total spaces - "
      print ustr$(count_),13,10
           
      add count_, 1
      print "Total Words - "
      print ustr$(count_),13,10
 
    inkey
    free ptokenarray         ; free the memory
    exit

end start

dsouza123

If you copy the text between pointers to a temporary string,
it will give you an individual word (and a space), which can then be printed.

You would also have to have a pointer to the zero terminator,
for it to work for the last word.

Rainstorm

thanks for the reply dsouza
was just wondering how its generally done,..accessing tokens once you have pointers to them.


sebart7

How to access words by pointers, fast.
An example says more than words.


.386
.model flat,stdcall
option casemap:none

include         \masm32\include\windows.inc
include         \masm32\include\kernel32.inc
include         \masm32\include\user32.inc
includelib      \masm32\lib\kernel32.lib
includelib      \masm32\lib\user32.lib

GetWordByPointer        PROTO   :DWORD,:DWORD

.data

; Static table of addresses: entry N holds the address of wordN below.
pointersArray   dd      offset word0
                dd      offset word1
                dd      offset word2
                dd      offset word3

; The zero-terminated strings the table points at.
word0           db      "ThisIsWord0",0
word1           db      "AnotherWord1",0
word2           db      "AndHereIsWord2",0
word3           db      "AndWord3Here",0

.code
start:

        ; Fetch table entry 3; the returned address (eax) is used
        ; both as the message text and as the caption.
        invoke GetWordByPointer,addr pointersArray,3
        invoke MessageBox,0,eax,eax,MB_OK or MB_ICONINFORMATION

        ; Same again for table entry 1.
        invoke GetWordByPointer,addr pointersArray,1
        invoke MessageBox,0,eax,eax,MB_OK or MB_ICONINFORMATION


        invoke ExitProcess,0

GetWordByPointer PROC uses ebx esi lpPointersArray:DWORD,wordNr:DWORD

        ; In:  lpPointersArray = address of a table of DWORD entries
        ;      wordNr          = zero-based index into that table
        ; Out: eax             = the DWORD stored in entry wordNr
        ; No range check is made on wordNr.
        mov ebx,wordNr
        mov esi,lpPointersArray
        mov eax,dword ptr[esi+ebx*4]    ; each entry is 4 bytes, so the index is scaled by 4
        ret

GetWordByPointer ENDP

end start


This Example was to explain idea about pointers-words and fast accesing it.
Then oh yes, You dont want it static (like in this example) but dynamic created "somewhere" in memory, so :

You have to :
1. scan inputBuffer and replace every byte that is <020h or >07fh with 020h
(It make You able to give ANY data as inputBuffer and extract only words from it, for example kernel32.dll as a inputBuffer ,lol)
2. now count words in inputBuffer to know how much memory to alocate for outputBuffer.
(each time You encounter word begining You do    inc wordsCount)
3. allocate memory equal to (allWordsSize+wordsCount)+(wordsCount*4)+4
Lets say example input string is "word1 word2 someword3"
then
allWordsSize = 19
wordsCount = 3
4. Here You create output data in outputBuffer.
Scan inputBuffer and each time when You find begining of word, You have to create pointer (leading to outputBuffer)
and copy encountered word there, After You copy word, "close" copied word by chr(0)
Example of how this buffer should looks like for "word1 word2 someword3" :


0      dd  3                               ; words (pointers) in buffer. (for error check, do not try to read pointer 4 ,ect)
4      dd  16
8      dd  22
12    dd  28
16    db  word1 chr(0)
22    db  word2 chr(0)
28    db  someword3 chr(0)


Ok. So here You have just created outputBuffer somewhere in allocated memory. (of course You know Your lpYourAlocatedMemory)

Here is how You access this dynamic buffer, similar to example above.


GetWordByPointer PROC uses ebx esi lpPointersArray:DWORD,wordNr:DWORD

        ; In:  lpPointersArray = address of the allocated buffer, laid out as
        ;                          DWORD  wordsCount
        ;                          DWORD  offset of each word, relative to the buffer start
        ;                          BYTEs  the zero-terminated words
        ;      wordNr          = zero-based word index (valid: 0 .. wordsCount-1)
        ; Out: eax             = address of the requested word,
        ;                        or 0 when wordNr is out of range

        mov esi,lpPointersArray
        mov ebx,wordNr
        cmp ebx,dword ptr[esi]
        jae   error_NoMoreWords         ; wordNr == wordsCount is already past the last pointer
; --------
        mov eax,dword ptr[esi+ebx*4+4]  ; +4 skips wordsCount; each pointer is 4 bytes
        add eax,esi                     ; relative offset -> absolute address
        ret
; --------
error_NoMoreWords:
        xor eax,eax                     ; return NULL
        ret
GetWordByPointer ENDP


Rainstorm

sebart7, many thanks for the reply.

was digesting what you said & just need some clarification on some stuff.
Quote1. scan inputBuffer and exchange all bytes that are <020h and >07fh with 020h
I understand that, that's between ASCII 32 & 127 (all the text characters) -  but why replace them with 020h ?

QuoteAfter You copy word, "close" copied word by chr(0)
chr(0) = ASCII  0 value ? right ?

so my output buffer would look something like this ?
word1-0-word2-0-word3-0-word4-0.......
is that right ?

I should have named this thread how to make tokens.
-



sebart7

#20
Heres a small explanation to my previous post that i think may reply to Your questions :

As I said before in another topic, in some cases (especially here, where You can't tell what is in the buffer because it wasn't created exclusively as a data pack, but is just a random text file or even a binary selected by the user as input) You should not expect input data in any normalised format. In that case, suppose You use a simple algorithm that works this way: "each time You encounter chr(020h) it means end-of-word, so copy this word and continue the scan; when You encounter chr(0) it means end of input". It will work right only for a string in a clean format like "word1 word2 word3",chr(0). Now consider that the input was created in Notepad, which adds CR,LF (00dh,00ah) at the end of each line and does not put (0) at the end of the file. Also let's say the user did not press Enter on the last line, so the last word has nothing after it (no CR/LF, no (0), nothing). In addition, the user who created this *.txt file randomly put extra spaces between some words. Here's an example of a short text file:

"word1 word2   someword",0dh,0ah,
"and here     some   sentence"

Now lets analyse what simple algoritm will produce :
We starting scaning and encounter space after "word1". ok lets copy "word1" as found word, then continue scan,
We found space after word2, lets copy "word2" Then we move our pointer +1 and.... Encounter SPACE, so algoritm
interprete it as a EndOfWord, while You remember last place where You start scaning (after last SPACE) and current position,
as a EndOfWord, It copy the "thing" that is betwin this 2 spaces meant "" (nothing) wow, after we copy fiew "nothing"
into our output and create fiew dead-pointers that lead to "nothing" we find begining of "Someword...."
What does the simple algorithm do here? It scans until it finds a SPACE, and because the next SPACE comes after the word "and", it copies the whole string "someword"+0dh+0ah+"and" as a single word (at last a SPACE here). OK, let's just skip the rest of the words, which will produce similar wrong output,
and lets move just to bottom of input data. There Is no Chr(0) and no chr(020h) What will simple scan routine do here ?
Of course it will wait for 0 or 020h and increment current pointer that will end up with pointer out-of-input-buffer, and will crash
everything at the end.  ::)
Ok here You say : "thats no problem, i just fix here and there by adding new conditions so i will be ready for it"
Um, Yes it do help, But its still expecting data in "known format". Each time when You will fix it by adding new exceptions
and rules Your algoritm will grow more and more complex, and still will encounter something that You not expected at first.
So whats now ?.... Maybe we should "normalize" somehow input buffer before we pass it to word-extraction routine.
Heres where this chr(020h) owerwriting comes out.
You should scan all bufer and each time when You encounter CHR less than 020h replace it with 020h, and when encounter CHR above 07fh also replace it with 020h. After You reach EndOfBuffer (That You know by its lenght)
Simply it will filter-out anything that is not a text and replace it with 020h. In this case no mater what You give as inputBuffer, text file/web page/random binary file,ect. after Pass1 you will ALWAYS receive inputData in normalised format, than You can safe pass to Extraction procedure. For example, if You give input buffer like this :
"someword",11h,9ah,b2h,0,0,0,12h,"someword2 word3",0ah,0dh"word4 word5" (wow thats complex)
after Pass1 You will receive this" :
"someword       someword2 word3  word4 word5"
And this is inputBuffer that allready folow some rules created by You, that You can be sure and expect in input buffer" :)
Now You can pass it to Extraction routine that respect fiew simple rules and no need to be complex, advanced and
expecting all-the-things-in-the-world-while-still-not-sure.
Now, whats will extraction routine do ?
We want it to create 2 arrays, array1 with pointers (where each pointer leads to each word) and array2 with words itself.
We should alocate memory for it first, But how to tell "how much memory we need ?" There are ponters, words, and we dont even
extract it yet so how to know it  :dazzled: ....
Here comes Pass2 :)
Pass 2 will scan inputBuffer (we allready have it in normalised format so there is no fear that something will not work)
and count how many words we can extract and how many chars it will get in total.
After pass 2 we should know 2 things :
for inputBuffer = "someword       someword2 word3  word4 word5"
wordsCount = 5
totalBytes = 32
Now we can calculate how much memory to allocate for outputBuffer :)

amountOfMemoryToAlocate = (totalBytes+wordsCount)+(wordsCount*4)+4
because :
(totalBytes+WordsCount) ; we need memory to store words (totalBytes), and we will "terminate" each word by chr(0)
                                    ; (add 0 at end of each word) so we need 5 aditional bytes because there are 5 words.
(wordsCount*4)              ; each word will have its pointer where one pointer is DWORD (4bytes) so its WordsCount*4
+4                                ; we add one more DWORD where we will store how many words are hold in dataBase (outputBuffer)
                                    ; we will need it to know later how many words is in database, so we dont need to count words each time
                                    ; only to reply "error : there is no word Nr 10" (while there are 8 words in database)  :)

Here an example on how dataBase outputBuffer may looks like :

for inputBuffer = "someword       someword2 word3  word4 word5"

outPut may look this way :

DWORD    valWordsCount       5       ; how many words (pointers) we have in dataBase
DWORD    lpWord1                24     ; pointer that leads to word1 terminated by 0 (szString)
DWORD    lpWord2                33     ; pointer to word 2...
DWORD    lpWord3                43     ; pointer to word 3...
DWORD    lpWord4                49     ; pointer to word 4...
DWORD    lpWord5                55     ; pointer to word 5...
BYTE       someword,0                    ; word1 as szString (terminated by 0)
BYTE       someword2,0                  ; word2 as szString (terminated by 0)
BYTE       word3,0                         ; word3 as szString (terminated by 0)
BYTE       word4,0                         ; word4 as szString (terminated by 0)
BYTE       word5,0                         ; word5 as szString (terminated by 0)

Small explanation why add 0 at the end of each word.
This way You store Your words in globaly-normalised format as a szString (null terminated string)
That give You wide adventages. Not only You can refer to any word just by pointer itself alone leading to it, and later find
where it finish by just finding (0) if need, but also You can pass this pointer to most of Windows APIs or even external
engines (other applications), and it will be properly recognised as a szString.
Simple and quick example of how its universal can be, receiving word pointer from Your GetWordByNumber Procedure
and pass it directly to example Windows API, MessageBox.

    invoke GetWordByNumber,2                  ; get pointer to szWord 2
    invoke MessageBox,0,eax,eax,MB_OK      ; pass it directly to WindowsAPI

This way (when Your string is terminated by 0) You can pass it to almost any Windows Function, Set it to text boxes, save to registry ect without worrying that in real its a part of Your large-database, how it will find where word ends ect. szString parameter is wide used standard.

Ok, back to the topic :
After PASS1 (that did normalise Your inputBuffer)
and PASS2 (that You did allready to know how much memory You need to allocate)
You can start to extract words.

PASS3 :
You can do this in any ways, here is just my example :
Initialise ESI to lpInputBuffer
and EDI to lpAllocatedMemory

Now we need 2 pointer modificators.
Let ECX points to pointersArray in lpAllocatedMemory, so ECX=4 because first 4 bytes are occupied by wordsCount allready.
(it will be used as [EDI+ECX]  meant lpAllocatedMemory+4)
EDX should point to wordsArray in allocatedMemory so it should be initialised to 4+(wordsCount*4) to skip first4 bytes
that are occupied by wordsCount value and all words pointers, wordsCount*4)
(it will be used as [EDI+EDX] meant lpAllocatedMemory + 24)

Lets start extracting :
1. examine input-buffer-char that ESI points to (You looking now for word begining)
2. If its 020h meant there is no word yet so INC ESI and go again to line 1
3. if its not a 020h meant we just found a begining of word,
   in that case pass EDX that now its (24) to pointersArray ([EDI+ECX]) and make ECX points to next pointer in array
   so we can store new pointer next time.
       ( mov dword ptr[EDI+ECX],EDX
         add  ECX,4 )
4. Now You copy each char to wordsArray ([EDI+EDX]) and increment EDX (and ESI ofcourse) each time You pass new char.
    if You encounter 020h in this moment, here You found end of word, and You should pass 0 to wordsArray
    (identical way like when You pass a normal char, where You increment EDX (and ESI) ect.)
    But after passing 0 You allready copied all word so You have to start looking for begining of a new word, so jump to point 1

And wow! heres im amazed  :eek because its.... ALL we have to do  :bg
Of course You should implement in PASS3, check for end of InputBuffer (You check it by lenght of inputBuffer)
like :  if currentOffset >= lenInputBuffer then Exit   (compare the offset into the buffer, not the pointer itself, with the length)
you can do this in many ways, for example :
Do not increment ESI but use ESI+EBX where ESI holds static lpInputBuffer and EBX is changing (0,1,2,3,4,ect)
so all You have to do is check EBX. and when EBX=lenInputBuffer its end of inputBuffer. ect,
(but here its begin to be short of registers  :wink, maybe we should store somedata in memory ? maybe use push, pop, ?)
Anyway, You should remember that when You encounter end of inputBuffer You cant simply exit.
It depends where You did encountered end of inputBuffer. In part 1 (lines 1-3) You can exit because, there is no word
processed in that moment. But if You encounter end of inputBuffer while in part2 (line 4), keep in mind that You do procesing
word in that moment, so what to do ? Pass chr(0) to wordsArray that will close last word, and exit :)

After You create all Your dataBase (outputBuffer) You can put in position 0 (first 4 bytes) wordsCount value, that will be used
later by GetWordByNumber ,error-checking part.

As You see, 3 simple passes combined together, give You simple and error free, but powerful processing engine that
can accept input in ANY-ultra-complex format (try to give to it kernel32.dll ?, jpgs ? whatever. text files ? thats too easy :lol )
and process it without problems.
-------------------------------------------------------------------------------------------------------------------------------------------------------------------

We have allready our database in outputBuffer. How to access each word fast ? (no scaning all buffer each time,
searching for terminating 0 ect). Thats where pointers comes out :)
All we have to do is get word from wordsArray by its pointer.

As we know, each pointer takes 4 bytes and the first 4 bytes are occupied by the wordsCount value, so the pointers start at lpBufferInMemory + 4. With words numbered from 1 (1,2,3,4,5), the pointer for a given word is located at
lpBufferInMemory + 4 + ((wordNumber-1)*4)      ; which is simply lpBufferInMemory + (wordNumber*4)

Here how it can be done :
(While writing GetWordByNumber procedure we should also consider how we want assign numbers to words,
meant if we will use 0 as a word number or we rather chose human-readamble form and start count words from 1 ?
1,2,3,4,5 instead of 0,1,2,3,4. Lets chose human-readable form here where 1 = first word)

Example is more than words :

; ---------------------------------------------------------------------------
;       You need RadAsm IDE or anything that is masm32 compatible
;       to compile this example.
; ---------------------------------------------------------------------------
.386
.model flat,stdcall
option casemap:none

include                 \masm32\include\windows.inc
include                 \masm32\include\kernel32.inc
include                 \masm32\include\user32.inc
includelib              \masm32\lib\kernel32.lib
includelib              \masm32\lib\user32.lib

GetWordByNumber         PROTO           :DWORD,:DWORD
; ---------------------------------------------------------------------------

.data

szTitle         db      " ",0

; A hand-built database holding 5 words, used only to exercise
; GetWordByNumber. A real program would allocate memory and build
; the same layout there. dbb_ must directly follow the DWORD table,
; because the offsets below are relative to dataBaseBuffer.
; ___________________________________________
; - Example dataBase ------------------------
dataBaseBuffer  dd      5                       ; wordsCount
                dd      24                      ; offset of word 1
                dd      33                      ; offset of word 2
                dd      43                      ; offset of word 3
                dd      49                      ; offset of word 4
                dd      55                      ; offset of word 5
dbb_            db      "someword",0
                db      "someword2",0
                db      "word3",0
                db      "word4",0
                db      "word5",0
; -------------------------------------------

.code
; ---------------------------------------------------------------------------
start:
        ; Ask for word 3. On success eax holds the address of a
        ; zero-terminated string, which can go straight to a Windows API.
        invoke GetWordByNumber,addr dataBaseBuffer,3
        .if eax != 0                    ; 0 means error: skip the MessageBox
            invoke MessageBox,0,eax,addr szTitle,MB_OK or MB_ICONINFORMATION
        .endif

        ; Ask for word 10, which the database does not contain;
        ; the call returns 0 in that case.
        invoke GetWordByNumber,addr dataBaseBuffer,10
        .if eax != 0                    ; 0 means error: skip the MessageBox
            invoke MessageBox,0,eax,addr szTitle,MB_OK or MB_ICONINFORMATION
        .endif


        ; Leave the process
        invoke ExitProcess,0

; ___________________________________________________________________________
; ---------------------------------------------------------------------------
; Heres an GetWordByNumber example

.data
GWBN_szMessageOops   db      "oops, there is no such word in database. Error",0
.code
GetWordByNumber PROC uses ebx esi lpPointersArray:DWORD,wordNr:DWORD
; In:
;   lpPointersArray   address of the memory holding the database
;                     (DWORD wordsCount, DWORD offsets, then the strings)
;   wordNr            requested word number, counted from 1
; Out (eax):
;   address of the zero-terminated word on success,
;   0 when wordNr is 0 or greater than wordsCount
;   (a warning MessageBox is shown first in that case)
; ------===------------------------
        mov esi,lpPointersArray
        mov ebx,wordNr
        test ebx,ebx                    ; word numbers start at 1
        jz  GWBN_ErrorExit
        cmp ebx,dword ptr[esi]          ; first DWORD of the buffer is wordsCount
        ja  GWBN_ErrorExit
; ------===
        mov eax,dword ptr[esi+ebx*4]    ; offset of the word, relative to the buffer start
        add eax,esi                     ; make it an absolute address
        ret
; ------===
GWBN_ErrorExit:
                                        ; comment the next line out to suppress the error message
        invoke MessageBox,0,addr GWBN_szMessageOops,addr szTitle,MB_OK or MB_ICONWARNING
        xor eax,eax                     ; return NULL: no such word number
        ret
GetWordByNumber ENDP
; ---------------------------------------------------------------------------

end start




Rainstorm

sebart7, Thanks a lot ! for that detailed eplanation about how its done & the perspective on the whole thing. : )
- that normalising thing would keep things simple, & make things easier.

is    inc ecx   faster than   add ecx, 1  ?

same question again, you use
   shl ebx,2   
   mov eax,dword ptr[esi+ebx]


instead of..
mov eax,dword ptr[esi+ebx*4]

because its faster ?

I have some more ascii/txt related questions like about searching, & large txt files  but think i'll post
them in another thread.

-
Rainstorm

sebart7

Quoteis    inc ecx   faster than   add ecx, 1  ?
Hm, when i look into technical documentation, it shows that in current case its equal in processor cycles.
However, after dissasembly You can see that its shorter  :bg
00401002 90                   nop
00401003 40                   inc         eax
00401004 90                   nop
00401005 90                   nop
00401006 83 C0 01          add        eax,1
00401009 90                   nop

Quote
same question again, you use
   shl ebx,2   
   mov eax,dword ptr[esi+ebx]
instead of..
   mov eax,dword ptr[esi+ebx*4]
because its faster ?
No, ofcourse You can do same thing in many ways, use different registers, algorithms, ect.
Here i think
    mov eax,dword ptr[esi+ebx*4]
can be faster than
   shl ebx,2   
   mov eax,dword ptr[esi+ebx]
especialy while shifting bits gets additional processor cycles, but for sure its 3 bytes shorter  :U
0040107C C1 E3 02          shl        ebx,2