News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

a counter of lines in mmx instructions

Started by ToutEnMasm, March 18, 2009, 09:20:18 AM

Previous topic - Next topic

ToutEnMasm

Thanks,
now
;CompteurLignes:
;3615541 cycles for 22274 lines, 849788 bytes

;Scrutation esi edi:
;5727865 cycles for 22274 lines, 849788 bytes

UtillMasm

CountLi.exe Crash

@@:
  pxor xmm5, xmm5
  ;i think it is the unrolling that is hosing things.
  i = 0
  WHILE i LT (16*128)
   movdqu xmm0, [edx + i +  0]
;Memory access violation
                         ECX 00000001

   movdqu xmm1, [edx + i + 16]
   movdqu xmm2, [edx + i + 32]
   movdqu xmm3, [edx + i + 48]

   pcmpeqb xmm0, xmm7
   pcmpeqb xmm1, xmm7
   pcmpeqb xmm2, xmm7
   pcmpeqb xmm3, xmm7

   paddb xmm0, xmm1
   paddb xmm2, xmm3

   paddb xmm0, xmm2

   ; can't you do this step outside the loop?  I am pretty sure you can.
   psubb xmm5, xmm0 ; total 128*8 max = 1K
   i=i+16*4
  ENDM

; unpack MM5 to get sum in 1K block
pxor xmm0, xmm0
psadbw xmm5, xmm0
paddd xmm6, xmm5

dec ecx
lea edx, [edx + 128*16]
jne @B

lingo

ToutEnMasm,
You can try my code too:  :wink
option prologue:none
option epilogue:none
align 16
db 8Dh, 0A4h, 24h, 0,0,0,0, 8Dh, 0A4h, 24h, 0, 0, 0,0,0
getlines proc pBuffer : dword, nSize:dword
pop ecx ; ecx -> return address
mov edx, 0a0a0a0ah ; 0Ah->second byte to looking for
pop eax ; eax->pBuffer-> eax->16 bytes aligned !!!ÿ
movd xmm0, edx ; edx = 0a0a0a0ah
mov [esp-4],esi ; preserve esi register
pop esi         ; esi -> size of buffer
mov [esp-4], ebx ; preserve ebx register
add esi, eax ;
pshufd xmm0, xmm0, 0 ; xmm0=0a0a0a0a0a0a0a.....
movdqa xmm1, xmm0 ; xmm1=xmm0=0a0a0a0a0a0a0a.....
xor ebx, ebx ; ebx->counter=0

;Fast part - 1-> looking for 0Ah simbol in next 16 bytes
LoopA:
cmp eax, esi ; Is it End of Buffer?
jg ExLoop ; Yes, exit
pcmpeqb xmm0, [eax] ;  [eax]->16bytes aligned!!!
add eax, 16 ;  next qword
pmovmskb ecx, xmm0 ; test mask for zero
movdqa xmm0, xmm1 ; xmm0=xmm1=0a0a0a0a0a0a0a.....
test ecx, ecx ; if zero loop again
jnz Loli

;Fast part - 2
cmp eax, esi ; Is it End of Buffer?
jg ExLoop ; Yes, exit
pcmpeqb xmm0, [eax] ; [eax]->16bytes aligned!!!
add eax, 16 ; next qword
pmovmskb ecx, xmm0 ; test mask for zero
movdqa xmm0, xmm1 ; xmm0=xmm1=0a0a0a0a0a0a0a.....
test ecx, ecx ; if zero loop again
jz LoopA ; if not zero found 0Ah simbol

;Slow part-2
bsf edx, ecx
cmp word ptr [edx+eax-17], 0A0Dh
jne @f
add ebx, 1 ; inc counter
@@:
btr ecx, edx ; clear bit
test ecx, ecx ; is there more bits?
je LoopA ; no, loop again
bsf edx, ecx ; Yes, more...
cmp word ptr [edx+eax-17], 0A0Dh
jne @b
add ebx, 1 ; inc counter
jne @b

align 16
;Slow part-1
Loli:
bsf edx, ecx
cmp word ptr [edx+eax-17], 0A0Dh
jne @f
add ebx, 1 ; inc counter
@@:
btr ecx, edx ; clear bit
test ecx, ecx ; is there more bits?
je LoopA ; no, loop again
bsf edx, ecx ; Yes, more...
cmp word ptr [edx+eax-17], 0A0Dh
jne @b
add ebx, 1 ; inc counter
jne @b

;End
align 16
ExLoop:
mov esi, [esp-8]
mov eax, ebx
mov ebx, [esp-4]
jmp dword ptr [esp-12]
getlines endp
option prologue:prologuedef
option epilogue:epiloguedef   


jj2007

#18
Contrary to what is frequently stated in the Campus in replies to noobs, there are mindreaders among us. Lingo, how dare you steal my ideas :naughty:
At least, when you copy from your own code in other threads, do it properly:
Quotepmovmskb ecx, xmm0      ; test mask for zero 0a

Jokes apart, this is nice code, and getting faster (EDIT: on a P4):
Counting lines of \masm32\include\windows.inc:

markl_CountFileLines (Mark Larson):
582     kilocycles for 22272 lines, 849759 bytes

getlines (Lingo):
1412    kilocycles for 22272 lines, 849759 bytes

CompteurLignes: (ToutEnMasm)
5361    kilocycles for 22272 lines, 849759 bytes


Unfortunately, I was not able to get ToutEnMasm's implementation of...
   movups xmm1,[edx]         ;load 16 bytes, e..g
   movups xmm2,[edx+1]      ;charger (presque) les mêmes 16 bytes ;-)
...running - see the "if 0   ; new version, to be completed" branch in attachment.

[attachment deleted by admin]

ToutEnMasm

Best result i have is:
Quote
2 639 Kcycles for 22274 lines, 849788 bytes

Granted is no test crash
The memory isn't read outside the loaded file
Counted are the  13,10  (line return) ,no count is made of 10 alone or 13 alone
I have added a normal scrutation test with esi edi,gain of speed is about twice.
That is sure that without searching the couple 13,10 speed is better.


[attachment deleted by admin]

ecube

heh, never seen lingo get beat in a speed test before, but Mark Larson did write that optimization article so I guess it's not so shocking. I think Mark Larson is one of those quiet genesises, to where he doesn't post code very often but when he does it's golden. I have high hopes for agner too but haven't seen anything yet.  :bg

ToutEnMasm


this one is more risqued,but faster
It take the lingo method
pcmpeqb  xmm1,[esi]      ; dangerous,need 16 byte align


Quote
;1709746 cycles for 22274 lines, 849788 bytes
;align 16
CompteurLignes PROC uses ebx edi esi pmem:DWORD,taille:DWORD
         Local valuemmx:QWORD
         local  theEnd:dword,GroupCount,decalage
         Local  retour:DWORD
   ;init of various register
   mov GroupCount,0
   mov decalage,0
   lea eax,sse2entree
   movups xmm6,[eax]
   lea eax,sse2line
   movups xmm7,[eax]
   mov eax,taille         ;the size is a multiple of 16
   add eax,pmem         ;
   mov theEnd,eax
   mov esi,pmem      
   mov eax,0   ;line counter
   ;ready
   NewBloc:
   movdqa xmm1,xmm6         ;charge 13
   pcmpeqb  xmm1,[esi]      ;13 ,xmm1 modified
   pmovmskb ecx, xmm1 ; result for 13,10 in ecx
   test ecx,ecx   ;no line ?,continue
   jz suite
   NewSeriede16:
   bsf   edx,   ecx
   cmp   word   ptr [edx+esi], 0A0Dh
   jne   @f
   inc   eax
@@:
   btr   ecx,   edx      ; clear bit
   test   ecx,   ecx      ; is there more bits?
   je   suite         ; no,   loop again
   bsf   edx,   ecx      ; Yes, more...
   cmp   word   ptr [edx+esi], 0A0Dh
   jne   @b            
   inc   eax         ;inc  counter
   jne   @b      
   suite:
   lea esi,[esi+16]
   .if esi < theEnd
      jmp NewBloc
   .endif

FindeCompteurLignes:

lingo

E^cube,
"but Mark Larson did write that optimization article so I guess it's not so shocking"

1. this code is from bitRAKE:
http://www.asmcommunity.net/board/index.php?topic=13727.0

2. I wonder why it is included in the test program because it count just 0Ah bytes in the
buffer rather than  0D+0A bytes. It is easy to look for and count just 1 byte rather than 2 bytes


jj2007

#23
Quote from: lingo on March 19, 2009, 12:51:15 PM
E^cube,
"but Mark Larson did write that optimization article so I guess it's not so shocking"

1. this code is from bitRAKE:
http://www.asmcommunity.net/board/index.php?topic=13727.0
bitRAKE's full code (not yet SSE2) can be seen here. Mark apparently adapted it to SSE2.

Quote
2. I wonder why it is included in the test program because it count just 0Ah bytes in the
buffer rather than  0D+0A bytes. It is easy to look for and count just 1 byte rather than 2 bytes


It's included because for all text files on Windows and Linux, it does the job. But that question has to be negotiated with the guy who started this thread :wink

EDIT: It's included for historical reasons. In fact, it fails miserably for winextra.inc, see here.

ToutEnMasm

Hello,
Quote
It's included because for all text files on Windows and Linux, it does the job. But that question has to be negotiated with the guy who started this thread

Seems i recognize me,I have a tool like helphelp who write help index reading the title of html page,i have one another who read rtf file,another that read header files,perhaps i will write another tool that read another type of text files ....
Each text files have some special particularity that made it unreadable without a rule.
The rule is simple,a line break is 13,10 that is carridge return followed by a line feed or in c++ ..(i have forgotten how they call that).The format is given in hexadecimal or decimal  13,10=0dh,0ah
Like that we can read html,rtf,header files without bad suprise like stay in a middle of a line or in the midlle of a file.
I have forgotten linux that have is own format of text.
Speed isn't all,usefull code  is also something to consider.






ToutEnMasm


This one is to make a choice,dangerous or not dangerous,see nb of cycles

Quote
ALIGN16 equ
CompteurLignes PROC uses ebx edi esi pmem:DWORD,taille:DWORD
         local  theEnd:dword
         Local  retour:DWORD
         
   ;init of various register
   lea eax,sse2entree
   movups xmm6,[eax]
   mov eax,taille         ;the size is a multiple of 16
   add eax,pmem         ;
   mov theEnd,eax
   mov esi,pmem      
   mov eax,0   ;line counter
   ;ready
   NewBloc:
   IFDEF ALIGN16
      ;------ align 16 needed ----------
      ;1731187 cycles for 22274 lines
      movdqa xmm1,xmm6         ;charge 13
      pcmpeqb  xmm1,[esi]      ;13 ,xmm1 modified ,align 16      
   ELSE
      ;1828067 cycles for 22274 lines, mem align 16
      ;2312768 cycles for 22274 lines, mem non align
      movdqu xmm1,[esi]         ;charge 13
      pcmpeqb  xmm1,xmm6      ;13 ,xmm1 modified ,align 16      
   ENDIF
      
   pmovmskb ecx, xmm1 ; result for 13,10 in ecx
   NbLineBreak:
   bsf   edx,   ecx
   jz suite
   .if    word   ptr [edx+esi] == 0A0Dh
      inc   eax
   .endif
   btr   ecx,   edx
   jmp NbLineBreak

   suite:
   lea esi,[esi+16]
   .if esi < theEnd
      jmp NewBloc
   .endif

FindeCompteurLignes:
         ret
CompteurLignes endp


jj2007

There is still some room for improvement :wink

Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (SSE3)
Tests for correctness - 2*100 lines expected:
Mark Larson=    - /  - (throws exception)
jj2007=         100 / 100 lines
Lingo=          105 / 102 lines
ToutEnMasm=     105 / 102 lines

Counting lines of \masm32\include\windows.inc:

markl_CountFileLines (Mark Larson):
248     kilocycles for 22274 lines, 849788 bytes

getlinesJJ: (jj2007)
525     kilocycles for 22274 lines, 849788 bytes

getlines (Lingo):
683     kilocycles for 22274 lines, 849788 bytes

CompteurLignes: (ToutEnMasm)
1956    kilocycles for 22274 lines, 849788 bytes

[attachment deleted by admin]

Rainstorm

a small related question, is the MMX instruction set present in all CPUs after its introduction ?

t y

ToutEnMasm

I don't get the same results as jj2007 on my machine with the same executable
Quote
getlinesJJ: (jj2007)
1225    kilocycles for 22274 lines, 849788 bytes

getlines (Lingo):
1376    kilocycles for 22274 lines, 849788 bytes
Quote
CompteurLignes:
1378 Kcycles for 22274 lines, 849788 bytes

seems there is nothing really new this time


ToutEnMasm

I have gain a little time ,just making that
compare is made on 32 bytes,instead of 16
align 16 of memory is made by globalAlloc and it is not necessary to relign it as getlinesJJ do.
The minimum size of memory must be 32 bytes or there is  read memory outside the buffer
And memory allocation must rounded by 32 bytes

Quote
;1378 Kcycles
ALIGN16 equ
CompteurLignes PROC uses ebx edi esi pmem:DWORD,taille:DWORD
         local  theEnd:dword
         Local  retour:DWORD
         
   ;init of various register
   lea eax,sse2entree
   movups xmm6,[eax]
   mov eax,taille         ;the size is a multiple of 16
   add eax,pmem         ;
   mov theEnd,eax
   mov esi,pmem      
   mov eax,0   ;line counter
   ;ready
   NewBloc:
   IFDEF ALIGN16
      ;------ align 16 needed ----------
      movdqa xmm1,xmm6         ;charge 13
      movdqa xmm2,xmm6         ;charge 13      
      pcmpeqb  xmm1,[esi]      ;cmp with memory align 16      
      pcmpeqb  xmm2,[esi+16]      ;cmp with memory ,align 16            
   ELSE
      movdqu xmm1,[esi]         ;charge 13
      movdqu xmm2,[esi+16]         ;charge 13      
      pcmpeqb  xmm1,xmm6      ;13 ,xmm1 modified ,align 16   
      pcmpeqb  xmm2,xmm6      ;13 ,xmm1 modified ,align 16            
   ENDIF
      
   pmovmskb ecx, xmm1 ; result in ecx
   pmovmskb edx, xmm2 ; result +16 edx
   shl edx,16
   add ecx,edx
      
   NbLineBreak:
   bsf   edx,   ecx
   jz suite
   .if    word   ptr [edx+esi] == 0A0Dh
      inc   eax
   .endif
   btr   ecx,   edx
   jmp NbLineBreak

   suite:
   lea esi,[esi+32]
   .if esi < theEnd
      jmp NewBloc
   .endif

FindeCompteurLignes:
         ret
CompteurLignes endp