The MASM Forum Archive 2004 to 2012

General Forums => The Laboratory => Topic started by: denise_amiga on May 31, 2005, 07:42:44 PM

Title: szLen optimize...
Post by: denise_amiga on May 31, 2005, 07:42:44 PM
StrLen1 <--- from agner
StrLen2 <--- modification from original
szLen1 <--- original
szLen2 <--- modification from original
lstrlen <--- original from system

Quote
1472 -- 1378 clocks StrLen1
1472 -- 1196 clocks StrLen2
1472 -- 3077 clocks szLen1
1472 -- 2233 clocks szLen2
1472 -- 5870 clocks lstrlen


; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

    include     masm32rt.inc

    ;include    timers.asm

    StrLen1 proto   :DWORD
    StrLen2 proto   :DWORD
    szLen1  proto   :DWORD
    szLen2  proto   :DWORD

; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
    .data
      str0 db 64 dup ("my other brother darryl"),0

    .code
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; Benchmark driver.  For each length routine: the first call prints the
; computed length of str0 as a sanity check (str0 is 64 copies of a
; 23-byte phrase = 1472 bytes, so every row should start with 1472),
; then the timers.asm macros run 10,000,000 timed calls at
; REALTIME_PRIORITY_CLASS and the averaged clock count is printed.
    ;repeat 3          ; Used to check sensitivity to alignment
    ;  nop
    ;endm

    invoke  StrLen1, addr str0
    print ustr$(eax)                ; sanity check: length of str0
    print chr$(" -- ")
    counter_begin 10000000, REALTIME_PRIORITY_CLASS
      invoke StrLen1, addr str0
    counter_end
    print ustr$(eax)                ; counter_end leaves the averaged clocks in eax
    print chr$(" clocks StrLen1",13,10)

    invoke  StrLen2, addr str0
    print ustr$(eax)
    print chr$(" -- ")
    counter_begin 10000000, REALTIME_PRIORITY_CLASS
      invoke StrLen2, addr str0
    counter_end
    print ustr$(eax)
    print chr$(" clocks StrLen2",13,10)

    invoke  szLen1, addr str0
    print ustr$(eax)
    print chr$(" -- ")
    counter_begin 10000000, REALTIME_PRIORITY_CLASS
      invoke szLen1, addr str0
    counter_end
    print ustr$(eax)
    print chr$(" clocks szLen1",13,10)

    invoke  szLen2, addr str0
    print ustr$(eax)
    print chr$(" -- ")
    counter_begin 10000000, REALTIME_PRIORITY_CLASS
      invoke szLen2, addr str0
    counter_end
    print ustr$(eax)
    print chr$(" clocks szLen2",13,10)

    invoke  lstrlen, addr str0      ; Win32 API baseline for comparison
    print ustr$(eax)
    print chr$(" -- ")
    counter_begin 10000000, REALTIME_PRIORITY_CLASS
      invoke lstrlen, addr str0
    counter_end
    print ustr$(eax)
    print chr$(" clocks lstrlen",13,10)

    exit
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

align 4

StrLen1 proc item:DWORD
;-----------------------------------------------------------------------
; Agner Fog style DWORD string-length scan (per the thread, the same
; routine as strlen in msvcr70.dll).  Reads 4 bytes at a time and flags
; zero bytes with (x - 01010101h) & ~x & 80808080h, which sets the sign
; bit of exactly the bytes of x that are zero -- the NOT step makes the
; test exact, so bytes >= 80h do NOT false-positive here.
; In:   item = pointer to zero-terminated string
; Out:  eax  = length, not counting the terminator
; Uses: ecx, edx, flags; ebx preserved (callee-saved under stdcall)
; NOTE(review): the DWORD reads can touch up to 3 bytes past the
; terminator; as hutch-- notes later in the thread, this can fault if
; the string ends at the last bytes of a memory page -- confirm callers
; allow the over-read.
;-----------------------------------------------------------------------

    push    ebx

    mov     eax, [esp+2*4]          ; get pointer to string (ret addr + ebx above it)
    lea     edx, [eax+3]            ; pointer+3 used in the end
align 4
@@:
    mov     ebx, [eax]              ; read first 4 bytes
    add     eax, 4                  ; increment pointer
    lea     ecx, [ebx-01010101h]    ; subtract 1 from each byte
    not     ebx                     ; invert all bytes
    and     ecx, ebx                ; and these two
    and     ecx, 80808080h          ; keep only the per-byte sign bits
    jz      @B                      ; no zero bytes, continue loop

    test    ecx, 00008080h          ; test first two bytes
    jnz     @F

    shr     ecx, 16                 ; not in the first 2 bytes
    add     eax, 2

@@:
    shl     cl, 1                   ; use carry flag to avoid branch
    sbb     eax, edx                ; compute length

    pop     ebx

    ret     1*4                     ; stdcall: pop the one DWORD argument

StrLen1 endp

align 4

StrLen2 proc src:DWORD
;-----------------------------------------------------------------------
; Modified StrLen1.  Phase 1 scans single bytes until the pointer is
; DWORD-aligned (or the terminator is hit); phase 2 then scans aligned
; DWORDs with the 7EFEFEFFh carry trick.  A phase-2 hit is only a
; candidate, so the four bytes are re-checked and the scan resumes on a
; false positive.  Because phase 2 only does aligned reads, it never
; reads past the DWORD containing the terminator, so unlike StrLen1
; this version avoids the page-end fault hazard.
; In:   src = pointer to zero-terminated string ([esp+4]; no frame)
; Out:  eax = length, not counting the terminator
; Uses: ecx, edx, flags
;-----------------------------------------------------------------------

    mov     ecx, [esp+1*4]
    test    ecx, 3
    jz      @max8                   ; already aligned: straight to the dword scan

@bucle:                             ; byte-at-a-time until aligned
    mov     al, [ecx]
    add     ecx, 1
    test    al, al
    jz      @lb1                    ; terminator found during alignment

    test    ecx, 3
    jnz     @bucle
align 4
@max8:
    mov     eax, [ecx]              ; next aligned dword
    mov     edx, 7EFEFEFFh
    add     edx, eax                ; carries propagate unless a byte interrupts them
    xor     eax, 0FFFFFFFFh
    xor     eax, edx
    add     ecx, 4
    test    eax, 81010100h          ; any interrupted carry => candidate zero byte
    jz      @max8

    mov     eax, [ecx-4]            ; re-read the dword and verify byte by byte
    test    al, al
    jz      @lb4

    test    ah, ah
    jz      @lb3

    test    eax, 0FF0000h
    jz      @lb2

    test    eax, 0FF000000h
    jnz     @max8                   ; false positive: keep scanning

@lb1:                               ; terminator was at [ecx-1]
    lea     eax, [ecx-1]
    mov     ecx, [esp+1*4]
    sub     eax, ecx                ; length = end - start
    ret     1*4

@lb2:                               ; terminator was at [ecx-2]
    lea     eax, [ecx-2]
    mov     ecx, [esp+1*4]
    sub     eax, ecx
    ret     1*4

@lb3:                               ; terminator was at [ecx-3]
    lea     eax, [ecx-3]
    mov     ecx, [esp+1*4]
    sub     eax, ecx
    ret     1*4

@lb4:                               ; terminator was at [ecx-4]
    lea     eax, [ecx-4]
    mov     ecx, [esp+1*4]
    sub     eax, ecx
    ret     1*4

StrLen2 endp

align 4

szLen1 proc src:DWORD
;-----------------------------------------------------------------------
; Classic byte scanner, unrolled by 4.  Compares one byte at a time, so
; it is insensitive to alignment and never reads past the terminator
; (no page-end hazard), at the cost of speed on long strings.
; In:   src = pointer to zero-terminated string ([esp+4]; no frame)
; Out:  eax = length, not counting the terminator
; Uses: flags only (besides eax)
;-----------------------------------------------------------------------

    mov     eax, [esp+1*4]    ; src
    sub     eax, 4            ; bias for the pre-increment in the loop
align 4
@@:
    add     eax, 4
    cmp     BYTE PTR [eax], 0
    je      @lb1

    cmp     BYTE PTR [eax+1], 0
    je      @lb2

    cmp     BYTE PTR [eax+2], 0
    je      @lb3

    cmp     BYTE PTR [eax+3], 0
    jne     @B

    ; fall-through: terminator at [eax+3]
    sub     eax, [esp+1*4]    ; src
    add     eax, 3
    ret     1*4

@lb3:                         ; terminator at [eax+2]
    sub     eax, [esp+1*4]    ; src
    add     eax, 2
    ret     1*4

@lb2:                         ; terminator at [eax+1]
    sub     eax, [esp+1*4]    ; src
    add     eax, 1
    ret     1*4

@lb1:                         ; terminator at [eax]
    sub     eax, [esp+1*4]    ; src
    ret     1*4

szLen1 endp

align 4

szLen2 proc src:DWORD
;-----------------------------------------------------------------------
; szLen1 variant: compares each byte against cl (pre-zeroed) instead of
; an immediate 0, shrinking each cmp encoding in the hot loop.  Same
; properties as szLen1: alignment-insensitive, never reads past the
; terminator.
; In:   src = pointer to zero-terminated string ([esp+4]; no frame)
; Out:  eax = length, not counting the terminator
; Uses: ecx (zeroed), flags
;-----------------------------------------------------------------------

    mov     eax, [esp+1*4]    ; src
    sub     eax, 4            ; bias for the pre-increment in the loop
    xor     ecx, ecx          ; cl = 0, the byte to compare against
align 4
@@:
    add     eax, 4
    cmp     BYTE PTR [eax], cl
    je      @lb1

    cmp     BYTE PTR [eax+1], cl
    je      @lb2

    cmp     BYTE PTR [eax+2], cl
    je      @lb3

    cmp     BYTE PTR [eax+3], cl
    jne     @B

    ; fall-through: terminator at [eax+3]
    sub     eax, [esp+1*4]    ; src
    add     eax, 3
    ret     1*4

@lb3:                         ; terminator at [eax+2]
    sub     eax, [esp+1*4]    ; src
    add     eax, 2
    ret     1*4

@lb2:                         ; terminator at [eax+1]
    sub     eax, [esp+1*4]    ; src
    add     eax, 1
    ret     1*4

@lb1:                         ; terminator at [eax]
    sub     eax, [esp+1*4]    ; src
    ret     1*4

szLen2 endp

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

end start

Title: Re: szLen optimize...
Post by: James Ladd on May 31, 2005, 09:18:45 PM
Have a look at Donkeys string utils for a truely fast length api.
(you will have to search this board)
Title: Re: szLen optimize...
Post by: denise_amiga on June 01, 2005, 07:44:12 AM
hi striker

I have searched the forum and only found a few references to Donkey, but none of them point to anything specific. If you could tell me more you would help me a lot, since I am just beginning with assembler and I like it very much.

right now I do not have long time, and I think that was faster to write here, but will look for the network.

Quote

1472 -- 1378 clocks StrLen1
1472 -- 1196 clocks StrLen2
1472 -- 3077 clocks szLen1
1472 -- 2233 clocks szLen2
1472 -- 2241 clocks szLen3
1472 -- 2104 clocks szLen4
1472 -- 5870 clocks lstrlen




OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

align 4

szLen3 proc src:DWORD
;-----------------------------------------------------------------------
; Byte-scanner variant: loads 4 bytes into ecx with two dword reads and
; tests cl/ch, replacing the memory-immediate compares of szLen1/2 with
; register tests.
; In:   src = pointer to zero-terminated string ([esp+4]; no frame)
; Out:  eax = length, not counting the terminator
; Uses: ecx, flags
; NOTE(review): unlike szLen1/szLen2 this does dword reads at [eax] and
; at the unaligned [eax+2], i.e. it can read up to 3 bytes past the
; terminator -- potential fault if the string ends at the last bytes of
; a page; confirm the buffers tolerate the over-read.
;-----------------------------------------------------------------------

    mov     eax, [esp+1*4]    ; src
    sub     eax, 4            ; bias for the pre-increment in the loop
    xor     ecx, ecx
align 4
@@:
    add     eax, 4
    mov     ecx, [eax]        ; bytes 0..3 of this group
    ;cmp    BYTE PTR [eax], 0
    test    cl, cl            ; byte 0 zero?
    je      @lb1

    ;cmp    BYTE PTR [eax+1], 0
    test    ch, ch            ; byte 1 zero?
    je      @lb2

    mov     ecx, [eax+2]      ; unaligned: bytes 2..5
    ;cmp    BYTE PTR [eax+2], 0
    test    cl, cl            ; byte 2 zero?
    je      @lb3

    ;cmp    BYTE PTR [eax+3], 0
    test    ch, ch            ; byte 3 zero?
    jne     @B

    ; fall-through: terminator at [eax+3]
    sub     eax, [esp+1*4]    ; src
    add     eax, 3
    ret     1*4

@lb3:                         ; terminator at [eax+2]
    sub     eax, [esp+1*4]    ; src
    add     eax, 2
    ret     1*4

@lb2:                         ; terminator at [eax+1]
    sub     eax, [esp+1*4]    ; src
    add     eax, 1
    ret     1*4

@lb1:                         ; terminator at [eax]
    sub     eax, [esp+1*4]    ; src
    ret     1*4

szLen3 endp

align 4

szLen4 proc src:DWORD
;-----------------------------------------------------------------------
; Byte-scanner variant: two zero-extended WORD loads per group of 4,
; testing cl/ch of each.  Reads at most 1 byte past the terminator
; (when the terminator lands on an even offset), so the over-read risk
; is smaller than szLen3's dword reads but not zero.
; In:   src = pointer to zero-terminated string ([esp+4]; no frame)
; Out:  eax = length, not counting the terminator
; Uses: ecx, flags
;-----------------------------------------------------------------------

    mov     eax, [esp+1*4]    ; src
    sub     eax, 4            ; bias for the pre-increment in the loop
    ;xor    ecx, ecx
align 4
@@:
    add     eax, 4
    movzx   ecx, word ptr [eax]     ; bytes 0..1
    ;cmp    BYTE PTR [eax], 0
    test    cl, cl            ; byte 0 zero?
    je      @lb1

    ;cmp    BYTE PTR [eax+1], 0
    test    ch, ch            ; byte 1 zero?
    je      @lb2

    movzx   ecx, word ptr [eax+2]   ; bytes 2..3
    ;cmp    BYTE PTR [eax+2], 0
    test    cl, cl            ; byte 2 zero?
    je      @lb3

    ;cmp    BYTE PTR [eax+3], 0
    test    ch, ch            ; byte 3 zero?
    jne     @B

    ; fall-through: terminator at [eax+3]
    sub     eax, [esp+1*4]    ; src
    add     eax, 3
    ret     1*4

@lb3:                         ; terminator at [eax+2]
    sub     eax, [esp+1*4]    ; src
    add     eax, 2
    ret     1*4

@lb2:                         ; terminator at [eax+1]
    sub     eax, [esp+1*4]    ; src
    add     eax, 1
    ret     1*4

@lb1:                         ; terminator at [eax]
    sub     eax, [esp+1*4]    ; src
    ret     1*4

szLen4 endp

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef



(sorry by the language, I am learning english and assembler  :dazzled:)
Title: Re: szLen optimize...
Post by: MichaelW on June 01, 2005, 09:04:14 AM
I couldn't find any reference to Donkey's libraries on this forum, but they are available from his web site:

http://donkey.visualassembler.com/

Donkey uses GoAsm, so the library code will need to be converted to MASM syntax. I seem to recall that Donkey or someone else posted instructions for performing the conversion, but I couldn't find the post. In any case the conversion is not difficult.

Another readily available string length procedure is the strlen function from MSVCRT.DLL. In the MASM32 include file it is named crt_strlen. I don't recall how it compares speed-wise to the optimized procedures.

Here is a small app that performs a function test of the procedures. I moved your procedures into an include file that contains the procedure code with a leading ".code" directive (and no END directive).

; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
    .486                       ; create 32 bit code
    .model flat, stdcall       ; 32 bit memory model
    option casemap :none       ; case sensitive

    include \masm32\include\windows.inc
    include \masm32\include\masm32.inc
    include \masm32\include\kernel32.inc
    include \masm32\include\msvcrt.inc
    includelib \masm32\lib\masm32.lib
    includelib \masm32\lib\kernel32.lib
    includelib \masm32\lib\msvcrt.lib
    include \masm32\macros\macros.asm
    include procs.asm
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
    .data
        str0    db 0
        str1    db 'X',0
        str2    db 'XX',0
        str3    db 'XXX',0
        str4    db 'XXXX',0
        str5    db 'XXXXX',0
        str15   db 15 dup('X'),0
        str16   db 16 dup('X'),0
        str17   db 17 dup('X'),0
        str255  db 255 dup('X'),0
        str1000 db 250 dup('X')
                db 250 dup('X')
                db 250 dup('X')
                db 250 dup('X'),0
    .code
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; Functional test: for every test string, call every length procedure
; and print its result on one line.  Every number in a row should match
; that string's true length; a mismatch flags a broken implementation.
    FOR teststr,<str0,str1,str2,str3,str4,str5,\
            str15,str16,str17,str255,str1000>
        FOR testproc,<StrLen,StrLen1,StrLen2,szLen,szLen1,\
                szLen2,szLen3,szLen4,crt_strlen,lstrlen>
            invoke testproc,ADDR teststr
            print ustr$(eax),32
        ENDM
        print chr$(13,10)
    ENDM
    mov   eax,input(13,10,"Press enter to exit...")
    exit
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start


Title: Re: szLen optimize...
Post by: hutch-- on June 01, 2005, 01:59:30 PM
I have seen a lot of string length algos in my time but they still break down to 2 types, variations of Agner Fog's strlen DWORD version and various forms of byte scanners. The DWORD version is faster but it has problem with alignment and it can under some circumstances fail if the length is at the end of a memory page. The "szLen" algo is a classic byte scanner unrolled by 4 that is both insensitive to alignment and has no page ending problems so it is properly general purpose.

From memory Donkey had a variation of the Agner Fog design where he aligned the beginning of the algo then read in DWORD size chunks which solves the alignment problem but not the page end problem.

Most zero terminated strings are relatively short (< 128  bytes) and with a byte scanner running at something like 100 meg/sec, I wonder if there is a gain in chasing faster but less general purpose algos.
Title: Re: szLen optimize...
Post by: denise_amiga on June 01, 2005, 02:42:12 PM
hello MichaelW

thx for the link to "donkey´s stable"  and thx very much for the small app for test the procedures, it´s help me  :U
the StrLen1 that i post is from msvcr70.dll

I am finishing modifying the masm32.lib, and your small application is to me helpful for test the different versions.
the main changes are changes of the comparisons memory-immediate in the loops, elimination of stack´s frame in algo without locals, and some  small tricks.
the average speed has raised in a 37% and I am very happy.

I like to learn from the bests as some gurus. like (hutch-, you, etc)

and I continue learning

Title: Re: szLen optimize...
Post by: denise_amiga on June 01, 2005, 05:33:57 PM
thx MichaelW


; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

include masm32rt.inc
include procs.asm

; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
    str0    db 0
    str1    db 128  dup ('Xx'),0
    str2    db 512  dup ('Xx'),0
    str3    db 1024 dup ('Xx'),0
align 4
    txt0    db  "0 bytes",0
    txt1    db  "256 bytes",0
    txt2    db  "1024 bytes",0
    txt3    db  "2048 bytes",0
align 4
    tb_txt  dd  txt0,txt1,txt2,txt3
    alg0    db  "StrLen",0
    alg1    db  "StrLen1",0
    alg2    db  "szLen",0
    alg3    db  "lstrlen",0
align 4
    tb_alg  dd  alg0,alg1,alg2,alg3
.code
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; Timing matrix: the outer FOR walks the test strings, the inner FOR
; walks the length procedures.  edi and esi step through tb_alg/tb_txt
; in lockstep with the FOR expansions so each result line can print the
; matching algorithm name and string-size label.
    push    edi
    push    esi
    lea     edi, [tb_alg]             ; edi -> table of algorithm-name pointers
    lea     esi, [tb_txt]             ; esi -> table of size-label pointers
    FOR teststr,<str0,str1,str2,str3>
        FOR testproc,<StrLen,StrLen1,szLen,lstrlen>
            counter_begin 10000000, REALTIME_PRIORITY_CLASS
                invoke  testproc,ADDR teststr
            counter_end
            print   ustr$(eax),9
            print   " clocks for proc - "
            print   [edi],13,10       ; current algorithm name
            add     edi, 4
        ENDM
        lea     edi, [tb_alg]         ; rewind the name table for the next string
        print   "--------------------------------  "
        print   [esi],13,10           ; current size label
        add     esi, 4
        print   chr$(13,10)
    ENDM
    mov     eax, input("Press enter to exit...")
    pop     esi
    pop     edi
    exit
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start

Title: Re: szLen optimize...
Post by: Mark Jones on June 02, 2005, 04:40:46 AM
Here's my hacked and botched attempt. Be nice to the new guy. :) Results on an AMD XP 1800+. Two results shown, with and without the Sleep API.

Zero Sleep between tests:

            StrLen StrLen1 StrLen2 szLen szLen1 szLen2 szLen3 szLen4 lstrlen
0 bytes:      12      10     10     8      5      6      8      5      33
1 bytes:      12      10     11     9      9      8      9      8      39
2 bytes:      14      12     11     13     9      8      10     10     42
3 bytes:      14      12     11     13     9      10     11     10     56
4 bytes:      18      16     13     18     11     10     11     12     45
5 bytes:      18      16     14     18     14     13     13     12     48
15 bytes:     25      22     22     30     24     23     29     29     105
16 bytes:     31      25     25     32     26     25     30     33     96
17 bytes:     31      25     25     34     27     29     34     35     110
255 bytes:    283     249    248    359    342    355    435    451    822
1023 bytes:   1086    933    945    1346   1332   1345   1728   1734   3208


250ms Sleep between tests:

            StrLen StrLen1 StrLen2 szLen szLen1 szLen2 szLen3 szLen4 lstrlen
0 bytes:      12      10     9      8      8      8      6      7      33
1 bytes:      12      10     11     9      8      7      7      9      39
2 bytes:      14      13     12     13     8      9      10     10     42
3 bytes:      14      13     12     13     9      10     11     10     44
4 bytes:      18      16     15     18     10     11     12     14     45
5 bytes:      18      16     16     18     12     14     13     14     48
15 bytes:     25      23     23     30     23     24     26     29     93
16 bytes:     31      28     28     32     25     27     30     31     96
17 bytes:     31      28     29     33     26     29     32     32     99
255 bytes:    283     282    296    347    344    345    425    409    823
1023 bytes:   1064    1076   1137   1324   1333   1332   1659   1589   3203


Interesting. Unsure what's going on there. It might be interesting to calculate the mean deviation of some datasets in an effort to determine if Sleep is increasing or decreasing accuracy.

Michael, could I request a CPUID function in timers.asm, so we can have our CPU details right in the console? :toothy

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: MichaelW on June 02, 2005, 06:29:55 AM
Quote from: Mark Jones on June 02, 2005, 04:40:46 AM
Michael, could I request a CPUID function in timers.asm, so we can have our CPU details right in the console? :toothy

Good idea. I already have one, but it's a hack (the original meaning of the term :eek). I'll see what I can do.
Title: Re: szLen optimize...
Post by: denise_amiga on June 02, 2005, 07:53:34 AM
result on   PIV 2800@3300

Zero Sleep tests:
Quote
                 StrLen  StrLen1 StrLen2  szLen   szLen1  szLen2  szLen3  szLen4 lstrlen
0 bytes:            7       2       1       2       10      -1      -3      8       41
1 bytes:            1       9       2       -2      10      2       1       -1      49
2 bytes:            2       14      10      0       2       1       5       0       40
3 bytes:            2       1       14      15      5       4       11      11      45
4 bytes:            4       5       10      11      12      4       3       10      48
5 bytes:            8       13      6       4       8       4       14      4       56
15 bytes:           13      5       26      19      33      36      22      18      62
16 bytes:           16      16      19      18      32      22      28      29      90
17 bytes:           17      16      27      38      58      24      33      27      78
255 bytes:          238     236     229     353     548     419     422     352     620
1023 bytes:         844     854     849     1341    2077    1573    1586    1333    2325

250ms   Sleep   tests:
Quote
                 StrLen  StrLen1 StrLen2  szLen   szLen1  szLen2  szLen3  szLen4 lstrlen
0 bytes:            3       2       0       -3      -3      -3      6       -1      34
1 bytes:            1       1       3       -1      -1      -1      1       -3      38
2 bytes:            14      3       3       3       2       0       0       0       42
3 bytes:            2       5       5       2       14      2       3       0       46
4 bytes:            4       5       6       3       6       3       8       3       47
5 bytes:            4       4       6       4       9       5       5       4       47
15 bytes:           24      13      16      19      32      26      18      19      81
16 bytes:           18      17      19      17      33      22      23      20      75
17 bytes:           16      17      17      26      57      23      39      30      85
255 bytes:          228     222     221     353     535     407     414     349     647
1023 bytes:         838     819     809     1350    2079    1562    1551    1331    2379

note: in my system, StrLen = Strlen1 and szLen = szLen4 (masm32.lib modified), szLen1 is the original without stack´s frame
Title: Re: szLen optimize...
Post by: Maelstrom on June 10, 2005, 03:45:33 AM
Dont know if you have seen this one or not...
I saved this from quite awile back when the same thing was tried on another board.

I think the tail end can be modified for faster performance (I remember playing with it) but the inner loop is where this one shines...
If I remember correctly the inner loop, because of the U/V pipes, ran 1 DWORD per 2 cycles?  Maybe my memory's faulty :green2


; FStrLen - buliaNaza, Lingo12
;
; Returns the length of a null terminated
; string not including the null.

FStrLen proto :dword

.code

option prologue:none
option epilogue:none
FStrLen proc string:dword
;-----------------------------------------------------------------------
; DWORD-at-a-time strlen (buliaNaza/Lingo12).
; In:   string = pointer to zero-terminated string
; Out:  eax = length, not counting the terminator
; Uses: ecx, edx; esi/ebx preserved
; Fixes vs the version as posted:
;   * [esp+8] -> [esp+12]: two registers are pushed before the load, so
;     the argument sits 12 bytes above esp (denise_amiga spots this
;     later in the thread).
;   * 10101010h -> 01010101h: the trick needs "subtract 1 from each
;     byte", matching every other variant in the thread.
;   * ret -> ret 4: PROLOGUE:NONE is in effect, so the stdcall callee
;     must pop its one DWORD argument itself.
; NOTE(review): without a NOT step this still false-positives on bytes
; >= 81h, and the dword reads may touch up to 3 bytes past the
; terminator (page-end hazard) -- both discussed later in the thread.
;-----------------------------------------------------------------------
        push   esi
        push   ebx
        mov    esi, [esp+12]        ; string (esp moved down 8 by the pushes)
        mov    ebx, 80808080h       ; per-byte sign-bit mask
        mov    eax, [esi]           ; first dword
        xor    edx, edx             ; edx = dword index (one ahead inside the loop)

    FStrLen_Loop:
        lea    ecx, [eax-01010101h] ; subtract 1 from each byte
        inc    edx                  ; advance to the next dword
        and    ecx, ebx             ; keep sign bits: nonzero => zero-byte candidate
        mov    eax, [esi+edx*4]     ; prefetch next dword (mov leaves flags alone)
        jz     FStrLen_Loop

        bsf    ebx, ecx             ; bit index of first flagged byte (7/15/23/31)
        dec    edx                  ; edx = index of the dword that held the zero
        shr    ebx, 3               ; bit index -> byte index 0..3
        lea    eax, [ebx+edx*4]     ; length = dword_index*4 + byte_index
        pop    ebx
        pop    esi
    ret    4                        ; stdcall: pop the DWORD argument
FStrLen endp
option prologue:PrologueDef
option epilogue:EpilogueDef
Title: Re: szLen optimize...
Post by: Maelstrom on June 10, 2005, 12:27:15 PM
Did some searching on the old boards and found the original posting with comments...

Would be interesting to do a compare with this one :wink


FStrLen proc string:dword
;-----------------------------------------------------------------------
; DWORD-at-a-time strlen.  Corrected -- hutch-- observed the posted
; version "always returns zero".  Bugs fixed:
;   * test -> and: ecx must keep the masked sign bits for the tail.
;   * the tail tested eax, but eax already holds the NEXT dword; the
;     flagged bytes live in ecx.
;   * the exits returned edx-k (a dword count); the byte length is
;     edx*4-k.
; In:   string = pointer to zero-terminated string
; Out:  eax = length, not counting the terminator
; NOTE(review): clobbers esi/ebx, which are callee-saved under Win32
; stdcall -- push/pop them if callers rely on that.
; NOTE(review): [esp+8] assumes the default prologue (push ebp) is in
; effect for this fragment; with PROLOGUE:NONE it would be [esp+4].
; NOTE(review): false-positives on bytes >= 81h and may read up to 3
; bytes past the terminator, as noted later in the thread.
;-----------------------------------------------------------------------
        mov     esi, [esp+8]
        mov     ebx, 80808080h      ; per-byte sign-bit mask in a register
        mov     eax, [esi]          ; get a dword (buffer is aligned)
        xor     edx, edx            ; edx=0
    C2_loop:
        lea     ecx, [eax-1010101h] ; sub 1 from each byte in eax
        inc     edx                 ; ready for next dword
        and     ecx, ebx            ; keep sign bits; nonzero => hit in this dword
        mov     eax, [esi+edx*4]    ; get next dword (mov leaves flags intact)
        jz      C2_loop             ; no flagged byte: keep scanning

        test    ecx, 000000FFh      ; zero in byte 0?
        jnz     C2_minus4           ;
        test    ecx, 0000FF00h      ; zero in byte 1?
        jnz     C2_minus3           ;
        test    ecx, 00FF0000h      ; zero in byte 2?
        jnz     C2_minus2           ;
    C2_minus1:
        lea     eax, [edx*4-1]      ; zero in byte 3: length = (edx-1)*4+3
        ret
    C2_minus2:
        lea     eax, [edx*4-2]      ; length = (edx-1)*4+2
        ret
    C2_minus3:
        lea     eax, [edx*4-3]      ; length = (edx-1)*4+1
        ret
    C2_minus4:
        lea     eax, [edx*4-4]      ; length = (edx-1)*4
        ret
FStrLen endp
Title: Re: szLen optimize...
Post by: hutch-- on June 10, 2005, 12:29:22 PM
First thing to do with that version is to swap EAX and EDX, then use ADD or SUB for the corections instead of LEA which is slow on post PIII hardware.
Title: Re: szLen optimize...
Post by: denise_amiga on June 11, 2005, 08:22:36 AM
Maelstrom



option prologue:none
option epilogue:none
FStrLen proc string:dword
        push   esi
        push   ebx
        mov    esi, [esp+8]    ; <<< It´s wrong, It has to be 12, because 2 push

Title: Re: szLen optimize...
Post by: hutch-- on June 11, 2005, 08:54:56 AM
hmmmm,

In its current form, it always returns zero.
Title: Re: szLen optimize...
Post by: Maelstrom on June 11, 2005, 03:51:12 PM
Yeah, you're right — what a mess.  So much for copying and pasting from older threads...
I went through the thread from the beginning and debugged it though.  This one is corrected.


FStrLen proc string:dword
;-----------------------------------------------------------------------
; Debugged repost of the buliaNaza/Lingo12 DWORD strlen.
; In:   string = pointer to zero-terminated string
; Out:  eax = length, not counting the terminator
; NOTE(review): clobbers esi/ebx (callee-saved under Win32 stdcall)
; without preserving them -- Phil's rewrite below avoids this.
; NOTE(review): [esp+8] is only correct if the default prologue
; (push ebp) is active for this fragment; with PROLOGUE:NONE the
; argument would be at [esp+4].
; NOTE(review): the missing NOT step means bytes >= 81h false-positive
; (see Jimg's str5 example later in the thread), and the dword reads
; can touch up to 3 bytes past the terminator (page-end hazard).
;-----------------------------------------------------------------------
        mov     esi, [esp+8]
        mov     ebx, 80808080h      ; we'll use register ebx rather than immediate 80808080h
        mov     eax, [esi]
        xor     edx, edx            ; edx = 0
    @@:
        lea     ecx, [eax-1010101h] ; sub 1 from each byte in eax
        inc     edx                 ; ready for next dword
        and     ecx, ebx            ; keep per-byte sign bits; zero => no hit
        mov     eax, [esi+edx*4]    ; get next dword (mov leaves flags intact)
        jz      @B                  ; no flagged byte: loop again

        test    ecx, 000000FFh      ; hit in byte 0?
        jnz     C_minus4            ;
        test    ecx, 0000FF00h      ; hit in byte 1?
        jnz     C_minus3            ;
        test    ecx, 00FF0000h      ; hit in byte 2?
        jnz     C_minus2            ;
    C_minus1:
        lea     eax, [edx*4-1]      ; hit in byte 3
        ret
    C_minus2:
        lea     eax, [edx*4-2]
        ret
    C_minus3:
        lea     eax, [edx*4-3]
        ret
    C_minus4:
        lea     eax, [edx*4-4]
        ret
FStrLen endp
Title: Re: szLen optimize...
Post by: Phil on June 12, 2005, 06:15:42 AM
Finally got FStrLen working. Looks like it's faster on my P3 for everything larger than 3 byte strings with the modifications that I made. Originally, It was about the same as StrLen1 when it had to preserve ebx and esi to be a 'fair' comparison with the other procedures. I re-wrote it but maintained the spirit of doing 4 bytes at a time. Here's the modified procedure:
FStrLen proc string:dword
;-----------------------------------------------------------------------
; Phil's rewrite: same 4-bytes-at-a-time idea, but register-clean (only
; eax/ecx/edx, all caller-saved -- nothing to push/pop) and a proper
; stdcall exit (ret 1*4).  eax counts dwords; the exits convert to a
; byte count via [4*eax-k].
; In:   string = pointer to zero-terminated string ([esp+4]; assumes
;       PROLOGUE:NONE / no frame -- confirm the option directives in
;       the actual build)
; Out:  eax = length, not counting the terminator
; NOTE(review): still false-positives on bytes >= 81h, and the dword
; reads can cross the end of the string -- see the author's own caveat
; about page faults right after the code.
;-----------------------------------------------------------------------
        mov     ecx, [esp+4]
        xor     eax, eax            ; clear initial size (dword count)
    @@:
        mov     edx, [ecx+4*eax]    ; next dword of the string
        lea     edx, [edx-1010101h] ; sub 1 from each byte in edx
        add     eax,1               ; ready for next dword
        and     edx, 80808080h      ; check sign bit in all bytes
        jz      @B                  ; more to do until we have a sign
        test    edx, 000000FFh      ; hit in byte 0?
        jnz     C_minus4            ;
        test    edx, 0000FF00h      ; hit in byte 1?
        jnz     C_minus3            ;
        test    edx, 00FF0000h      ; hit in byte 2?
        jnz     C_minus2            ;
    C_minus1:
        lea     eax,[4*eax-1]       ; hit in byte 3
        ret     1*4
    C_minus2:
        lea     eax,[4*eax-2]
        ret     1*4
    C_minus3:
        lea     eax,[4*eax-3]
        ret     1*4
    C_minus4:
        lea     eax,[4*eax-4]
        ret     1*4

FStrLen endp

I should point out that this procedure isn't always safe. If the final null byte of the string is at the end of an allocated memory segment then the last dword mov could cause a general protection fault if the string wasn't aligned on a 4-byte boundary. To make it safe, more code would be needed up front to do things a byte at a time until 4-byte alignment was assured. It would probably still be faster on strings larger than 6 or 7 bytes but it would slow it down a bit. Finally, the timings on a 996 MHz P3:
Bytes    0     1     2     3     4     5     15    16    17    255   1023
         ===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
FStrLen  6     9     9     9     9     15    16    18    21    209   785
StrLen   15    15    16    15    18    18    25    28    28    278   1046
StrLen1  12    12    12    12    16    16    23    25    25    276   1042
StrLen2  11    10    13    13    15    14    25    27    26    286   1058
szLen    8     10    12    14    17    19    32    35    36    465   1808
szLen1   5     7     9     11    13    15    28    30    31    374   1449
szLen2   5     8     11    11    13    15    29    30    32    403   1560
szLen3   8     7     9     10    14    15    28    34    31    412   1595
szLen4   5     7     10    10    13    14    29    27    29    353   1349
lstrlen  44    44    46    47    49    62    92    95    98    812   3120


The crossover point where it becomes consistently faster might be higher than 3 bytes on some machines but I think it's a general win for average and large string sizes. Thanks for posting the algorithm and also the szlen program that I also modified and renamed fszlen to avoid confusion.


[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Phil on June 12, 2005, 10:55:31 AM
I changed the lea adjustments to sub's because they should be faster on many machines.
FStrLn2 proc string:dword
;-----------------------------------------------------------------------
; FStrLen with the exit lea adjustments replaced by sub (the author's
; stated change), and eax counting bytes directly (add eax,4 per dword,
; plain [ecx+eax] addressing instead of a scaled index).
; In:   string = pointer to zero-terminated string ([esp+4]; assumes
;       PROLOGUE:NONE / no frame -- confirm the option directives in
;       the actual build)
; Out:  eax = length, not counting the terminator
; NOTE(review): same caveats as FStrLen -- false positives on bytes
; >= 81h and possible over-read past the terminator at a page end.
;-----------------------------------------------------------------------
        mov     ecx, [esp+4]
        xor     eax, eax            ; clear initial size (byte count)
    @@:
        mov     edx, [ecx+eax]      ; next dword of the string
        lea     edx, [edx-1010101h] ; sub 1 from each byte in edx
        add     eax, 4              ; ready for next dword
        and     edx, 80808080h      ; check sign bit in all bytes
        jz      @B                  ; more to do until we have a sign
        test    edx, 000000FFh      ; hit in byte 0?
        jnz     C_minus4
        test    edx, 0000FF00h      ; hit in byte 1?
        jnz     C_minus3
        test    edx, 00FF0000h      ; hit in byte 2?
        jnz     C_minus2
    C_minus1:
        sub     eax,1               ; hit in byte 3
        ret     1*4
    C_minus2:
        sub     eax,2
        ret     1*4
    C_minus3:
        sub     eax,3
        ret     1*4
    C_minus4:
        sub     eax,4
        ret     1*4

FStrLn2 endp


And the timing, again on a 996 MHz P3
Bytes    0     1     2     3     4     5     15    16    17    255   1023
         ===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
FStrLen  6     9     10    9     9     15    16    18    21    209   785
FStrLn2  7     8     9     9     18    11    15    19    20    209   785
StrLen   15    15    16    15    18    18    25    28    28    278   1047
StrLen1  11    11    12    12    15    15    21    24    24    231   856
StrLen2  12    12    12    13    14    14    24    26    26    269   986
szLen    8     10    12    14    17    19    32    35    37    464   1808
szLen1   5     7     9     11    13    15    28    28    31    371   1443
szLen2   6     7     9     12    13    15    30    30    31    408   1580
szLen3   5     8     9     10    13    15    28    33    30    413   1589
szLen4   6     7     9     10    13    14    27    30    31    403   1553


Notice that StrLen1 and StrLen2 changed dramatically from the previous version. I carefully removed the new function FStrLn2 from the test and commented the procedure definition so it wouldn't be loaded into memory and the StrLen functions went back to their previous slower timings. It's unusual and I repeated the test by carefully editing the code to put my updated function in and they when back to the more likely speeds that we see here. I don't understand how the position in memory can affect the timing this dramatically.


[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: hutch-- on June 12, 2005, 11:07:02 AM
Phil,

In the short term, try aligning each procedure entry at align 16 and if that does not help, try another dirty trick,


REPEAT 1024
  nop
ENDM


After each ALIGN 16 but before the procedure. The idea is to isolate each procedure.
Title: Re: szLen optimize...
Post by: Jimg on June 12, 2005, 03:30:02 PM
Quote: I should point out that this procedure isn't always safe. If the final null byte of the string is at the end of an allocated memory segment then the last dword mov could cause a general protection fault if the string wasn't aligned on a 4-byte boundary.

It should also be noted that these routines do not work on ascii character > 128

For example, change test string 5 to

    str5        db 'X',130,'XXX',0   ; was 'XXXXX',0

run the test and look at the length printed in the headers.
Title: Re: szLen optimize...
Post by: Phil on June 12, 2005, 05:16:42 PM
JimG: Thanks for pointing out that many of these functions only work with 7-bit ASCII. I hadn't noticed that until you mentioned it.

Hutch: Thanks for the suggestions on isolating the functions. I'll try the align 16 and modify this post later with the results.
Title: Re: szLen optimize...
Post by: donkey on June 12, 2005, 05:30:24 PM
I wrote this some time ago; it is for exceedingly long strings (>128 bytes) using MMX. Note that it is assumed that the entry of the proc is on a paragraph boundary, something that is guaranteed if it is in a static lib using GoAsm. Also the string must start on a 16 byte boundary.

lszLenMMX FRAME pString
;-----------------------------------------------------------------------
; MMX string length for very long strings (GoAsm syntax: ":" defines an
; anonymous label, "jz <" jumps back to the nearest one).
; Preconditions stated by the author: the proc entry sits on a
; paragraph boundary and the string starts 16-byte aligned.
; Out:  eax = length, not counting the terminator
; The nops below are size padding so the loop label lands on a 16-byte
; boundary.
;-----------------------------------------------------------------------

mov eax,[pString]
nop
nop ; fill in stack frame+mov to 8 bytes

pxor mm0,mm0 ; compare scratch
nop ; fill pxor to 4 bytes
pxor mm1,mm1 ; eight zero bytes to compare against
nop ; fill pxor to 4 bytes

: ; this is aligned to 16 bytes
movq mm0,[eax] ; next 8 bytes of the string
pcmpeqb mm0,mm1 ; FFh in each byte that was zero
add eax,8
pmovmskb ecx,mm0 ; pack the 8 per-byte masks into ecx bits 0..7
or ecx,ecx
jz < ; no zero byte in this block: loop

sub eax,[pString] ; bytes scanned (8 past the block with the hit)

bsf ecx,ecx ; index of the first zero byte within the block
sub eax,8 ; back up to the start of the hit block
add eax,ecx ; + terminator offset = length

emms ; leave the MMX/FPU state clean for the caller

   RET

ENDF
Title: Re: szLen optimize...
Post by: Ratch on June 13, 2005, 04:14:07 PM
To the Ineffable All,
     Perhaps all those interested should peruse this link. Ratch 
http://board.win32asmcommunity.net/index.php?topic=16299.msg128369;topicseen#msg128369
Title: Re: szLen optimize...
Post by: Jimg on June 13, 2005, 06:04:00 PM
Very nice Ratch, the fastest one that works on all ascii characters so far.

I couldn't leave good enough alone, of course, so I played a little with your code.  First my apologies for messing with your code; I don't get along well with loops and repeats and such, so I converted it to brute-force code to play with.

I'm getting some strange results.  My first try only cut a few cycles off  ( proc RatchX), but then I inserted a nop in preparation for some other tests, and on my screwy athlon, it made the code run much faster on the large string (proc RatchX2).  The nop misaligned the main loop, so it should have made it run much slower.  Here are my results:
Bytes    0     1     2     3     4     5     15    16    31    255   1023
         ===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
StrLen2  11    11    11    11    13    14    22    25    36    283   927
Ratch    8     11    12    15    10    14    26    21    39    243   875
RatchX   8     10    13    13    13    14    22    22    35    234   864
RatchX2  8     11    14    14    14    14    20    21    35    217   800

Press enter to exit...

  What's going on here.

If someone with an intel chip would try this out to see if there is a similar effect, I'd appreciate it.

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Phil on June 13, 2005, 06:19:07 PM
Here are the results on a 996 MHz P3
Bytes    0     1     2     3     4     5     15    16    31    255   1023
         ===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
StrLen2  9     11    12    13    13    15    24    42    56    301   1047
Ratch    18    25    32    39    22    29    49    31    72    258   893
RatchX   18    24    31    33    21    29    41    31    63    253   884
RatchX2  19    23    30    33    20    27    41    33    68    247   881



And a complete pass for all procedures included in this thread.
Proc/Bytes    0    1    2    3    4    5   15   16   17   31 1023
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
FStrLen       7    7   10    8   18   10   17   19   19   43  786
FStrLn2       7    8    9    8   13   11   15   19   20   43  786
Ratch        18   25   32   39   22   29   49   31   38   72  893
RatchX       18   24   31   33   21   29   41   31   38   63  883
RatchX2      19   23   30   33   20   27   41   33   37   68  882
StrLen       15   15   16   16   18   18   25   28   28   53 1047
StrLen1      12   12   12   12   16   16   23   25   25   53 1042
StrLen2       9   11   11   13   13   15   24   39   27   56 1047
szLen         8   10   12   14   17   19   32   49   48   73 1808
szLen1        5    7    9   11   13   15   28   30   32   65 1566
szLen2        6    7    9   11   13   15   29   43   45   67 1580
szLen3        6    6   10   10   12   14   27   51   72   65 1571
szLen4        5    7    8   10   14   14   27   30   46   63 1552
lstrlenA     48   41   44   45   48   62   91   94   97  139 3117


Also, I defined a fld$ macro that produces a fixed length field and allows right alignment. The code is list driven so it is easier to modify. The complete source is included in the zip but here are the defining elements that show the idea:
    include     \masm32\include\masm32rt.inc    ; defaults to .386

.586

    include     timers.asm      ; requires a pentium

    LOOPCOUNT equ 1000000

    PROCS TEXTEQU <FStrLen,FStrLn2, \
                   Ratch,RatchX,RatchX2, \
                   StrLen,StrLen1,StrLen2, \
                   szLen,szLen1,szLen2,szLen3,szLen4, \
                   lstrlen >

    SIZES TEXTEQU <0,1,2,3,4,5,15,16,17,31,1023>

    %FOR proc,<PROCS>
         proc proto :DWORD
    ENDM

    MAXWIDTH  equ 20
    HDRWIDTH  equ 10
    COLWIDTH  equ 5

.data

    %FOR len,<SIZES>
         align 16
         str&len& db len dup ('X'),0
    ENDM

fld$ MACRO DDpointer,DDwidth,DDalign:=<0>
    LOCAL rvstring
    .data
        rvstring db MAXWIDTH+4 dup (0)
        align 16
    .code
    invoke sxFld,reparg(DDpointer),ADDR rvstring,DDwidth,DDalign
    EXITM <ADDR rvstring>
ENDM

.code
start:

    print fld$(chr$("Proc/Bytes"),HDRWIDTH)
    %FOR len,<SIZES>
        invoke StrLen,ADDR str&len&
        print fld$(ustr$(eax),COLWIDTH,1)
    ENDM
    print chr$(13,10)

    print fld$(chr$("=========="),HDRWIDTH)
    %FOR len,<SIZES>
        invoke StrLen,ADDR str&len&
        print fld$(" ====",COLWIDTH)
    ENDM
    print chr$(13,10)

    %FOR proc,<PROCS>
        print fld$(chr$("&proc&"),HDRWIDTH)
        %FOR len,<SIZES>
            counter_begin LOOPCOUNT, HIGH_PRIORITY_CLASS
                invoke proc,ADDR str&len&
            counter_end
            mov ebx,eax
            print fld$(ustr$(ebx),COLWIDTH,1)
        ENDM
        print chr$(13,10)
    ENDM
    mov   eax,input(13,10,"Press enter to exit...")
    exit

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: hutch-- on June 13, 2005, 11:38:55 PM
Here are the timings on my Prescott PIV.


Bytes    0     1     2     3     4     5     15    16    31    255   1023
         ===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
StrLen2  6     9     8     6     10    9     27    25    38    302   862
Ratch    1     1     13    4     4     6     19    13    43    260   910
RatchX   -1    0     1     1     7     3     16    13    30    256   910
RatchX2  -1    2     1     1     3     4     13    13    30    248   882
Title: Re: szLen optimize...
Post by: Jimg on June 14, 2005, 12:04:59 AM
Hutch-
Strange. The misaligned code is faster on yours also.  Not as dramatic, but surprising anyway.  Thanks.
Title: Re: szLen optimize...
Post by: Phil on June 15, 2005, 05:59:03 AM
Ratch, RatchX, RatchX2, StrLen1, and StrLen2 all seem to be quite sensitive to alignment. For some reason FStrLen and FStrLen1 were not. The following table was produced by loading 13 different copies of each procedure into memory at various alignments. Before each procedure is:

align 16

repeat PAD
  nop
endm

Ratch&PAD& proc arg:DWORD ; procedure definition and entry begins here.
.... full procedure definition
Ratch&PAD& endp

The &PAD& appends the pad value to the procedure name to avoid duplicate names as the same procedure is loaded with different padding. This is the last of PAD values from 4 to 16. The values are easily alterable in the source if you care to play with them. I was quite surprised because the 16-byte alignment doesn't always seem to be best. Also, the timings are for a 1023 dup('X') string that was used in many of the preceeding tests and the different timings at various alignments help explain wide differences as we would add or remove the 250 ms sleep which changed the alignment.
Pad nops      4    5    6    7    8    9   10   11   12   13   14   15   16
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
Ratch      1067 1066 1332 1077 1335 1333 1333 1334 1078  889 1067  883  893
RatchX     1061 1061 1313 1059 1313 1313 1313 1313 1060  882 1059  872  883
RatchX2    1061 1313 1058 1312 1312 1312 1314 1059  885 1059  879  880  882
StrLen1    1298 1046 1048 1046 1043  870  873  874  859 1046 1046 1043 1043
StrLen2     987  978  977  992 1059 1059 1058 1059 1302 1302 1302 1302 1048


RatchX2 will be fastest when it is preceded by 'align 16' followed by 14 or 15 'nops'.
StrLen2 is fastest when preceded by 'align 16' followed by 2 nops, etc.

I don't understand but it is interesting!


[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Jimg on June 15, 2005, 03:03:23 PM
Nice technique Phil, I kept thinking I was gonna get around to something similar :U

Here are my results, except I changed 16 to zero to test against original configuration:

Pad nops      4    5    6    7    8    9   10   11   12   13   14   15    0
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
Ratch      1058  904 1063 1060 1058 1059 1189 1059  933  892  893  809  873
RatchX     1056  894 1053 1052 1055 1054 1180 1054  929  888  884  801  864
RatchX2     887 1052 1052 1053 1057 1180 1056  926  881  884  798  800  800
StrLen1    1058  931  928  930  927  870  865  866  865 1053 1056 1054 1056
StrLen2     909  910  913  910 1007 1007 1008 1007 1122 1122 1122 1122  930

I would have thought 12 pads on the Ratch procs would have been the best since it would align the loop at 16 byte, but didn't work out that way.  Still a mystery here :dazzled:
Also, I think looping a million times is probably not the best indication of how fast a routine can be used in a normal program.  Perhaps calling each routine once in sequence, loop back, call each one again, etc., keeping a running average?
Title: Re: szLen optimize...
Post by: Phil on June 16, 2005, 12:03:46 AM
JimG: Doing what you suggest would well produce a better indication of how the code might perform in a typical program. However, I think doing it like this is probably better when comparing the raw speed of an algorithm. It produces values which can be reproduced and that is very important when we are trying to tune, or slightly de-tune, our code for greater performance! Like you said, we still have mysteries here. I still don't know how mis-aligning a loop by adding a nop can make it run faster ... but it certainly seems to be the case occasionally!

Donkey: Finally got your MMX strlen for long strings into the test procedure. Sorry it took me so long to get around to it. As you said earlier, the strings must begin at a 16-byte boundry and it gains it's speed using MMX to load and examine 8-bytes at a time. Great job! It certainly seems to fly! I've added some commentary at the end of this block ... please let me know if I've missed anything or mis-understood anything. Here's your routine in MASM format:
align 16

;----------------------------------------------------------------------
; lszLenMMX - Donkey's strlen for long strings, translated to MASM.
; In:   pString = address of the string; string must be 16-byte aligned.
; Out:  eax = length in bytes (terminator not counted).
; Requires SSE: pmovmskb is an SSE instruction, not base MMX (as noted
; later in this thread).
; NOTE(review): the body reads the argument as [esp+4], which assumes no
; ebp stack frame is generated for this proc; with MASM's default
; PROLOGUE a proc with declared args pushes ebp first - confirm the
; OPTION PROLOGUE settings used in the original test source.
;----------------------------------------------------------------------
lszLenMMX proc pString:DWORD    ; Donkey's MMX strlen for long strings
.mmx
.xmm
        mov eax,[esp+4]         ; eax = scan pointer (the pString arg)
nop
        nop                     ; fill in stack frame+mov to 8 bytes

pxor mm0,mm0                    ; mm0 = scratch for the loaded 8 bytes
        nop                     ; fill pxor to 4 bytes
pxor mm1,mm1                    ; mm1 = eight zero bytes to compare with
        nop                     ; fill pxor to 4 bytes

    @@:                         ; this is aligned to 16 bytes
movq mm0,[eax]                  ; load next 8 bytes
pcmpeqb mm0,mm1                 ; per byte: 0FFh where byte == 0, else 00h
add eax,8
        pmovmskb ecx,mm0        ; ecx bits 0..7 = sign bit of each byte (SSE)
        or ecx,ecx              ; any bit set -> a zero byte was found
        jz @B

        sub eax,[esp+4]         ; eax = bytes scanned (8 past the match)

bsf ecx,ecx                     ; index of first zero byte within the qword
sub eax,8                       ; back off the overshoot
add eax,ecx                     ; add terminator's offset within the qword

emms                            ; clear MMX state for subsequent FPU use

        ret 1*4

lszLenMMX endp

First, the pxor's clear both mm0 and mm1. Then 8-bytes are loaded into mm0 with movq. Next the index is updated and pcmpeqb compares all 8-bytes to the zeros stored in mm1. If any of the 8-bytes are equal to nul then the corresponding byte in mm0 is set to all ones. Otherwise, the corresponding byte in mm0 is set to all zeros. Finally, the pmovmskb (pmov mask byte) copies the most significant bit of each byte in mm0 into the destination register which is ecx. If any of the bits are set then the end of the zero terminated string has been found. The bsf instruction scans forward (right-to-left) thru the bits in ecx and returns the index of the first bit that was set. The index returned would be 0 if bit 0 was set, indicating that the terminator was in the first (least significant) byte of the 8-bytes being checked. Subtracting 8 and adding the index to the difference between the original and final pointers produces the string length that is returned in eax!

Here are the timings for all routines now included in timesz.asm using the longer strings. Again, many of them only support 7-bit ASCII. Donkey's lszLenMMX procedure nicely handles 8-bit extended ASCII and there is no danger of page boundry over-run at the end because the source string must be aligned on a 16-byte boundry. Thanks for sharing your MMX code with us Donkey!
Proc/Bytes    0    1    2    3    4    5   15   16   17  127  255 1023 2047
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenMMX    16   16   16   16   16   16   20   22   22   76  122  411  795
FStrLen       7    7   10    8   18   10   17   19   19  113  210  785 1554
FStrLn2       7    8    9    9   18   11   15   19   20  113  210  785 1555
Ratch        18   25   32   39   22   29   49   31   38  148  259  895 1746
RatchX       18   24   31   33   21   29   41   31   38  142  253  884 1721
RatchX2      19   23   31   33   20   27   41   33   37  146  248  881 1722
StrLen       15   15   16   16   18   18   25   28   28  150  278 1047 2071
StrLen1      11   11   12   12   15   15   21   24   24  127  231  857 1701
StrLen2      13   13   12   14   15   15   24   27   27  142  264  978 1929
szLen         8   10   12   14   17   19   32   49   47  241  464 1809 3604
szLen1        5    7    9   11   13   15   28   30   43  210  406 1566 3113
szLen2        6    7    9   12   13   15   29   30   45  212  408 1580 3149
szLen3        6    6   10   10   12   15   27   55   61  213  409 1571 3148
szLen4        5    7    8   10   14   14   26   30   30  208  399 1551 3090
lstrlenA     48   41   44   45   48   62   91   94   97  427  812 3117 6192
scasx        45   48   53   57   59   63  102  105  109  548 1055 4109 8179


The slowest algorithm by far is the 'repnz scasb' cutie that I also added to this version. The files in the zip have the same names as the previous timesz.asm. Be sure to rename your older files if you'd like to save them before you extract from this archive.


[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: roticv on June 16, 2005, 12:05:34 PM
I think it is wrong to call donkey's routine MMX though it uses MMX register as pmovmskb is an SSE opcode and not part of the original MMX instruction set. Therefore older processors without SSE would be unable to use the routine.
Title: Re: szLen optimize...
Post by: Phil on June 16, 2005, 01:05:15 PM
roticv: Thanks for pointing that out that it does require SSE. I hadn't realized that until I started reading about the instructions to figure out how it worked and then I failed to mention it in the commentary. It would also have be nice if I had posted the info link (http://www.tommesani.com/SSEPrimer.html#PMOVMSKB) since it clearly identified it as an SSE instruction.
Title: Re: szLen optimize...
Post by: Codewarp on June 19, 2005, 09:40:35 PM
Now that the contest for the main loop seems to be decided in favor of
the aligned dword search implementation, how about a little different
spin on the code around it--namely aligning the search pointer, and
locating the zero in the last dword.

The idea (as always) is to do things in the biggest chunks possible
and eliminate 8-bit operations completely from the code.  The cpu does
like it when you switch between 8/32 bit modes in her registers, and
will take it out on your clock counts without telling you.

So, to align the search, we back up :eek to the dword we are starting in,
load the dword, stuff 1's into the leading bytes to skip over, then drop
into the middle of the normal dword search and continue.  It is faster than
the byte search method on three bytes, it is likely slower than a 1-byte
search.

To locate the zero in the last dword, we handle the upper and lower halves
of the dword separately, computing the final lengths with straight 32-bit
operations directly.  This new method seems to have lower clocks than any
other method I have seen.

Incidentally, I should point out that strlen( ) is a special case of memchr( ),
that searches memory for any character, not just zero.  This work on strlen( )
can easily be extended for a fast memchr( ) implementation.

Please forgive the C++ decoration, I use this routine as a direct
replacement for strlen( ) in my C++ work--enjoy...


// search to find length of a null-terminated string
// fast performance: strings of 1/10/100/1000 bytes require 7/13/103/776 cycles (Athlon64)
// int szLength(const void* src)
// Returns the length of the NUL-terminated string at src (result in eax).
// Strategy: align the pointer down to a dword boundary, force the leading
// skipped bytes to 1's so stray zeros there are ignored, then scan one
// dword at a time with the (v - 01010101h) & ~v & 80808080h zero-byte
// test. Aligned dword reads never cross a page boundary, so there is no
// over-read fault at the end of an allocation.
// __declspec(naked): no compiler prologue/epilogue is emitted.
int __declspec( naked ) szLength (const void* src) {
_asm {
mov  edx, [esp + 4]     ; point edx to start of string
test  edx, 3
jz   lenscan            ; branch ahead if already aligned
mov  ecx, edx
and  edx, ~3            ; edx points to aligned addr (~3 == 0FFFFFFFCh)
and  ecx, 3             ; ecx = bytes to skip
shl  ecx, 3             ; ecx = bits to skip
mov  eax, 1
shl  eax, cl            ; put cl 1's into eax from the bottom up
sub  eax, 1
or   eax, [edx]         ; combine with first four bytes
jmp  resume             ; catch up with the aligned search...

align 4
lenscan: mov  eax, [edx]        ; load next four bytes into eax
resume: add  edx, 4             ; advance ptr
lea  ecx, [eax - 1010101h]
not  eax                ; for each byte in ecx: (charval-1) & ~charval & 80h
and  eax, ecx
and  eax, 80808080h
jz   lenscan            ; repeat while no zeros found

sub  edx, [esp + 4]     ; subtract the base address
test  eax, 8080h        ; test first two bytes
jz   upper2             ; jmp if not found in the lower 2 bytes
shr  eax, 8             ; set carry from bit7 of 1st byte
sbb  edx, 3             ; edx = (edx-4) + (1-carry)
mov  eax, edx           ; return as the result
ret

upper2:  shl  eax, 9            ; set carry from bit7 of 3rd byte
sbb  edx, 1         ; edx = (edx-2) + (1-carry)
mov  eax, edx           ; return as the result
ret
}
}

]
Title: Re: szLen optimize...
Post by: Phil on June 20, 2005, 07:09:24 AM
Codewarp: I haven't taken the time yet to have a close look at your algorithm but I stripped the headings and added it to the test set. I also removed the other tests that are not listed here. The FStrLen figures may not be a fair comparison because it only does 7-bit ASCII ... Ratch, and lszLenSSE both do 8-bit extended ASCII. Here are the results for various string sizes.
Proc/Bytes    0    1    2    3    5    8   13   21   34   55   89  144  233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE    16   16   16   16   17   19   20   22   28   48   63   86  116
FStrLen       7    7   10    9   10   13   16   37   47   59   86  128  194
Ratch        18   25   32   39   29   25   35   51   66   92  105  144  227
szLength     19   20   19   19   23   25   30   51   60   81  121  176  264
szLen         8   10   12   14   19   22   30   56   77  115  175  270  429


Tune it up a bit if you like and see if you can top Ratch! I'll have a closer look at your algorithm later. I used it to print the lengths that are displayed in the header so I know it works, at least with 7-bit ASCII.


[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Jimg on June 20, 2005, 02:23:19 PM
I think Codewarp's idea was to make a faster routine for unaligned strings.  I inserted 1-3 bytes before each test string using for example    %FOR len,<SIZES>
         align 16
         db 0
         str&len& db len dup ('X'),0
    ENDM
to see the effect and got:

Aligned
Proc/Bytes    0    1    2    3    5    8   13   21   34   55   89  144  233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE    25   25   25   25   25   28   28   32   38   47   82  117  169
FStrLen       6    8   10    9   11   12   16   21   32   61   89  132  199
Ratch         8   11   12   15   14   14   20   28   39   78  101  142  221
szLength     10   11   10   11   13   17   20   28   38   77  112  168  257
szLen         8    9   13   12   18   22   28   38   54   92  139  207  321

Misaligned by 1 byte
Proc/Bytes    0    1    2    3    5    8   13   21   34   55   89  144  233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE    28   28   28   28   28   30   30   37   41   55   90  126  178
FStrLen       6    8   10    9   11   12   17   24   35   72  102  150  230
Ratch         8   11   12   15   18   15   23   30   40   85  109  154  241
szLength     14   13   13   16   17   20   22   32   44   85  116  174  262
szLen         8    9   13   13   18   22   28   39   57   93  139  207  323

Misaligned by 2 bytes
Proc/Bytes    0    1    2    3    5    8   13   21   34   55   89  144  233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE    28   28   28   28   28   30   30   37   41   55   89  125  177
FStrLen       6    8   10    9   11   12   17   24   35   71  102  149  229
Ratch         8   11   12   15   18   16   23   30   40   85  109  155  240
szLength     14   13   16   16   17   20   22   32   64   84  116  172  262
szLen         8    8   13   13   18   22   28   39   54   93  139  208  328

Misaligned by 3 bytes
Proc/Bytes    0    1    2    3    5    8   13   21   34   55   89  144  233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE    28   28   28   28   28   30   30   35   45   54   91  123  174
FStrLen       6    7   10    9   11   12   17   25   35   71  102  149  229
Ratch         8   11   12   15   18   15   23   30   41   85  109  154  240
szLength     13   16   16   17   21   21   29   38   64   83  121  172  266
szLen         8    9   13   13   18   22   28   38   54   93  139  207  321

So his code is less affected by misalignment than the others.
Title: Re: szLen optimize...
Post by: Codewarp on June 20, 2005, 09:29:49 PM
Jimg:  Nice work! I am unclear as to nature of this Ratch algorithm.  But it doesn't
matter anyway, since further work on szLength( ) makes it the fastest :dance:.  You were
right--my algorithm is intended to enhance performance of both misaligned strings
and of short strings.  The new version below ratchets up :wink the performance for long
strings with a little tuck-and-unroll.  I have also exchanged the roles of
eax and edx, to avoid one extra mov instruction at the end of every call.


; search to find length of a null-terminated string
; fast performance: strings of 1/10/100/1000 bytes require 7/13/101/653 cycles (Athlon64)
// int szLength(const void* src) -- version 2, unrolled 8 bytes/iteration.
// Same dword zero-byte test as the first version, but the roles of eax
// and edx are swapped so the length ends up in eax with no final mov,
// and the scan loop is unrolled once (8 bytes per pass) for long strings.
// Misaligned pointers are aligned down and the leading bytes masked with
// 1's (fixalign), so the aligned reads never cross a page boundary.
// __declspec(naked): no compiler prologue/epilogue is emitted.
__declspec( naked ) int szLength (const void* src) {
    _asm {
            mov     eax, [esp + 4]          ; point eax to start of string
            test    eax, 3
            jnz     fixalign                ; jmp to fix misalignment
            align   4
lenscan:    mov    edx, [eax]               ; load next four bytes
resume:     lea     ecx, [edx - 1010101h]
            not     edx                     ; on each byte in ecx: (byte-1) & ~byte & 80h
            and     ecx, edx
            and     ecx, 80808080h
            jnz     found                   ; branch if found

            mov     edx, [eax + 4]          ; load next four bytes
            add     eax, 8                  ; advance ptr (twice)
            lea     ecx, [edx - 1010101h]
            not     edx                     ; on each byte in ecx: (byte-1) & ~byte & 80h
            and     ecx, edx
            and     ecx, 80808080h
            jz      lenscan                 ; repeat while no zeros found
            sub     eax, 4                  ; back off to last dword

found:      sub     eax, [esp + 4]          ; subtract the base address
            test     ecx, 8080h             ; test first two bytes
            jz      upper2                  ; jmp if not found in the lower 2 bytes
            shr     ecx, 8                  ; set carry from bit7 of 1st byte
            sbb     eax, -1                 ; return eax = eax + (1-carry)
            ret
upper2:     shl     ecx, 9                  ; set carry from bit7 of 3rd byte
            sbb     eax, -3                 ; return eax = (eax+2) + (1-carry)
            ret

fixalign:   mov     ecx, eax
            and     eax, ~3                 ; eax points to aligned addr (~3 == 0FFFFFFFCh)
            and     ecx, 3                  ; ecx = bytes to skip
            shl     ecx, 3                  ; ecx = bits to skip
            mov     edx, 1
            shl     edx, cl                 ; put cl 1's into edx from the bottom up
            sub     edx, 1
            or      edx, [eax]              ; combine with first four bytes
            jmp     resume                  ; start up in the aligned search
    }
}

Title: Re: szLen optimize...
Post by: Jimg on June 21, 2005, 12:59:37 AM
Codewarp-

On first test, it seems much faster.  Unfortunately, it's crashing on misaligned strings.  I haven't had a chance to figure out why yet.

Later...

ok, it the last fixalign instruction-
            or      edx, [eax]              ; combine with first four bytes
Title: Re: szLen optimize...
Post by: Phil on June 21, 2005, 02:23:24 AM
Jimg: I had expected that there would have been trouble in your earlier alignment tests with the lszLenSSE routines. I had thought that it required 16-byte alignment for the strings. Anyway, I saw that it ran okay for you and I haven't looked into it further. I just wanted to say something in case the crashes you are encountering are caused by it and not szLength.
Title: Re: szLen optimize...
Post by: Codewarp on June 21, 2005, 03:11:01 AM
Quote from: Jimg on June 21, 2005, 12:59:37 AM
Codewarp-

On first test, it seems much faster.  Unfortunately, it's crashing on misaligned strings.  I haven't had a chance to figure out why yet.

Later...

ok, it the last fixalign instruction-
            or      edx, [eax]              ; combine with first four bytes


Jimg -

I have been unable to reproduce any crash, nor can I find any erroneous result.  That is not to say there isn't one,
but I can assure you that the or edx, eax is the essence of the fixalign, and not an oversight.  What it does, is
to fill the beginning ragged edge with 1's just in case those bytes contained zeros.  Remember, on misalignments, I
back-up the pointer to the starting dword, then I force 1-2-3 bytes to 1's, depending on the misalignment.

I have single-stepped through this process and found that it does exactly what it should (at the register level).
I am calling it from within larger software that passes a diversity of string lengths and alignments to it, and it all executes
without any apparent difficulty.

I am eager to repair this algorithm if it is indeed in error, but I need some evidence to take your claim seriously. :naughty:
Please be sure that you have included all my lines.  Note that the VC++ compiler places all routines on 16-byte
boundaries--your tests should do the same.  I see that same unexpectedly large swings in clock counts with only
minor changes in code and entry point alignments, reported in many postings.
Title: Re: szLen optimize...
Post by: Phil on June 21, 2005, 03:40:54 AM
Codewarp: Here's an updated zip including your recent modifications. The results weren't all that different from your previous version but in these tests all strings and procedures are aligned on 16-byte boundries.
Proc/Bytes    0    1    2    3    5    8   13   21   34   55   89  144  233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE    16   16   16   16   16   19   20   22   28   48   63   86  116
FStrLen       7    7   10    9   10   13   16   35   47   59   86  128  194
Ratch        18   25   32   39   29   25   35   52   66   92  105  144  227
szLength     19   20   19   19   23   25   30   51   60   81  121  176  265
szLength2    19   19   19   19   23   24   30   37   47   81  116  173  261
szLen         8   10   12   14   19   22   30   56   77  115  175  270  428


Jimg: Could you zip up and post your recent test source and results so we could use the same methods to test various alignments? I know you said that you just added 1-3 nops before each string in your runs and that sounds easy enough but the actual source would be helpful in case something unusual is happening.


[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Codewarp on June 21, 2005, 07:05:50 AM
Phil--

I am getting different clock counts than you are, which is likely due to different CPUs (mine=athon64, yours=?) and
different benchmarking technique.  What I am seeing with the different strlen( ) methods is this:

  (1)  The DWORD-101010h & ~DWORD method is the basis for all the highest performing strlen( ) methods.  I
        use it in all of my implementations, as does Rachet (i.e. identical code).

  (2)  This method takes 15-25% longer on long misaligned strings, without realignment.  This loss occurs in all
        implementations of it, but does not seem to show up well in your tests.

  (3)  By unrolling the loop for 8 bytes per iteration, the method gains noticable speed, using 15-25% less time.  This
       does not show up at all on your tests, but I was testing 1000 byte searches, yours were 233.

  (4)  Handling misalignment costs 2-3 cycles up front, on every call, misaligned or not.  On misaligned calls it
        costs an additional 4-5 clocks.  This 6-8 cycle overhead is paid by the first 3 or 4 dwords.  The overhead
        of byte-at-a-time realignment is excessive after the first byte because of the jmps involved.  Memory
        misalignment costs vary between processors, but typical "abc" quoted strings are not aligned.

  (5)  Cost of locating the byte in the last dword shows up in short strings of a few bytes.  Short strings are far more
        commonly passed to strlen( ) than long strings.  The fewest number of jmps to do this runs the fastest.

My benchmark practice is as follows:

  (1)  t1 = clocks  for 1000 empty, baseline iterations
  (2)  t2 = clocks for 1000 iterations of a target routine
  (3)  t3 = overhead reading the clock
  (4)  report (t2 - t1 - t3)/1000 as the time for the target routine.

I run this whole thing at least three times, and note the result stability, rerunning as needed to get a stable
lowest clock count benchmark.  This all assures that I am not simply measuring the efficiency of my memory
system (or of interrupt processing), but getting as close as possible to the algorithmic cost itself.  I have
validated this approach by benchmarking routines of known cost, and observing just that.

I believe it is wise to be suspicious of all benchmarks, questioning their basis and understanding their limitations.
A vigorous discussion of these topics will benefit everyone.





Title: Re: szLen optimize...
Post by: Phil on June 21, 2005, 08:21:16 AM
I certainly agree that discussion is key to understanding what's happening here. I plugged your unrolled code into the test as szLength2 and didn't see much of a difference as indicated by the result. I'm testing on a 996 MHz P3 and the trials I see are very consistent on this machine. I'm using MichaelW's timer macros that just repeat an invoke szLength the number of times specified by LOOP_COUNT which then returns the average number of CPU clocks for each iteration as the result. I've added complexity by defining list driven macros that, hopefully, make it easier to modify the tests and add procedures. As far as I can tell there is no compensation in MichaelW's timer macros to discount the result based on the overhead costs of reading the clock. The math to compute the averages is obviously done outside the loop after the final clock is read.

Are you able to assemble and run these tests directly on your machine? The only part of the source that I did not include are Michael's timer macros that I'll zip up and include here for your convenience. They are also posted in several other places and have been used and tested extensively. We do, however, see unusual results from time to time but I don't think it's associated with his macros.

I'd be glad to run your benchmark on my machine if you care to post the source and executable. I can see that unrolling the loop should indeed make a big difference in the timing as you have said.


[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: MichaelW on June 21, 2005, 08:31:31 AM
Phil,

My macros compensate for the loop and timing overhead by timing an empty loop that is otherwise identical to the test loop, and subtracting the cycle count for the empty loop from the cycle count for the test loop.

Title: Re: szLen optimize...
Post by: hutch-- on June 21, 2005, 08:35:03 AM
hmmmm,

Quote
My benchmark practice is as follows:

  (1)  t1 = clocks  for 1000 empty, baseline iterations
  (2)  t2 = clocks for 1000 iterations of a target routine
  (3)  t3 = overhead reading the clock
  (4)  report (t2 - t1 - t3)/1000 as the time for the target routine.

I run this whole thing at least three times, and note the result stability, rerunning as needed to get a stable
lowest clock count benchmark.  This all assures that I am not simply measuring the efficiency of my memory
system (or of interrupt processing), but getting as close as possible to the algorithmic cost itself.  I have
validated this approach by benchmarking routines of known cost, and observing just that.

This is in fact an interesting notion but I am wary of what is left as it will still depend on the opcode implimentation from processor to processor which differ substantially over time and between different manufacturers. Usually the reference to a known code is more useful but this also has its limitations in that an algo that is fast one one machine can be slow on another if its written to use a specific characteristic of one form of hardware.

Memory speed is of course a factor but on the same box testing two different routines, one known and the other developmental there is no advantage or disadvantage to either.  What I am inclined to trust is algo comparison on a range of different boxes with different processors to see which works better on what box which is the basics of writing mixed model code that is general purpose.
Title: Re: szLen optimize...
Post by: Jimg on June 21, 2005, 01:12:08 PM
Codewarp-
Sorry, I misinterpreted one of your instructions-
     and     eax, ~3
I haven't seen the tilde used before, and thought it was just a spurious character and deleted it :red.  Phil fixed it properly-
     and     eax, 0FFFFFFFCh ; ~3    ; eax points to aligned addr
Works perfectly so far and is the fastest (at least on my machine).


Phil-

   The code I used is identical to yours, I just ran it four separate times, inserting one more byte in the string definition macro each time as I said, I didn't automate it at all-
    %FOR len,<SIZES>
         align 16
         db 0  ;,0,0
         str&len& db len dup ('X'),0
    ENDM

Title: Re: szLen optimize...
Post by: Jimg on June 21, 2005, 01:18:32 PM
Phil-
QuoteI certainly agree that discussion is key to understanding what's happening here. I plugged your unrolled code into the test as szLength2 and didn't see much of a difference as indicated by the result. I'm testing on a 996 MHz P3 and the trials I see are very consistent on this machine.

The routine is definately the fastest on my athlon (other than the sse code).  I've seen these timing differences between the P3 and Athlons before....
Title: Re: szLen optimize...
Post by: ic2 on June 21, 2005, 02:05:01 PM
Here are the two blocks of code that deserve serious notice.   I hope this may be included into your (today's) advancements on this subject ...  I found it through serious searching.  Could this be included in your test? We all really need to see the results here.

Thank you

;----------------------------------------------------------------------
; Jens_Duttke_StrLen - length of a zero-terminated string.
; In:   Source = address of the string.
; Out:  eax = length in bytes (terminator not counted).
; Scans a dword at a time: the (v - 01010101h) borrow trick flags any
; byte that is zero, then bsf locates the exact byte within the dword.
; Handles all 8-bit byte values (the second AND filters bytes >= 80h).
; NOTE(review): like the other dword scanners in this thread it can read
; up to 3 bytes past the terminator, so a string ending exactly at the
; end of a mapped page may fault - see the warning earlier in the file.
; Fix: the header previously read "proc PROC Source:DWORD" - a
; duplicated procedure keyword that does not assemble; removed.
;----------------------------------------------------------------------
Jens_Duttke_StrLen proc Source:DWORD

mov ecx, Source                 ; ecx = scan pointer

@@:
mov eax, dword ptr [ecx]        ; load next 4 bytes
add ecx, 4

lea edx, [eax - 01010101h]      ; borrow propagates where a byte is 00h
xor eax, edx                    ; bits that flipped during the subtract
and eax, 80808080h              ; keep only per-byte sign-bit flips
jz @B                           ; no candidate zero byte -> continue
and eax, edx                    ; discard false hits from bytes >= 80h
jz @B

bsf edx, eax                    ; bit index of first zero byte (7/15/23/31)

sub edx, 4
shr edx, 3                      ; convert bit index -> byte index 0..3

lea eax, [ecx + edx - 4]        ; address of the terminator
sub eax, Source                 ; length = terminator - start

RET

Jens_Duttke_StrLen endp




;-----------------------------------------------------------------------
; Jens_fast_strlen - length of a zero-terminated string.
; In:    item = address of the string.
; Out:   eax  = length in bytes (terminating zero not counted).
; Uses:  ecx, edx, flags.
; Note:  DWORD-at-a-time scan; the (x - 01010101h) / 80808080h trick
;        flags candidate zero bytes and the second AND with the
;        decremented value filters out bytes >= 80h, so 8-bit data is
;        handled correctly.  May read up to 3 bytes past the terminator.
; Fix:   removed the redundant "sub edx, 4" before the shift - after
;        the masks, BSF can only return 7, 15, 23 or 31, and for those
;        values N shr 3 and (N-4) shr 3 give the same byte offset 0-3.
;-----------------------------------------------------------------------
Jens_fast_strlen PROC item:DWORD

mov ecx, item                   ; ecx = current scan pointer

@@:
mov eax, dword ptr [ecx]        ; fetch next 4 bytes
add ecx, 4

lea edx, [eax - 01010101h]      ; borrow sets bit 7 of any zero byte
xor eax, edx                    ; bits changed by the subtraction
and eax, 80808080h              ; keep per-byte borrow indicators
jz @B                           ; no candidate zero byte - keep going
and eax, edx                    ; drop false hits from bytes >= 80h
jz @B                           ; all candidates were false - keep going

bsf edx, eax                    ; lowest set bit: 7, 15, 23 or 31
shr edx, 3                      ; bit index -> byte offset 0/1/2/3

lea eax, [ecx + edx - 4]        ; address of the terminating zero
sub eax, item                   ; length = terminator - base

RET

Jens_fast_strlen ENDP


Title: Re: szLen optimize...
Post by: Jimg on June 21, 2005, 02:24:02 PM
Results on Athlon XP 3000+


Test routines for correctness:
lszLenSSE     0    1    2    3    5    8   13   21   34   55   89  144  233
FStrLen       0    1    2    3    5    8   13   21   34   55   89  144  233
Ratch         0    1    2    3    5    8   13   21   34   55   89  144  233
szLength      0    1    2    3    5    8   13   21   34   55   89  144  233
szLen         0    1    2    3    5    8   13   21   34   55   89  144  233
Jens_fast_    0    1    2    3    5    8   13   21   34   55   89  144  233

Strings aligned:
Proc/Bytes    0    1    2    3    5    8   13   21   34   55   89  144  233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE    25   26   22   25   25   28   29   32   38   48   80  118  169
FStrLen       6    7   12   12   11   13   16   21   32   63   88  135  201
Ratch         7   12   12   15   14   14   20   29   39   77  101  142  220
szLength      9   10    9   10   11   15   16   26   34   49   90  132  198
szLen         7    8   13   13   18   23   28   38   54   91  140  207  323
Jens_fast_   20   20   20   20   21   27   29   36   46   69   99  145  217

Strings misaligned by 1 byte:
Proc/Bytes    0    1    2    3    5    8   13   21   34   55   89  144  233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE    30   28   28   28   29   30   30   37   42   55   90  124  179
FStrLen       6    7    9    8   11   12   17   26   35   71  102  150  230
Ratch         8   10   11   15   18   16   23   31   40   86  108  156  241
szLength     13   14   14   15   15   18   21   28   37   56   96  137  207
szLen         9    9   12   13   18   22   29   39   54   93  140  207  322
Jens_fast_   21   21   20   20   28   29   33   42   50   77  107  156  234

Strings misaligned by 2 bytes:
Proc/Bytes    0    1    2    3    5    8   13   21   34   55   89  144  233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE    28   28   28   28   28   30   30   37   41   55   90  126  178
FStrLen       6    7    9   10   10   12   19   25   35   72  102  150  229
Ratch         7   11   12   15   18   15   23   30   40   85  109  155  240
szLength     14   14   15   15   15   19   21   27   52   54   96  139  206
szLen         8    9   13   13   18   22   28   38   54   94  139  207  323
Jens_fast_   20   21   20   20   29   29   33   42   51   77  107  157  235
Title: Re: szLen optimize...
Post by: ic2 on June 21, 2005, 03:36:10 PM
Jimg, I made a mistake and posted identical Jens Duttke code.  Below is the one that was supposed to be slower.  Funny it gave slightly different results for the same code.  Could it be back-to-back run-in?  I guess it really doesn't matter seeing that FStrLen is the fastest anyway.  This is really great.

Also i see you caught the flaw.

Thanks a lot for displaying the results quickly

;-----------------------------------------------------------------------
; Jens_Duttke_StrLen (single-test variant) - length of a zero-terminated
; string.
; In:    item = address of the string.
; Out:   eax  = length in bytes (terminating zero not counted).
; Uses:  ecx, edx, flags.
; Note:  Same DWORD zero-byte scan as the two-test version, but the two
;        conditions are combined into a single loop branch: the loop
;        repeats only while (x ^ d) & 80808080h & d is zero.  Result is
;        identical for 8-bit data; only the early-exit behaviour of the
;        loop differs.  May read up to 3 bytes past the terminator.
; Fixes: original read "proc PROC" (duplicate keyword MASM rejects) and
;        "jz@B" (missing space between mnemonic and operand).
;-----------------------------------------------------------------------
Jens_Duttke_StrLen proc item:DWORD

mov ecx, item                   ; ecx = current scan pointer

@@:
mov eax, dword ptr [ecx]        ; fetch next 4 bytes
add ecx, 4

lea edx, [eax - 01010101h]      ; borrow sets bit 7 of any zero byte
xor eax, edx                    ; bits changed by the subtraction
and eax, 80808080h              ; keep per-byte borrow indicators
and eax, edx                    ; drop false hits from bytes >= 80h
jz @B                           ; nothing found - keep going

bsf edx, eax                    ; lowest set bit: 7, 15, 23 or 31

sub edx, 4                      ; map bit index 7/15/23/31 ...
shr edx, 3                      ; ... to byte offset 0/1/2/3

lea eax, [ecx + edx - 4]        ; address of the terminating zero
sub eax, item                   ; length = terminator - base

ret
Jens_Duttke_StrLen endp

Title: Re: szLen optimize...
Post by: Jimg on June 21, 2005, 03:41:59 PM
Phil-  Ok, here is a version that tests the string misalignment automatically.  I also added a print to verify that the routines were working correctly, and I added a string with all the possible ascii characters (the 999 string).  As you can see, the FStrLen routine stops at the first ascii character over 128 an so it's cycle counts for that string are not correct.

Test routines for correctness:
lszLenSSE    0    1    2    3    5    8   13   21   34   55   89  144  233  999
FStrLen      0    1    2    3    5    8   13   21   34   55   89  144  233  128
Ratch        0    1    2    3    5    8   13   21   34   55   89  144  233  999
szLength     0    1    2    3    5    8   13   21   34   55   89  144  233  999
szLen        0    1    2    3    5    8   13   21   34   55   89  144  233  999
Jens_fast    0    1    2    3    5    8   13   21   34   55   89  144  233  999

Proc/Byte    0    1    2    3    5    8   13   21   34   55   89  144  233  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

Misaligned by 0 bytes:
lszLenSSE   25   25   27   25   25   28   29   32   38   47   82  118  168  596
FStrLen      6    8    9    9   11   12   16   21   33   60   89  131  199  119
Ratch        9   10   12   15   14   14   20   29   39   77  101  142  220  857
szLength    10    8    9   10   11   15   18   26   33   64   90  133  199  784
szLen        6   11   12   15   19   23   30   61   80   93  140  208  323 1293
Jens_fast   20   20   20   21   21   26   29   36   44   69  100  146  217  923

Misaligned by 1 bytes:
lszLenSSE   28   28   28   28   28   30   31   33   43   55   92  123  179  623
FStrLen      6    8    9    9   11   12   17   25   35   71  101  149  229  135
Ratch        8   11   11   15   18   15   23   30   40   85  108  154  240  955
szLength    14   13   14   15   15   18   21   28   38   55   96  135  205  785
szLen        7   11   12   15   20   23   33   39   54   92  185  208  351 1293
Jens_fast   20   20   20   20   25   28   32   40   48   75  105  154  233 1001

Misaligned by 2 bytes:
lszLenSSE   27   29   27   28   28   31   30   36   41   56   89  127  179  621
FStrLen      6    7    9    9   12   12   17   25   35   73  102  149  228  136
Ratch        8   11   12   14   18   16   23   30   40   87  110  157  241  954
szLength    14   14   15   15   15   21   21   28   40   55   97  140  207  787
szLen        8   11   12   12   18   22   27   40   54   93  139  213  322 1291
Jens_fast   20   20   20   20   24   28   32   40   47   75  105  153  232  994

Misaligned by 3 bytes:
lszLenSSE   28   28   28   28   28   30   31   33   41   56   91  124  177  629
FStrLen      7    8    9    9   10   12   17   27   35   71  103  151  230  136
Ratch        8   11   13   15   18   15   23   31   40   85  110  156  242  953
szLength    14   15   16   15   17   20   24   31   40   55   98  140  207  792
szLen        8   11   12   15   20   23   28   61   69   92  148  208  321 1291
Jens_fast   20   20   21   20   24   28   32   41   47   76  104  154  234  998

Press enter to exit...

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Phil on June 21, 2005, 09:04:07 PM
Jimg: Thanks for automating ... Especially for the verification routine and string 999 that shows FStrLen 7-bit short-comings!

Here are the results for a 996 MHz P3:
Test routines for correctness:
lszLenSSE    0    1    2    3    5    8   13   21   34   55   89  144  233  999
FStrLen      0    1    2    3    5    8   13   21   34   55   89  144  233  128
Ratch        0    1    2    3    5    8   13   21   34   55   89  144  233  999
szLength     0    1    2    3    5    8   13   21   34   55   89  144  233  999
szLen        0    1    2    3    5    8   13   21   34   55   89  144  233  999
Jens_fast    0    1    2    3    5    8   13   21   34   55   89  144  233  999

Proc/Byte    0    1    2    3    5    8   13   21   34   55   89  144  233  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

Misaligned by 0 bytes:
lszLenSSE   16   16   16   16   17   19   19   22   29   48   63   86  117  404
FStrLen      7    7   10    8   10   13   16   37   47   59   86  128  194  116
Ratch       18   25   32   39   29   25   35   51   66   95  105  144  227  872
szLength    19   19   19   19   23   25   30   37   47   81  116  173  261 1025
szLen        8   10   12   14   19   22   30   56   77  115  175  270  428 1767
Jens_fast   12   12   12   12   15   18   19   38   47   63   88  130  196  870

Misaligned by 1 bytes:
lszLenSSE   16   16   16   16   16   26   19   40   32   54   93  113  171  649
FStrLen      7    7   10    8   10   13   18   53   49   59   87  133  210  125
Ratch       18   25   32   39   29   25   35   74   77   96  124  182  286 1149
szLength    24   25   25   30   29   30   33   41   54   88  120  176  265 1032
szLen        8   10   12   14   19   22   30   56   77  115  175  270  428 1768
Jens_fast   12   12   12   12   15   18   19   63   51   66   99  145  224  981

Misaligned by 2 bytes:
lszLenSSE   16   17   16   16   16   26   19   40   32   54   93  113  251  648
FStrLen      7    7   10    8   10   13   16   53   49   59   87  133  288  124
Ratch       18   25   32   39   29   25   35   73   77   96  124  182  371 1149
szLength    25   25   30   30   29   31   33   41   58   88  120  177  265 1032
szLen        8   10   12   14   19   22   30   56   77  115  175  270  428 1770
Jens_fast   12   12   12   12   15   18   19   63   51   64   99  145  310  978

Misaligned by 3 bytes:
lszLenSSE   16   16   16   16   16   26   19   22   45   60   81  117  184  627
FStrLen      7    7   10    8   10   13   16   37   48   66   96  144  209  123
Ratch       18   25   32   39   29   25   35   51   73  110  135  191  287 1140
szLength    25   30   30   29   30   31   37   45   58   88  125  178  269 1033
szLen        8   10   12   14   19   22   30   59   76  115  176  270  428 1767
Jens_fast   12   12   12   12   15   18   19   38   53   77  106  155  227 1005

Press enter to exit...
Title: Re: szLen optimize...
Post by: Codewarp on June 21, 2005, 10:10:02 PM
Quote from: ic2 on June 21, 2005, 02:05:01 PM
Here are the two block of code that that deserve serious notice.   I hope this may be included into your (today's) advancements on this subject ...  I founded it though serious searching.  Could this be included in your test. We all really need to see the results here.

Thank you

ic2:  Interesting algorithm, though it has some shortcomings:

  (1)  It seems to work only on 7-bit ascii, not 8-bit.
  (2)  Its loop uses two jmps instead of one.  I believe the first one is unnecessary.
  (3)  the BSR implementation has been tried and examined thoroughly.  It looks so elegant...
        Too bad the BSR is such a dog, see the szLength( ) for a better impl. of this tail-end part
        of the routine.
  (4)  No misalignment handling makes this method slow for long misaligned strings.
Title: Re: szLen optimize...
Post by: Codewarp on June 21, 2005, 10:16:04 PM
Quote from: hutch-- on June 21, 2005, 08:35:03 AM
Quote
This is in fact an interesting notion but I am wary of what is left as it will still depend on the opcode implementation from processor to processor which differ substantially over time and between different manufacturers. Usually the reference to a known code is more useful but this also has its limitations in that an algo that is fast one one machine can be slow on another if its written to use a specific characteristic of one form of hardware.

Memory speed is of course a factor but on the same box testing two different routines, one known and the other developmental there is no advantage or disadvantage to either.  What I am inclined to trust is algo comparison on a range of different boxes with different processors to see which works better on what box which is the basics of writing mixed model code that is general purpose.

Hutch --

This benchmarking thing really gets down to the heart of the matter, doesn't it?  I agree with everything you have said, and it gets right down to what your code is written for.  Code tends to stick around, but processors tend to fade away.  There simply isn't any way to code something so that is runs the fastest on all CPUs.  You have to pick and choose, and to know what your strategy is.  Several strategies come to mind:

  (1)  Separate libraries for each processor
  (2)  An Intel library, and an AMD library
  (3)  Single library optimized for the present day hardware, but compatible back to the PII.
  (4)  Single library like (3), with dynamic inclusion of advanced cpu features (like sse, etc)
  (5)  Single library optimized with every trick from tomorrows hardware.

Actually, all of these are desirable, each with serious benefits and baggage.  However, clients on 5 year old hardware don't tend to complain about software performance too much.  It's the one's driving the shiny new XP-zazz that want all that speed.  Do you really want to avoid MUL instructions, simply because somebody might run it on a P4?  I think not, and as for my own effort, most of it goes in the direction of approach (3)--as in my szLength( ) routine, and in (4) when needed.

I've been so pleased with the szLength( ) results, that I turned it into a killer memchr( ) implementation (faster than anything I had before).  Memchr( ) is a much more useful function than strlen( ) that can have a bigger impact on overall software speed than strlen( ).  Should I post this as a new topic, or as further evolution in szLen( ) ??
Title: Re: szLen optimize...
Post by: Phil on June 21, 2005, 11:07:04 PM
Quote from: Codewarp on June 21, 2005, 10:16:04 PM
I been so pleased with the szLength( ) results, that I turned it into a killer memchr( ) implementation (faster than anything I had before).  Memchr( ) is a much more useful function than strlen( ) that can have a bigger impact on overall sofware speed than strlen( ).  Should I post this as a new topic, or as further evolution in szLen( ) ??

My vote would be a new topic. That would allow others to pick up the new discussion from the beginning. We already have a great deal of discussion going on here and a lot to be considered.

Quote from: Codewarp on June 21, 2005, 10:10:02 PM
Quote from: ic2 on June 21, 2005, 02:05:01 PM
Here are the two block of code that that deserve serious notice.   I hope this may be included into your (today's) advancements on this subject ...  I founded it though serious searching.  Could this be included in your test. We all really need to see the results here.

Thank you

ic2: Interesting algorithm, though is has some shortcomings:

(1) Is seems to work only on 7-bit ascii, not 8-bit.
(2) Its loop uses two jmps instead of one. I believe the first one is unnecessary.
(3) the BSR implementation has been tried and examined thoroughly. It looks so elegant...
Too bad the BSR is such a dog, see the szLength( ) for a better impl. of this tail-end part
of the routine.
(4) No misalignment handling makes this method slow for long misaligned strings.


Thanks to JimG's validation it's clear that FStrLen is the only procedure with the 7-bit ASCII limitation. Also, on the P3 I am using Jens_fast is quicker than szLength with all alignments. JimG's results show that szLength is quicker on an Atholon. I'm not sure if that is related to the BSR usage or not. Anyway, that's my two-cents worth for the moment.
Title: Re: szLen optimize...
Post by: Codewarp on June 22, 2005, 01:51:36 AM
Phil,

First of all, thank you for your response to all of this, along with everyone else too, of course.

I wanted to point out some things regarding (what I call) the DWORD search method, which is used by all of the faster strlen( ) implementations.  Let's look at logic of it:

  [<fix alignment>]       optional misalignment fixup

  <locate dword>          find the dword containing a zero

  <locate byte>           find the first zero in the dword

  <return len>            return the byte address - string base


You will notice that the <fix align> is optional, but all other steps are mandatory--you cannot omit any to speed it up without breaking it.

Now, the point of all this is that <locate byte> has a variety of implementations, some good, some not so good, but every call passes through it, so clocks saved here speed up every call :thumbu.

There are a number of methods for <locate byte>:

  (1) inc, test and jz each byte (3 times)
  (2) bsr div 8
  (3) inc, shr 8 and jc each byte (3 times)
  (4) separate upper/lower, add 1-bit7 to address

Ratch uses (3), szLength uses (4).  I use (4) because substituting the other methods in anybody's implementation will increase clock counts (by 2-5), and because it requires fewer jmps.  BSR would be perfect, if it were not so poorly ScotchTaped to the CPU as an afterthought :tdown--its performance is an extreme disapointment.  BSR seems marginally useful when you have no idea where the bit of interest resides within the dword.  If you know more than that, shifts and masks will be faster.  Method (1) looks promising, because no shifts are involved, but both (1) and (3) suffer from having so many instructions.

So, for example, you could take Ratch, substitute its <locate byte> method (3) with (4), and voila, you shave 2 or 3 cycles off every call (for faster short strings).  This is where my comments to ic2 came from--no method using BSR will ever beat method (4), unless a future CPU changes things.

Title: Re: szLen optimize...
Post by: Phil on June 22, 2005, 02:49:54 AM
Quote from: Codewarp on June 22, 2005, 01:51:36 AM
Ratch uses (3), szLength uses (4). I use (4) because substituting the other methods in anybody's implementation will increase clock counts (by 2-5), and because it requires fewer jmps. BSR would be perfect, if it were not so poorly ScotchTaped to the CPU as an afterthought :tdown--its performance is an extreme disapointment. BSR seems marginally useful when you have no idea where the bit of interest resides within the dword. If you know more than that, shifts and masks will be faster. Method (1) looks promising, because no shifts are involved, but both (1) and (3) suffer from having so many instructions.

So, for example, you could take Ratch, substitute its <locate byte> method (3) with (4), and voila, you shave 2 or 3 cycles off every call (for faster short strings). This is where my comments to ic2 came from--no method using BSR will ever beat method (4), unless a future CPU changes things.


Thank you for your analysis. What you've said makes sense but it doesn't seem to flow with the results I'm seeing on this machine.

Please download the attached zip, browse thru the source to make sure I have incorporated your routine correctly, assemble if you like or run the included exe file and share the results on your machine with us. I bumped LOOP_COUNT back up to 1000000 and ran the test 3 times to make sure my results were consistent. They varied in some cases by 4 or 5 clocks but the trends are quite consistent. Again, for *some reason* Jens_fast is topping szLength in all cases on a 996 MHz P3. I removed the unnecessary jz as you and P1 suggested and it slowed it down considerably for mis-aligned strings. szLength is certainly least affected by the alignments as you can see from these results but all of the other procedures use BSF and Jens_fast is always slightly faster than szLength. The SBB instruction that you use is slower on this machine ... maybe that's the difference?
Proc/Byte    0    1    2    3    5    8   13   21   34   55   89  144  233  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

Misaligned by 0 bytes:
szLength    19   19   19   19   23   24   30   37   47   81  116  173  261 1026
Ratch       18   25   32   39   29   25   35   51   66   92  105  144  227  871
Jens_fast   12   12   12   12   15   18   19   41   47   63   88  130  196  870
Jens_slow   10   10   10   10   15   17   20   41   52   68   99  146  220  849

Misaligned by 1 bytes:
szLength    24   25   25   30   29   30   33   41   54   88  120  176  265 1033
Ratch       18   25   32   39   29   25   35   72   77   96  124  182  286 1150
Jens_fast   12   12   12   12   15   17   20   63   51   65   99  145  224  978
Jens_slow   10   10   10   10   15   17   20   56   62   75  113  181  283 1146

Misaligned by 2 bytes:
szLength    25   25   30   30   29   31   33   41   58   88  120  177  266 1033
Ratch       18   25   32   39   29   25   35   72   77   96  124  182  371 1150
Jens_fast   12   12   12   12   15   18   19   63   51   65   99  145  310  979
Jens_slow   10   10   10   10   15   17   20   53   62   74  113  181  362 1147

Misaligned by 3 bytes:
szLength    25   30   30   29   30   31   37   45   58   88  125  177  269 1032
Ratch       18   25   32   39   29   25   35   51   73  110  135  191  287 1141
Jens_fast   12   12   12   12   15   18   19   38   53   77  106  155  227 1005
Jens_slow   10   10   10   10   15   17   20   41   57   85  128  192  282 1140


To me, it's not about who's got the fastest procedure or algo here ... it's about understanding what some of the differences in our architectures or CPU's are that cause us the see things that don't fully make any sense until we understand why and what's happenin'  :dance:

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Codewarp on June 22, 2005, 04:39:02 AM
Phil --

I think we are talking about different things.  At the moment, let me address the p3 issue...  I love the p3, it has everything that is necessary, its fast, and it doesn't heat up the room.  But to go ever faster, the silicon guys had to start slanting things.  Certain instructions, the basic ones like add, adc, and, or, not, mov, cmp, etc get the serious silicon, while others get the micro-coded put-on.  By sticking to the basic set, your code fits into the groove that the CPU has been finely tuned to perform.  Add to this, some careful instruction ordering to keep multiple execution units humming, and you have code that executes considerably faster on a contemporary CPU. 

The p3 doesn't know how to take advantage of all that.  If you want the fastest code on a p3, then--hands down--use a p3-only library and optimize the @#$%@% out of it :bdg!  However, my interest is in code that runs the fastest on today's machines, but compatible all the way back to the PII.  If that code ran really terrible on a p3 :red, a compromise might be order--but that doesn't appear to be an issue in this case.

======================
By the way, there is actually another idea for an even faster szLength( ):

    - start off with the 7-bit search
    - when the "zero" if found, return if it really is a zero
    - otherwise continue from there with an 8-bit search to completion

For the vast majority of arguments to strlen( ) which are 7-bit sz, the faster search will suffice.  But as soon as bit7=1, it would switch over to 8-bit.  The 7-bit search would be unrolled like the 8-bit search, so it would be faster than any of the 7-bit impl we have seen so far.

Title: Re: szLen optimize...
Post by: Phil on June 22, 2005, 05:51:24 AM
Codewarp: Thanks for the 7-bit to 8-bit suggestion. I've been considering ways to fix FStrLen so it can handle 8-bit ASCII.

I've also found this All About Strings (http://win32asmcommunity.net/phpwiki/index.php?pagename=AllAboutStrings) link that was written by tenkey, roticv, and others. It also contains many algorithms that aren't in our tests yet.

To make sure we are talking about the same thing, can you download the test suite and post the results on your machine? You said earlier that it's not good to use BSR because it's slow but the routines that are using it in this test suite on the P3 I am using appear to be faster than the one that doesn't.  I understand what you are saying about many non-crucial instructions being relagated to microcode and that can, in some instances, slow them down considerably. However, the bit instructions are crucial to many operating systems and the trace cache might just help make it fast enough in short loops like this that it might be okay to use. I'm just looking for results that confirm much of what you are saying. It seems that you are quite happy with szLength as it is and it is faster on the Athlon XP 3000+. I don't recall seeing any results for these recent tests from a PIV yet and I'm curious to know what the results would be. In trying to determine where the differences are I'm guessing that the SBB might be slowing your routine down on my machine ... but then, I think it is also slow on the PIV.

I'm going to play with a new test that incorporates some of the procedures described in the previous link and see if I can fix the FStrLen procedure so that handles 8-bit ASCII. For me, this is all about learning more about the various architectures, limitations, and advantages and certainly what you have said has been quite helpful. Thanks again.

It's okay if you are using Linux and can't run the tests. It's, obviously, okay too if you just don't have the time or if you just don't want to. I had offered earlier to produce the results of your benchmark on this machine if you could zip it up and post it but I obviously can't do that if its not Windows or Dos. It just helps to know some of the story behind the story sometimes. I am reading what you are saying, understanding, and learning as much as I can ... but without an apples to apples comparison of the same procedures in different orchards (various machine architectures) our words are just that. Food for thought.

I would also like to see a new thread for your memchr algorithm as well. I'm sure others would also be interested.
Title: Re: szLen optimize...
Post by: Codewarp on June 22, 2005, 08:30:38 AM
Phil --

What's happening is this:  I thought your tests are not valid because Jen-fast/slow are both 7-bit routines.  You are pitting 8-bit strlen( ) calls (i.e. szLength( ) and ratch( )) against 7-bit routines, then declaring the 7-bit routines the fastest--that's utter nonsense, I thought.  But I had actually misinterpreted Jens as 7-bit, but it was actually 8-bit, creating confusion in my mind--my apologies Phil.  The only difference between szLength loop and Jens (now) is szLength uses NOT EDX, and Jens uses XOR EDX, ECX, for the same effect. The NOT is necessary in later processors to avoid a register dependency and subsequent slowdown.

Further, don't get hung up on one SBB instruction at the very end--the loop is where all the action is.  BSR remains a poor choice, and you could speed up Jens a tiny amount using the byte locator from my code.
Title: Re: szLen optimize...
Post by: Phil on June 22, 2005, 08:42:54 AM
Codewarp: I certainly hope that are not raving mad! Both Jens_fast and Jens_slow handle 8-bit extended ASCII. JimG put in the validation routine before the timing tests and added the 999 byte string with 8-bit ASCII. I removed the 7-bit FStrLen test.

It's okay, Bud. You can can be right and have your cake too. I understand.



Title: Re: szLen optimize...
Post by: ic2 on June 22, 2005, 01:00:17 PM
QuoteI not taking credit for anything other than finding the link.  Here is where I founded Jens Duttke code.  It's the biggest discussion ever when it come to stlen in asm.  Make sure you have a big pot of coffee ready.  You got to read before you start the new thread.

http://board.win32asmcommunity.net/index.php?PHPSESSID=abcf67ef9a161ce95dd0c8f181663739&topic=4058.0
Title: Re: szLen optimize...
Post by: Jimg on June 22, 2005, 02:25:34 PM
Hmmm.  So all the big guns are just sitting back and grinning at us because they went through all this two years ago???
Title: Re: szLen optimize...
Post by: hutch-- on June 22, 2005, 02:25:39 PM
 :bg

I would not lose sleep over treading the same ground, if no-one did it you would never get improvements.
Title: Re: szLen optimize...
Post by: roticv on June 22, 2005, 02:58:10 PM
 :toothy I remember the thread very clearly, but it was nice to see another discussion on it.  :green2
Title: Re: szLen optimize...
Post by: ic2 on June 22, 2005, 03:35:47 PM
You can't expect hutch-- to direct you to threads.  I know for sure he wanted to, i could feel it in his first few posts.

Rule one, search the world first own your own.  My teacher did not give me the test before i studied.  Scientist work from ground up.  If it an new discussion going on do you think it should stop just because of an old dead one.  Life goes on with new and old members from around the world.

I learned more from this thread than the link i founded, read and posted. 

It's the little things that count...  it's all about improvements so i hope you will continue or i will never post any thread ever again to help people who seek improvement.  I know where everything is but this don't mean i know the meaning of it all.  I love searching but i love The Laboratory more.

Title: Re: szLen optimize...
Post by: hutch-- on June 22, 2005, 03:44:57 PM
The laboratory is basically a place for bashing algos to death to get them faster or smaller or smarter or whatever else can be done with them. While not everyone has the time to track the discussions in real detail, battering around the edges of algos is the way they have been made faster over time so for those who have the time and the interest, its a worthwhile passtime as it is basically research that is being shared.

I am much easier to please with string length algos, I prefer a classic byte scanner for general purpose work and when I have the luxury of working with aligned data with a buffer that is safely larger then the source in it, I use the Agner Fog versions as it is a good average performer on most hardware.

One thing that is worth stressing with algos pointed at general purpose work is to try them out across different hardware and you learn all of the joys of writing mixed model code that has to perform reasonably on most hardware.
Title: Re: szLen optimize...
Post by: Maelstrom on June 22, 2005, 03:59:10 PM
Dont wanna burst anyones bubble because, of course, Jens code rocked but there was another thread after that...
Jens came in with the original thread.  This routine was improved upon by buliaNaza.  After that Lingo12 rocked the boat with the last one posted to any thread ive seen.

Should be noted that FStrLen is, basically, the exact same routine as Lingo12's...

The continuance to Jens thread was here for reference:
http://board.win32asmcommunity.net/index.php?topic=8330.msg60805#msg60805
Title: Re: szLen optimize...
Post by: Phil on June 22, 2005, 04:19:02 PM
ic2: Wow .. what a link! The thread that just won't die! Looks like FStrLen in this thread came from buliaNaza in 2002 and then modified by Lingo! Small world we live in!

Title: Re: szLen optimize...
Post by: roticv on June 22, 2005, 05:32:00 PM
buliaNaza is the same person as lingo if I am not wrong. Oh well.
Title: Re: szLen optimize...
Post by: ic2 on June 22, 2005, 06:08:26 PM
What got me interested was timelen.
If so I'm glad he stuck around.  To me buliaNaza is one of THE greatest.
Hope we can see the new results on Intel and AMD and improvements if possible.  Old news was good news in this case.

Good luck
Title: Re: szLen optimize...
Post by: Codewarp on June 22, 2005, 06:52:58 PM
Phil,

Let me take another crack at how it is that Jens is faster on p3 while szLength is faster on Athlon 32/64.  The code fragments below are from the heart of the algo for both methods (with identical register for this discussion).  The only difference is the NOT ECX vs XOR ECX,EDX in szLength and Jens, respectively.


    ; Jens method
    lea  edx, [ecx-01010101h]
    xor  ecx, edx
    and  ecx, 80808080h
    and  ecx, edx

   ; szLength method
    lea  edx, [ecx-01010101h]
    not  ecx
    and  ecx, 80808080h
    and  ecx, edx


The problem for the Athlon is the LEA/XOR pair has a register dependency, so LEA has to finish before the XOR can start.  The NOT ECX can start at the same time as the LEA, causing its 1 cycle to disappear on every iteration (in Athlons).

The p3 may not be able to take advantage of this opportunity, so its clock cycle mix is determined by other things, like instruction times, pipelining, etc.  This is why it is next to impossible to develop single-routines that execute "the fastest" on a broad range of CPUs.

If you are a p3 fan, then Jens is for you.  If you are fan of multiple execution units and parallel computation, then methods like szLength are for you. You can't make a judgement on this one, without choosing sides and without being biased.  I freely admit to being biased toward using as much of recent advances in CPU architecture as reasonable compatibility will permit, and if there is a better place to draw the line, I would like to hear about it.
Title: Re: szLen optimize...
Post by: Jimg on June 22, 2005, 07:38:26 PM
Ok then...  would someone run the dang thing on a P5??
Title: Re: szLen optimize...
Post by: Codewarp on June 22, 2005, 08:09:02 PM
Jimg -

Here is the timelen for my Athlon64, with the shocking details.  Also, I have attached a new timelen3.zip with a new version of szLength that may do better on the p3.  I am not currently able to rebuild timelen, so if someone could add the new .exe back to this file, I would be grateful.



Running on Athlon64, family.model.step = 15.4.10

Test routines for correctness:
lszLenSSE    0    1    2    3    5    8   13   21   34   55   89  144  233  999
FStrLen      0    1    2    3    5    8   13   21   34   55   89  144  233  128
Ratch        0    1    2    3    5    8   13   21   34   55   89  144  233  999
szLength     0    1    2    3    5    8   13   21   34   55   89  144  233  999
szLen        0    1    2    3    5    8   13   21   34   55   89  144  233  999
Jens_fast    0    1    2    3    5    8   13   21   34   55   89  144  233  999

Proc/Byte    0    1    2    3    5    8   13   21   34   55   89  144  233  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

Misaligned by 0 bytes:
lszLenSSE   19   19   19   21   19   22   22   25   30   35   69   89  123  408
FStrLen      4    6    8    7    9   10   15   21   48   63   89  129  199  117
Ratch        4    7   14   16   10   10   16   22   34   70   93  132  201  779
szLength     6    6    6    6    9   12   13   18   27   59   83  126  182  708
szLen        3    7    8    9   16   20   49   63   88  129  197  309  494 2019
Jens_fast   17   17   17   17   18   19   21   26   33   65   92  134  200  858

Misaligned by 1 bytes:
lszLenSSE   19   20   19   19   19   25   25   28   33   38   71   91  125  411
FStrLen      4    6    8    7    9   10   15   21   52   68   92  131  199  120
Ratch        4    7   14   16   10   11   16   22   34   79   93  133  201  784
szLength    12   12   12   12   12   17   17   22   34   64   84  119  174  655
szLen        3    7    8    9   16   20   48   63   88  129  197  309  485 2020
Jens_fast   17   17   17   17   21   22   23   32   37   71   95  140  203  884

Misaligned by 2 bytes:
lszLenSSE   19   19   19   20   19   25   25   28   33   38   71   91  125  409
FStrLen      4    6    8    7    9   10   15   21   52   68   92  131  199  120
Ratch        4    7   14   16   10   11   16   22   34   76   93  133  201  784
szLength    12   12   12   12   12   17   17   22   52   64   84  119  174  655
szLen        3    7    8    9   16   20   47   63   88  129  197  309  485 2019
Jens_fast   17   17   17   17   21   22   23   32   37   71   95  140  203  883

Misaligned by 3 bytes:
lszLenSSE   19   20   19   19   19   25   25   29   33   38   71   92  125  409
FStrLen      4    6    8    7    9   10   17   21   52   68   92  131  199  120
Ratch        4    7   14   16   10   11   16   22   34   76   93  133  201  784
szLength    12   12   12   12   17   17   22   27   52   64   87  119  177  655
szLen        3    7    8    9   16   20   49   63   88  129  197  309  486 2019
Jens_fast   17   17   17   17   21   22   23   32   37   71   95  140  203  885




[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Jimg on June 23, 2005, 01:08:56 AM
Thanks.  What I meant was you and I both have athlons, and Phil has the P3.  We need someone with a later pentium to run the timings.

You new routine runs about 20 cycles faster for the 999 string on my Athlon.  Good job!
Title: Re: szLen optimize...
Post by: hutch-- on June 23, 2005, 01:20:30 AM
2.8 gig Prescott.


Test routines for correctness:
lszLenSSE    0    1    2    3    5    8   13   21   34   55   89  144  233  999
FStrLen      0    1    2    3    5    8   13   21   34   55   89  144  233  128
Ratch        0    1    2    3    5    8   13   21   34   55   89  144  233  999
szLength 4294942949    2    3    5    8   13   21   34   55   89  144  233  998
szLen        0    1    2    3    5    8   13   21   34   55   89  144  233  999
Jens_fast    0    1    2    3    5    8   13   21   34   55   89  144  233  999

Proc/Byte    0    1    2    3    5    8   13   21   34   55   89  144  233  996
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

Misaligned by 0 bytes:
lszLenSSE   1642949   11   11   25   15   16   31   28   47   53  142  204  663
FStrLen      2    3    5   15    6    5    9   30   33   65  109  153  234  150
Ratch        1    1   18    8    8    5    1   17   37   74  123  156  218  944
szLength     1    1    142949   12    9   25   24   29   63  107  144  221  841
szLen    42949    3    5   27   11   17   39   69  110  180  216  353  527 2186
Jens_fast42949    142949   11    4   33   26   42   59   73   79  154  210  967

Misaligned by 1 bytes:
lszLenSSE   13   11    0   11   11   43   15   31   38   56   80  178  235  794
FStrLen      3   12    7    3    5   18   18   23   43   64  127  194  264  173
Ratch    42949    1    5    742949   46   25   18   34   73  143  206  309 1156
szLength     942949   10   15   22   16   20   64   31   62  106  149  227  870
szLen    42949    5    842949   11   18   18   81   95  151  217  326  538 2196
Jens_fast    3    04294942949    4    5   37   42   92  140  184  179  300 1233

Misaligned by 2 bytes:
lszLenSSE   14   11   24   11   11   28   17   33   41   37   85  174  261  787
FStrLen      3    3   16    2    5    5   21  112   28   66  138  170  291  208
Ratch        1    1   16    7    7    5   21   23   35   49  149  183  339 1163
szLength     9   10   15    1   10   13   27   52   59   50  106  148  204  891
szLen       11    3    7    6   29   18   27   83  107  171  205  348  529 2280
Jens_fast    3   114294942949    3    5   24   71   50   71  127  208  281 1170

Misaligned by 3 bytes:
lszLenSSE   53   24   13   11   23   49   18   30   27   54  119  192  236  782
FStrLen      3    4    5    242949    4   11   32   35   66  176  192  313  167
Ratch       13    2    5    7   19    7   12   17   16   78  155  194  320 1170
szLength     9   12   22   10   15   25   22   25   57   52  109  153  221  861
szLen    42949    5    7   20   11   18   39   68  106  182  216  348  563 2324
Jens_fast    342949    442949    3    6   41   40   60  109  139  172  264 1176
Title: Re: szLen optimize...
Post by: Ratch on June 23, 2005, 01:32:20 AM
I optimized my version of STRLEN on a 32-bit Athlon. I refuse to chase a peculiar hardware speed with software .  I try to code using rules that apply to most every processor, and ignore the anomalies. You can't optimize everything all the time.  Ratch
Title: Re: szLen optimize...
Post by: Jimg on June 23, 2005, 04:45:02 AM
Codewarp--

I've found a small problem with your latest version.  Please check the lengths reported rather then the cycle times.  It has something to do with high ascii and/or a string following the test string, not sure which.

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Codewarp on June 23, 2005, 05:47:55 AM
Jimg--

No, it had to do with my a-little-too-quick transcription of the latest changes--sorry :red.  This one should be fixed now--but there's no telling what else I've broken if I can't type it straight... :wink.

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Jimg on June 23, 2005, 02:06:28 PM
Codewarp-

Still a small problem on my machine:
Test routines for correctness:
lszLenSSE    0    1    2    3    5    8   13   21   34   55   89  144  233  999
Ratch        0    1    2    3    5    8   13   21   34   55   89  144  233  999
szLength     0    0    0    0    4    8   12   20   32   52   88  144  232  996
Jens_fast    0    1    2    3    5    8   13   21   34   55   89  144  233  999
lszLenSSE    0    1    2    3    5    8   13   21   34   55   89  144  233  999
Ratch        0    1    2    3    5    8   13   21   34   55   89  144  233  999
szLength    -1   -1   -1    3    3    7   11   19   31   55   87  143  231  999
Jens_fast    0    1    2    3    5    8   13   21   34   55   89  144  233  999
lszLenSSE    0    1    2    3    5    8   13   21   34   55   89  144  233  999
Ratch        0    1    2    3    5    8   13   21   34   55   89  144  233  999
szLength    -2   -2    2    2    2    6   10   18   34   54   86  142  230  998
Jens_fast    0    1    2    3    5    8   13   21   34   55   89  144  233  999
lszLenSSE    0    1    2    3    5    8   13   21   34   55   89  144  233  999
Ratch        0    1    2    3    5    8   13   21   34   55   89  144  233  999
szLength    -3    1    1    1    5    5   13   21   33   53   89  141  233  997
Jens_fast    0    1    2    3    5    8   13   21   34   55   89  144  233  999

The previous version is working and still seems to be the fastest non-sse, even on Hutch's pentium  :wink
Title: Re: szLen optimize...
Post by: Jimg on June 23, 2005, 02:09:17 PM
Hutch-

Are the number glitches (the ones printing  4294942949) repeatable on your machine?  They don't seem to be related to any one routine?
Title: Re: szLen optimize...
Post by: hutch-- on June 23, 2005, 03:02:19 PM
Jim,

The repeat number is 42949 and it is not consistent across different runs of the test piece. I downloaded Michaels timing code so I could run the test.

The machine is a 2.8 gig Prescott on an 800 meg FSB Intel board with 2 gig of DDR400 and it runs faultlessly, particularly when making timings. It may be worth getting someone else with a reasonably late pentium to test it as well.
Title: Re: szLen optimize...
Post by: Codewarp on June 23, 2005, 07:40:01 PM
Jimg --

I guess this is what I get for keeping separate versions of szLength( ) code for c++ and masm... This one is supposed to work ::).  It has another cycle knocked out of every JNZ FOUND (with yet another align 4), and another cycle evaporated by replacing:

           OR EDX, [EAX]

with:   MOV ECX, [EAX]
          OR EDX, ECX

then hiding the MOV in the shadow of a non-dependent instruction.  Once again, if you wouldn't mind inserting the .exe for this new code and try again...  :red :red

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Jimg on June 24, 2005, 01:23:56 AM
Perfect now :bg

Here's my results, and a copy of the code with an exe for those wanting to try it without building the exe.


Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  233  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

0 byte misalignment
lszLenSSE   25   25   25   25   25   28   28   32   38   47   82  119  166  587
Ratch        8   11   12   15   14   14   20   30   64   77  103  141  219  852
szLength     8    8    9   10   11   15   17   23   34   47   89  134  199  777
Jens_fast   20   20   20   20   21   26   29   36   57   69   99  145  217  925

1 byte misalignment
lszLenSSE   28   28   28   28   29   30   31   33   41   54   92  125  177  617
Ratch        7   10   12   15   18   17   23   32   69   85  108  154  240  952
szLength    13   14   14   16   15   20   19   26   56   67   92  135  201  782
Jens_fast   20   20   21   20   24   28   32   40   62   76  105  153  233  999

2 byte misalignment
lszLenSSE   28   28   28   28   28   30   30   31   42   55   92  124  176  621
Ratch        8   10   11   15   18   16   23   32   69   88  109  155  243  953
szLength    15   13   15   15   15   19   21   29   39   52   92  135  200  783
Jens_fast   19   19   19   21   24   28   32   41   61   77  105  155  235 1002

3 byte misalignment
lszLenSSE   27   27   28   29   28   31   29   35   43   56   91  124  175  626
Ratch        7   11   12   15   18   16   24   32   69   86  110  155  243  953
szLength    13   16   16   15   19   18   24   29   41   52   94  134  202  790
Jens_fast   19   19   19   20   24   28   32   40   61   75  104  153  230  995




[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: hutch-- on June 24, 2005, 02:13:07 AM
This looks a lot better, I just ran the EXE and there are no "funny" numbers.

PIV Prescott 2.8 gig, 800 meg FSB board with 2 gig of DDR400.


Test routines for correctness:
0 byte misalignment
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  233  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  233  999
szLength     0    1    2    3    5    8   13   22   39   55   89  144  233  999
Jens_fast    0    1    2    3    5    8   13   22   39   55   89  144  233  999
1 byte misalignment
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  233  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  233  999
szLength     0    1    2    3    5    8   13   22   39   55   89  144  233  999
Jens_fast    0    1    2    3    5    8   13   22   39   55   89  144  233  999
2 byte misalignment
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  233  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  233  999
szLength     0    1    2    3    5    8   13   22   39   55   89  144  233  999
Jens_fast    0    1    2    3    5    8   13   22   39   55   89  144  233  999
3 byte misalignment
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  233  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  233  999
szLength     0    1    2    3    5    8   13   22   39   55   89  144  233  999
Jens_fast    0    1    2    3    5    8   13   22   39   55   89  144  233  999

Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  233  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

0 byte misalignment
lszLenSSE   31   11   11   23   12   15   27   20   26   49   54  154  193  665
Ratch        2    1    5   -4   10    5   22   23   41   71  118  146  207  900
szLength   -10    1    1    2   17    9   15   31   42   66  102  145  237  895
Jens_fast    9   -1   -1    6    4    5   32   40   69   74   79  152  214  994

1 byte misalignment
lszLenSSE   13   25   16   13   23   16   15   29   24   59   80  174  237  779
Ratch        1   16    6    7    7   -6   13   63   48   79  135  196  310 1180
szLength    10   22    9   17   23   16   19   34   40   61  107  160  227  860
Jens_fast    4    0   -1   -1   -7    6   12   53   69   99  144  172  269 1173

2 byte misalignment
lszLenSSE   25   12   13   12   11   15   53   21   24   57   85  168  247  788
Ratch       12    2    6    4   22    6   35   64   42   67  136  204  310 1171
szLength    21   11   19   10   -1   19   18   16   37   71  107  149  226  885
Jens_fast    3   -1  -12   -1    3    7   62   40   69   85  126  193  282 1184

3 byte misalignment
lszLenSSE   27   11    0   12   12   27   19   20   48   43  126  200  281  813
Ratch        2    1    6   -5   11    5   26   22   86   59  136  219  291 1140
szLength    -2   17   10   21   15   14   31   31   77   86  132  149  231  837
Jens_fast  -10   -1    0   -1   14    5   34   51   94   92  116  182  252 1169
Title: Re: szLen optimize...
Post by: Phil on June 24, 2005, 03:02:09 AM
Looking great guys! Hutch, the funny numbers that you saw in the previous run, 42949's, were an attempt to display unsigned numbers in a 5 character field when print ustr$(ebx) is working correctly. The timelen.asm source files should be corrected so they use 'print sstr$(ebx)' instead of the unsigned version that I used incorrectly. The 42949 is actually the first 5 digits of -1 when it is displayed as unsigned. My mistake.

JimG and Codewarp: Glad to see you are moving right along here. I'm sorry if I slowed the flow here trying to understand things that are probably still just a bit beyond my abilities at the moment. Thank you all for your patience and help. I'll just keep re-reading the posts and scratching my head occasionally until it makes a little more sense to me. I'm still working on understanding how to cure the register stalls that Hutch had pointed out in some other code that I'm working on ... And, I just understand things a lot better when I see results like these posted that generally agree with the words and symbols that I'm trying to fit into my mind. Thanks again.
Title: Re: szLen optimize...
Post by: Codewarp on June 24, 2005, 03:03:21 AM
Would it be easy/legal/appropriate to incorporate the IdCPU code into the developing standard benchmarking code being used here on the strlen( ) code?  That way, every report says what it is--it would also be cool...
Title: Re: szLen optimize...
Post by: hutch-- on June 24, 2005, 03:04:03 AM
Thanks Phil, for a moment I thought my PIV had developed a maths bug.  :bg
Title: Re: szLen optimize...
Post by: roticv on June 24, 2005, 03:23:35 AM
I am quite surprised that Jens_mmx version is not found in the test bed (Too bad the graphs that used to be found there were gone).


;-----------------------------------------------------------------------
; strlen - length of a zero-terminated string (Jens_mmx variant)
; ABI:   Win32 stdcall; lpString = address of string; length returned in eax
; Clobb: ecx, edx, mm0-mm6 (emms executed on the MMX path), flags
; Note:  the MMX loop reads up to 47 bytes past the terminator; this can
;        fault if the string ends within 47 bytes of an unmapped page.
;-----------------------------------------------------------------------
strlen proc lpString:DWORD
    push    ebx                         ; CPUID clobbers EBX, which is
    mov     eax, 1                      ;   callee-saved under stdcall --
    cpuid                               ;   the original left it trashed
    pop     ebx                         ; feature flags are now in edx

    ;- Pre-scan bytewise to align the scan pointer on 8 bytes ----
    mov     ecx, lpString
    mov     eax, ecx
    cmp     byte ptr [eax], 0           ; empty string?
    je      done
    and     ecx, 0FFFFFFF8h             ; round start address down to 8
    add     ecx, 8                      ; next 8-byte boundary
    sub     ecx, eax                    ; ecx = bytes to boundary (1..8)
    cmp     ecx, 8
    je      aligned                     ; already aligned
@@:
    inc     eax                         ; byte scan up to the boundary
    cmp     byte ptr [eax], 0
    je      done
    dec     ecx
    jnz     @B
aligned:
    mov     ecx, eax                    ; ecx = aligned scan pointer
    ;-----------------------------------------

    test    edx, 800000h                ; CPUID.1:EDX bit 23 = MMX present
    jz      no_mmx                      ; fall back to the dword scanner
    pxor    mm0, mm0                    ; mm0 = all-zero comparand

@@:
    movq    mm1, qword ptr [ecx]        ; examine 48 bytes per iteration
    movq    mm2, qword ptr [ecx + 8]
    movq    mm3, qword ptr [ecx + 16]
    movq    mm4, qword ptr [ecx + 24]
    movq    mm5, qword ptr [ecx + 32]
    movq    mm6, qword ptr [ecx + 40]

    pcmpeqb mm1, mm0                    ; 0FFh in every byte that was zero
    pcmpeqb mm2, mm0
    pcmpeqb mm3, mm0
    pcmpeqb mm4, mm0
    pcmpeqb mm5, mm0
    pcmpeqb mm6, mm0

    por     mm1, mm2                    ; fold the six masks into mm1
    por     mm3, mm4
    por     mm5, mm6
    por     mm1, mm3
    por     mm1, mm5

    add     ecx, 48

    packsswb mm1, mm1                   ; squeeze mask into a dword
    movd    eax, mm1
    test    eax, eax
    jz      @B                          ; no zero byte in this block

    sub     ecx, 48                     ; rescan the hit block exactly
                                        ;   with the dword loop below
    emms                                ; empty MMX state for FPU users
no_mmx:

@@:
    mov     eax, dword ptr [ecx]
    add     ecx, 4

    lea     edx, [eax - 01010101h]      ; classic zero-in-dword test:
    xor     eax, edx                    ;   result nonzero iff some byte
    and     eax, 80808080h              ;   of the loaded dword was zero
    and     eax, edx
    jz      @B

    bsf     edx, eax                    ; bit index of first hit (8k+7)

    sub     edx, 4
    shr     edx, 3                      ; convert to byte index 0..3

    lea     eax, [ecx + edx - 4]        ; eax = address of the terminator

done:

    sub     eax, lpString               ; length = terminator - start

    ret
strlen endp


Bitrake's strlen for AMD Athlon and small strings could not be found too.


; BitRake's MMX strlen macro (Athlon-tuned, per the author's footnotes).
; Leaves the string length in eax.
; Clobbers ebx, ecx, edx, mm0, mm1 and flags; no emms is issued, so the
; caller must handle FPU/MMX state itself -- TODO confirm against caller.
; Scans 16 bytes per iteration; like the other dword/qword scanners here
; it reads past the terminator, so it can over-read up to 15 bytes.
StrLen MACRO lpString:REQ
LOCAL _0,_1
mov ecx,lpString                ; ecx = scan pointer
pxor MM0,MM0                    ; MM0 = zero comparand for [ecx]
pxor MM1,MM1                    ; MM1 = zero comparand for [ecx+8]

mov ebx,16                      ; loop stride kept in a register
ALIGN 16                        ; branch target on a 16-byte boundary
_0: pcmpeqb MM1,[ecx+8]         ; 0FFh per zero byte in high qword
pcmpeqb MM0,[ecx]               ; 0FFh per zero byte in low qword
nop                             ; padding for decode packaging

add ecx,ebx                     ; advance by 16
packsswb MM1,MM1                ; pack word-pairs to bytes: a packed
packsswb MM0,MM0                ;   byte is nonzero iff either source
                                ;   byte of its pair matched zero
movd edx,MM1                    ; edx = mask for bytes 8..15
movd eax,MM0                    ; eax = mask for bytes 0..7
or edx,eax                      ; any zero byte in the 16-byte block?

je _0                           ; no -- keep scanning
bsf eax,eax                     ; hit in the low qword? (ZF=1 if eax=0;
jne _1                          ;   note MM1/MM0 still hold stale masks
add ecx,8                       ;   cleared only on the next pcmpeqb)
bsf eax,edx                     ; hit was in the high qword
_1: sub ecx,lpString            ; bytes consumed past the start
shr eax,2                       ; bit index -> byte index within qword

lea eax,[ecx+eax-16]            ; undo the final add; eax = length
ENDM

His footnotes says
"- Instructions packaged/aligned to 8 bytes offer highest decode bandwidth.
- Branch targets aligned to 16 bytes boundaries
- Use when average string is >32 bytes"
Title: Re: szLen optimize...
Post by: Mark Jones on June 24, 2005, 05:07:43 AM
Codewarp, I think MichaelW is working on that. :)
Title: Re: szLen optimize...
Post by: MichaelW on June 24, 2005, 06:45:55 AM
Quote from: Codewarp on June 24, 2005, 03:03:21 AM
Would it be easy/legal/appropriate to incorporate the IdCPU code into the developing standard benchmarking code being used here on the strlen( ) code?  That way, every report says what it is--it would also be cool...

I am working on it, but I have a problem with obtaining a brand identification string for recent Intel processors that return a brand index of zero. Unlike the AMD processors, where CPUID functions 80000002h-80000004h return a 48-byte processor name string (starting with the K5 Model 1), the Intel processors return a brand string that encodes the rated FSB frequency and the multiplier. The name string is not absolutely necessary, but I would like to provide a nice "friendly" name for all of the recent processors, and I would like to use a method that would not require constant updating. If Intel would just follow in AMD's footsteps for a change :bg
Title: Re: szLen optimize...
Post by: Jimg on June 24, 2005, 02:42:56 PM
roticv-

Thanks for the new routines.  Jens_mmx is clearly faster on large strings (>250 bytes or so).  The CPUID instruction is too big a penalty to pay on small strings.  I've included it in the test routines.  I also tried it without the cpuid (Jens_mmx2, not currently selected for test), and that made it better for strings about 150 or so.  Bitrake's routine is just too specialized for the general purpose test being run here as we are testing alignment errors as well as raw speed.

New code, updated per Phil's correction and added Jens mmx-

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: roticv on June 24, 2005, 04:59:32 PM
The penalty for CPUID is ~500 and I don't think it should be included in the strlen routine as donkey's routine does not include it anyway.

When I have time, I would tweak my own routine and see how it compares. Below are the timings for different routines on my computer.

0 byte misalignment
lszLenSSE   35   27   20   36   35   30   37   44   56   72  160  228  305  990
Jens_mmx   546  605  616  606  612  612  614  630  621  625  643  653  694 1031
Ratch       20   14   22   21   25   19   31   41   64   83  151  195  293 1003
szLength    15   22   15   15   10   25   27   28   58   68  136  171  270  908
Jens_mmx2   11   53   53   54   57   61   62   69   78   73   96  108  153  497

1 byte misalignment
lszLenSSE   19   28   26   33   34   30   40   44   57   84  125  254  364 1172
Jens_mmx   552  553  558  557  569  657  646  657  654  666  674  703  745 1077
Ratch       19   14   24   19   27   20   31   37   64   85  180  272  426 1575
szLength    26   17   26   22   25   36   26   40   56   67  139  190  274  923
Jens_mmx2   16   15   28   25   56   79   95  104  121  114  138  169  197  533

2 byte misalignment
lszLenSSE   35   32   27   28   26   30   43   36   55   65  122  259  361 1154
Jens_mmx   552  541  544  544  569  638  635  654  659  656  669  693  719 1066
Ratch        3   14   24   19   11   26   33   37   61   90  185  275  428 1566
szLength    26   18   27   19   12   31   25   41   55   66  148  184  277  914
Jens_mmx2   11   14   29   32   34   95   90   94  118  107  132  155  192  534

3 byte misalignment
lszLenSSE   36   32   27   34   34   29   38   37   51   82  123  272  372 1178
Jens_mmx   544  548  558  546  575  629  642  649  654  646  675  686  723 1050
Ratch       11   22   17   27   19   34   34   40   75  106  181  302  429 1538
szLength    11   27   18   28   24   31   37   41   97  115  155  189  275  920
Jens_mmx2   10   23   21   31   32   76   93   91  114  102  127  149  182  523


Jens' mmx version sure thrash the rest.
Title: Re: szLen optimize...
Post by: roticv on June 24, 2005, 07:07:45 PM
I tried with my own routine, and it gives weird results.

0 byte misalignment
lszLenSSE   19   35   27   35   28   36   38   46   56   73  147  220  301 1001
roticv2      9   13   15   17   17   43  106  131  187  239   65  545  866 3454
Ratch       19   14   24   18   26   19   31   36   62   80  148  195  288  994
szLength    15   23   16   22   17   27   24   36   61   85  140  186  264  913
Jens_mmx2   18   54   51   53   59   54   63   70   93   71  114  104  181  489

1 byte misalignment
lszLenSSE   27   35   34   27   34   40   38   37   44   78  128  259  347 1171
roticv2     17   13   15   10   27   35   99  136  184  244  168  538  863 3456
Ratch       11   20   17   12   20   27   31   33   71   98  184  270  420 1558
szLength    18   26   25   25   20   30   34   33   58   67  137  190  267  924
Jens_mmx2   11   15   13   33   57  101   86  103  121  109  153  160  202  534

2 byte misalignment
lszLenSSE   22   34   28   27   35   38   50   49   57   70  122  258  344 1155
roticv2      9   19   15   28   22   38  101  132  129  151  355  187  235  635
Ratch       11   14   12   20   12   30   33   33   63   72  184  265  439 1570
szLength    14   25   23   24   19   29   25   29   56   75  138  198  268  904
Jens_mmx2   19   15   29   44   33   79   91   95  102   89  131  154  189  541

3 byte misalignment
lszLenSSE   35   27   32   35   35   32   23   45   63   86  119  265  372 1173
roticv2     10   13   20   17   30   42   86  113  189  231  137  549  882 3459
Ratch       11    6   18   13   24   19   31   32   69  107  187  301  433 1531
szLength    27   23   12   28   22   31   38   34  102  115  149  190  276  922
Jens_mmx2   20   14   30   24   39   91   82   99  111   95  135  149  179  531


My code is
; roticv2 - SSE2 strlen: bytewise pre-scan to a 16-byte boundary, then
; pcmpeqb/pmovmskb over aligned 16-byte blocks. Length returned in eax.
roticv2 proc lpstring:dword
;int 3
mov eax, [esp+4]            ; eax = string pointer (frameless access)
cmp byte ptr[eax], 0        ; empty string?
mov ecx, eax
jz done
; BUGFIX: original masked with 0FFFFFFE0h (32-byte granularity) while
; aiming for 16-byte alignment, so any start address with
; (addr mod 32) >= 16 made ecx <= 0 and the byte scanner below ran over
; the whole string -- the cause of the "weird results" timings.
and ecx, 0FFFFFFF0h         ; round down to 16
add ecx, 16                 ; next 16-byte boundary
sub ecx, eax                ; ecx = bytes to boundary (1..16)
cmp ecx, 16
jz aligned                  ; already 16-byte aligned
@@:
add eax, 1 ;Simple byte scanner for alignment
cmp byte ptr[eax],0
jz done
sub ecx, 1
jnz @B
aligned:
pxor xmm1, xmm1             ; xmm1 = zero comparand
align 16
@@:
movdqa xmm0, [eax]          ; aligned load -- safe: stays in the page
pcmpeqb xmm0, xmm1          ; 0FFh per zero byte
add eax, 16
pmovmskb ecx, xmm0          ; one mask bit per byte
test ecx, ecx
jz @B                       ; no terminator in this block
bsf ecx, ecx                ; index of first zero byte in the block
lea eax, [eax+ecx-16]       ; eax = address of the terminator
done:
sub eax, [esp+4]            ; length = terminator - start
retn 4
roticv2 endp
Title: Re: szLen optimize...
Post by: Codewarp on June 24, 2005, 07:35:00 PM
Nice, Roticv, I was wondering about an mmx implementation that didn't require sse...  The cpuid may be a moot point, because you can save it once in memory, and read it many.  It seems to me that some standard startup code should be grabbing/saving this value for subsequent access anywhere in the code--in 1 cycle!  The last time I checked, the cpuid doesn't exactly change during program execution ::).  On the other hand, what cpus don't have mmx--P1, PPro--does anyone out there use these any more? 

But maybe we still have to use cpuid, to prevent mmx use in future machines that don't have it.  If so, standard startup code really must be doing this.  Such code can also abort execution if run on machines outside the supported set.  Low level routines such as this must not be spending any more time on cpuid issues than actually required.

Roticv, I could not find any mention of the cpu type you ran on--what is it? :naughty:
Here is the run for my Sempron 2800+, cpuid is not nearly so bad, but still intolerable:


Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  239  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

0 byte misalignment
lszLenSSE   26   25   25   25   25   28   28   32   38   47   82  119  167  591
Jens_mmx    82  112  111  111  113  118  121  129  157  122  169  139  201  361
Ratch        7   11   12   15   14   14   20   30   62   78  101  142  230  856
szLength     8    8    8   10   12   15   17   24   34   47   91  131  204  782
Jens_fast   20   20   20   20   21   26   29   36   57   70   99  146  220  929

1 byte misalignment
lszLenSSE   28   28   28   28   28   30   31   34   41   54   93  126  178  620
Jens_mmx    82   83   87   90   98  146  148  156  173  156  183  212  230  374
Ratch        7   11   12   15   18   15   23   32   67   86  109  155  255  954
szLength    13   14   14   16   15   19   20   26   40   51   92  135  208  787
Jens_fast   20   20   20   20   24   28   32   40   62   76  105  158  238 1000

2 byte misalignment
lszLenSSE   28   28   28   28   28   30   30   32   41   55   92  123  176  623
Jens_mmx    82   83   87   90   98  142  144  156  169  148  163  195  212  362
Ratch        9   11   12   15   18   15   23   32   69   86  109  155  255  953
szLength    14   14   16   16   15   20   20   29   40   51   92  136  208  785
Jens_fast   20   20   20   20   24   28   32   40   61   76  105  154  238 1000

3 byte misalignment
lszLenSSE   28   28   28   28   28   31   30   35   43   56   90  123  174  626
Jens_mmx    82   83   87   90   98  128  136  143  153  131  177  188  205  349
Ratch        7   11   12   15   18   15   23   33   70   87  109  155  254  953
szLength    14   16   16   15   19   20   24   29   41   51   94  136  209  787
Jens_fast   20   20   20   20   24   28   32   40   61   76  105  154  239 1001
Title: Re: szLen optimize...
Post by: Jimg on June 24, 2005, 08:55:20 PM
Ok, time to show my ignorance.

When trying roticv's routine, masm chokes on the movdqa line-

Assembling: F:\WinAsm\Progs\FastStringLength\timelen4\timelen.asm
F:\WinAsm\Progs\FastStringLength\timelen4\timelen.asm(337) : error A2008: syntax error : xmm

What's the solution for this?
Title: Re: szLen optimize...
Post by: Codewarp on June 24, 2005, 09:58:00 PM
The Jens-mmx code is not yet practical, because it can read up to 40 bytes beyond the end of memory.  The best way to resolve this is to align to 32 bytes instead of 8 bytes, then process 32 bytes at a time, instead of 48.  It makes it run faster too, probably because it lays down for the caches.  Because memory blocks are multiples of 4k, they always end on a 32-byte boundary.  Since 32 byte alignment can take up to 31 iterations, first align to 4 bytes, then align to 32 bytes, 4 at a time.  This improves short and long string speed.
Title: Re: szLen optimize...
Post by: roticv on June 25, 2005, 02:46:19 AM
I am a using a Celeron(R) 2.40GHz that comes with SSE3.

Anyway, you have to use a newer version of ml in order to compile movdqa. I'm using Microsoft (R) Macro Assembler Version 7.00.9466.



[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Phil on June 25, 2005, 03:04:51 AM
roticv: Here are the results of running your latest strlen.exe on a 996 MHz P3. There is a problem with 'correctness' with 2-byte misalignment. I also cannot assemble the source because I only have access to the earlier ML version.

Test routines for correctness:
0 byte misalignment
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticv2      0    1    2    3    5    8   13   22   39   55   98  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999
1 byte misalignment
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticv2      0    1    2    3    5    8   13   22   39   55   98  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999
2 byte misalignment
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticv2      0    1    2    3    5    8   13   22   48   64   89  144  239 1008
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999
3 byte misalignment
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticv2      0    1    2    3    5    8   13   31   39   55   98  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999

Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  239  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

0 byte misalignment
lszLenSSE   16   16   16   16   17   19   19   22   29   48   63   86  116  403
roticv2      4   10   12   16   12   44   58   87  136  185   49  453  736 3019
Ratch       18   25   32   40   29   25   35   58   78   92  106  144  245  871
szLength    19   19   19   19   23   25   30   37   52   79  116  174  263 1025
Jens_mmx2    7   34   34   34   39   43   45   67   85   49   98   66  135  320

1 byte misalignment
lszLenSSE   16   16   16   16   16   19   19   22   32   60   92  117  184  648
roticv2      4   10   13   16   22   44   58   67  136  185  100  453  738 3017
Ratch       18   25   32   39   29   25   35   58   86  110  124  191  326 1150
szLength    24   25   25   28   28   30   34   41   72   87  119  177  274 1033
Jens_mmx2    7   12   17   18   23   64   73   78  114   77  126  144  169  349

2 byte misalignment
lszLenSSE   16   16   16   16   16   26   26   40   37   54   81  113  173  627
roticv2      4   10   13   16   22   44   58   87   73   77  287  111  135  328
Ratch       18   25   32   39   29   25   57   79   86   96  135  182  303 1140
szLength    25   25   28   28   28   31   34   45   72   87  119  177  274 1033
Jens_mmx2    7   12   17   18   23   66   70   81  112   75  124  142  167  347

3 byte misalignment
lszLenSSE   16   16   16   16   16   19   19   22   32   60   92  117  184  648
roticv2      4   10   13   16   22   44   58   66  138  185   96  453  736 3017
Ratch       18   25   32   39   29   25   35   58   86  110  124  191  326 1149
szLength    25   28   28   28   30   31   39   45   72   89  127  177  270 1032
Jens_mmx2    7   12   17   18   23   51   57   65   96   59  110  127  145  330

Press enter to exit...


It is odd that roticv2 only fails the correctness test with 2-byte misalignment when the string 999 contains extended ASCII characters.
Title: Re: szLen optimize...
Post by: roticv on June 25, 2005, 03:10:55 AM
That's extremely weird.

I get
Test routines for correctness:
0 byte misalignment
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticv2      0    1    2    3    5    8   13   22   39   55   89  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999
1 byte misalignment
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticv2      0    1    2    3    5    8   13   22   39   55   89  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999
2 byte misalignment
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticv2      0    1    2    3    5    8   13   22   39   55   89  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999
3 byte misalignment
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticv2      0    1    2    3    5    8   13   22   39   55   89  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999


When I run it on my machine. Will take a look into it. It probably has to do with the alignment and stuff like that.
Title: Re: szLen optimize...
Post by: Jimg on June 25, 2005, 02:26:45 PM
To anyone having the same problem I did with movdqa-

movdqa requires masm 6.15 or better.  Version 6.14 distributed with Hutch's masm32 distribution won't do.  6.15 is available from his site.

There, now the information will show up on a search :wink
Title: Re: szLen optimize...
Post by: Jimg on June 25, 2005, 02:36:29 PM
Victor-

I'm getting the same results as Phil for the 2 byte misalign on the 999 string, and the same large cycle count for the other three.  Also notice there is an error at the 3-byte misalignment for string 22. Also try a string length of 39.

After more testing, it gets even weirder.

Using these test strings:
    SIZES TEXTEQU <7,8,9,10,11,12,13,14,15,16,17,LastString>
I get:
Test routines for correctness:
0 byte misalignment
roticv2      7   17   18   19   20   21   22   23   48   16   17  999
1 byte misalignment
roticv2      7    8    9   10   11   12   13   14   15   16   17 1008
2 byte misalignment
roticv2      7    8    9   10   11   12   13   14   15   16   17 1008
3 byte misalignment
roticv2      7    8    9   10   11   12   13   14   15   16   17 1008


but using these strings:
    SIZES TEXTEQU <5,8,9,10,11,12,13,14,15,16,17,LastString>
I get:
Test routines for correctness:
0 byte misalignment
roticv2      5    8    9   10   11   12   13   14   15   16   17  999
1 byte misalignment
roticv2      5    8    9   10   11   12   13   14   15   16   17 1008
2 byte misalignment
roticv2      5    8    9   10   11   12   13   14   15   16   17 1008
3 byte misalignment
roticv2      5    8    9   10   11   12   13   14   15   16   17 1008

I just can't see how this can be happening???
Title: Re: szLen optimize...
Post by: Jimg on June 26, 2005, 02:40:54 AM
temporarily deleted
Title: Re: szLen optimize...
Post by: Codewarp on June 26, 2005, 06:08:00 AM
As far as I know, mmx is supported in all Pentiums except for P1 and PPro.  Does anyone out there know of any examples to the contrary?  If not, I am inclined to drop the cpuid test, but if there are newer 32-bit cpus that don't have mmx, then it must be included with strlen( ) versions intended for use in applications.
Title: Re: szLen optimize...
Post by: roticv on June 26, 2005, 07:33:09 AM
Ah it is not weird now. I figured out what is wrong.

The instruction I was using was an SSE2 instruction, but your processor interpreted it as an SSE instruction because it does not recognise it (the only difference is the 66h prefix to the instruction).

Therefore it did not work as expected on both your processors.
Title: Re: szLen optimize...
Post by: Phil on June 26, 2005, 03:25:24 PM
Well, I'm glad you solved that mystery! I had expected an illegal instruction exception when I ran the program but it just ran without complaint! Does this mean that it would be impractical to write an SSE2/SSE3 emulator? I suppose so, since the processor didn't seem to notice that anything was wrong in its instruction sequence. I'm thinking back to ancient times when you could run programs that used FPU instructions even if you didn't happen to have one. Now it looks like it's all up to the programmer to check capabilities if they are using SSE2/SSE3 to use a less advanced routine to ensure compatibility with older machines. I would have prefered seeing an illegal instruction exception!

Title: Re: szLen optimize...
Post by: roticv on June 26, 2005, 04:15:07 PM
Btw even Ollydbg 1.10 did not recognise the SSE2 instruction and decoded it wrongly. It is sad that such things do happen. Do remind me to only use up to SSE instructions next time  :P

Well, it might be good or bad depending on how you look at it. It is not nice for someone to run a program and get an unknown opcode error just because his/her processor does not support it. Most probably he/she will not know what happened.

I think the programmer has to be proactive in ensuring that his target users have the instruction set before running it. I think I am a lousy programmer  :toothy Haven't been coding in asm for quite some time. Coding mainly in C, solving programming qn.
Title: Re: szLen optimize...
Post by: Codewarp on June 26, 2005, 08:04:46 PM
Roticv--

While we have you in this vulnerable, contrite state, let me suggest another little change to your code, to make it faster on really short strings...


roticv2 proc lpstring:dword
;-----------------------------------------------------------------------
; size_t roticv2(const char *s) -- length of a NUL-terminated string.
; In:    [esp+4] = pointer to string (STDCALL stack arg, retn 4)
; Out:   eax     = length in bytes, excluding the terminating NUL
; Clobb: ecx, xmm0, xmm1, flags
; NOTE(review): movdqa/pcmpeqb/pmovmskb on xmm registers are SSE2, as
; discussed in this thread -- on SSE-only processors the 66h prefix is
; ignored and the routine silently returns wrong results.
;-----------------------------------------------------------------------
;int 3
mov  eax, [esp+4]   ; removed your 1st test for zero, it's coming up soon most of the time anyway
test   eax, 15          ; removed unnecessary code: low 4 bits zero => 16-byte aligned
jz      aligned
@@:
cmp  byte ptr [eax], 0   ; byte scanner until aligned; NUL here means we are done
jz     done
add   eax, 1
test  eax, 15     ; simplified alignment test (vs the 32-byte arithmetic before)
jnz   @B
aligned:
pxor xmm1, xmm1          ; xmm1 = 16 zero bytes to compare against
align 16
@@:
movdqa     xmm0, [eax]   ; load 16 aligned bytes (overread stays inside the page)
pcmpeqb   xmm0, xmm1     ; each zero byte becomes 0FFh
add           eax, 16
pmovmskb ecx, xmm0       ; collapse to one mask bit per byte
test          ecx, ecx
jz             @B        ; no NUL in this chunk, keep scanning
bsf           ecx, ecx        ; nice use of bsf: index of first NUL in the chunk
lea           eax, [eax+ecx-16]  ; eax = address of the NUL
done:
sub          eax, [esp+4]    ; length = NUL address - start address
retn  4
roticv2 endp
Title: Re: szLen optimize...
Post by: Jimg on June 27, 2005, 12:01:57 AM
Codewarp-

Unless this was a joke and I just didn't get it, the code you just posted doesn't give the correct answers.  I changed its name to roticv3 to avoid conflict with 2 that I'm still looking at.  The results:
Test routines for correctness:
0 byte misalignment
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticv3      0    1    2    3    5   17   22   22   39   55   98  144 1255  999
1 byte misalignment
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticv3      0    1    2    3    5    8   13   22   48   64   98  144  239 1008
2 byte misalignment
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticv3      0    1    2    3    5    8   13   31   48   64   98  144  239 1008
3 byte misalignment
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticv3      0    1    2    3    5    8   13   31   48   64   98  144  239 1008

Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  239  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

0 byte misalignment
szLength     9    8    9    9   12   16   17   24   34   48   89  132  204  783
roticv3     26   26   27   26   26   28   28   29   30   35   47   74  370  301

1 byte misalignment
szLength    13   14   14   15   15   19   21   26   40   52   91  134  207  785
roticv3      7   11   16   21   32   46   84   78   87   88   95  112  153  365

2 byte misalignment
szLength    14   14   16   16   15   19   20   29   40   52   91  136  207  784
roticv3      6   11   16   20   31   46   84   78   82   85   95  111  153  364

3 byte misalignment
szLength    14   16   16   16   19   20   24   30   42   51   94  135  209  790
roticv3      7   11   16   21   31   46   72   75   78   90   93  106  150  360

Press enter to exit...
Title: Re: szLen optimize...
Post by: Codewarp on June 27, 2005, 05:03:59 AM
Jimg--

No joke, but neither did I attempt to fix the sse2 issue (movdqa instruction).  I just tightened up on the initial byte scan.  Otherwise I don't see any errors in the code.  I would never knowingly post bad code, but I might unknowingly do it...  Are you using an sse2 capable machine.
Title: Re: szLen optimize...
Post by: Jimg on June 27, 2005, 01:53:22 PM
Duh...  Now I get it.  Sorry.  Even though Intel has 90% of market to AMD's 10% or so, and it's interesting to test this stuff out here in the laboratory, I wouldn't think sse2 would be a good choice for a general purpose routine just yet.
Title: Re: szLen optimize...
Post by: Jimg on June 27, 2005, 03:52:23 PM
I modified Jens_mmx a little, and am getting some incredible times on the longer strings. I've looked and can't find out how it's cheating on the rest of the routines.  Is there something going on here I don't see?
Test routines for correctness:
0 byte misalignment
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_fast    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticvSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999
1 byte misalignment
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_fast    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticvSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999
2 byte misalignment
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_fast    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticvSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999
3 byte misalignment
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_fast    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticvSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999

Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  239  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

0 byte misalignment
szLength     8    8   10   10   12   15   17   24   34   63   89  131  203  778
Ratch        8   11   11   13   14   14   20   30   64   77  100  143  229  853
Jens_fast   20   20   20   20   21   26   29   36   57   69   99  145  219  923
roticvSSE    4   28   28   29   28   32   32   35   39   52   80  111  166  563
lszLenSSE   25   25   25   25   25   28   28   32   39   47   83  122  167  587
Jens_mmx2    7   33   31   30   37   43   47   55   79   46   92   60  123  286

1 byte misalignment
szLength    13   14   14   16   15   19   20   26   56   67   91  134  206  784
Ratch       19   11   11   15   18   17   23   31   69   85  111  156  255  955
Jens_fast   20   20   23   20   24   28   33   41   61   76  105  158  239 1003
roticvSSE    3    7   10   12   18   58   58   51   94   71   94  125  175  583
lszLenSSE   28   28   26   28   28   40   29   34   40   56   92  126  179  625
Jens_mmx2    6    8   12   17   25   73   59   65   85   62   96  122  143  294

2 byte misalignment
szLength    14   14   16   16   15   20   20   31   40   51   92  136  210  786
Ratch        8    2   12   15   18   17   23   32   69   85  110  154  253  953
Jens_fast   20   20   20   20   24   28   32   40   63   75  105  154  237  999
roticvSSE    6    7   12   12   19   57   52   51   59   67   90  120  194  582
lszLenSSE   28   28   28   28   28   30   29   32   41   55   91  123  176  625
Jens_mmx2    8    9   12   16   26   56   54   68   86   61   95  120  140  292

3 byte misalignment
szLength    14   16   16   15   19   20   24   29   40   50   93  135  208  786
Ratch        7   10   24   15   18   16   26   32   68   87  109  157  253  951
Jens_fast   20   20   20   20   24   28   32   40   62   76  104  156  238  998
roticvSSE    4    7   10   12   18   45   46   49   54   67   89  124  171  582
lszLenSSE   28   28   28   28   28   31   30   35   43   56   89  123  173  635
Jens_mmx2    6    8   12   13   26   53   62   71   86   63  113  125  143  294

Press enter to exit...


Of course, Jens_mmx is no good for a general purpose routine as it needs 64 bytes past the possible end of the test string, but if you are writing the program and can assure enough headroom, it's fast.  For general purpose, I'll stick with szLength.

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Mark Jones on June 27, 2005, 03:59:12 PM
Nice, I get similar results with AMD XP 2500+
Title: Re: szLen optimize...
Post by: roticv on June 27, 2005, 04:19:05 PM
Quote from: Jimg on June 27, 2005, 03:52:23 PM
I modified Jens_mmx a little, and am getting some incredible times on the longer strings. I've looked and can't find out how it's cheating on the rest of the routines. Is there something going on here I don't see?

Of course, Jens_mmx is no good for a general purpose routine as it needs 64 bytes past the possible end of the test string, but if you are writing the program and can assure enough headroom, it's fast. For general purpose, I'll stick with szLength.

There are a couple of reasons why it is possible to achieve such good speed.
1) Alignment. It ensures that strings are aligned to 8bytes before getting into the main loop that scans using mmx registers. (Maybe we can make use of Codewarp's improvements to speed it up)
2) Unrolling of loops. It speeds up the routine as it unroll all the data and fit it into the L1 code cache.
3) Usage of lea is not found in the main loop. Instead it is only found in the second loop to determine where the null terminator is found. Maybe this could be improved by using pmovmskb.
4) Grouped read/compares and ors. (Rule no 2 of the advanced part of optimisation in mark larson's tips)

PS: I don't think MichaelW's timing macros are as stable as I want them to be. Oh well.
Title: Re: szLen optimize...
Post by: Codewarp on June 28, 2005, 12:27:33 AM
Quote from: roticv on June 27, 2005, 04:19:05 PM
Quote from: Jimg on June 27, 2005, 03:52:23 PM
I modified Jens_mmx a little, and am getting some incredible times on the longer strings. I've looked and can't find out how it's cheating on the rest of the routines. Is there something going on here I don't see?

Of course, Jens_mmx is no good for a general purpose routine as it needs 64 bytes past the possible end of the test string, but if you are writing the program and can assure enough headroom, it's fast. For general purpose, I'll stick with szLength.

There are a couple of reasons why it is possible to achieve such good speed.
1) Alignment. It ensures that strings are aligned to 8bytes before getting into the main loop that scans using mmx registers. (Maybe we can make use of Codewarp's improvements to speed it up)
2) Unrolling of loops. It speeds up the routine as it unroll all the data and fit it into the L1 code cache.
3) Usuage of lea is not found in the main loop. Instead it is only found in the second loop to determine where is the null terminator found. Maybe this could be improved by using pmovmskb.
4) Grouped read/compares and ors. (Rule no 2 of the advanced part of optimisation in mark larson's tips)

PS: I don't think MichaelW's timing marcos are as stable as I want it to be. Oh well.

All those reasons are ok, but the big one is this--it's damn hard and awkward to find a single byte at any alignment in any dword using the normal  cpu instructions.  But mmx is designed to operate on bigger chunks and it gets right down to it.  You can unroll, align, lea or not lea, and reorder instructions all you want, I did, but the tripling in speed is a different animal.  It gets better with each extension (mmx -->sse -->sse2 -->sse3...), but most of the improvement can be implemented with just mmx.  PMOVMSKB is a very useful instruction here, but is SSE, not MMX.  SSE requires a P3 or later, but PII's are still around.

Also, the scan overshoot is not a problem for a nondestructive operation like strlen( ), as long as it doesn't go off the end of the 4k page.  That problem is easily remedied by processing 32-byte chunks, with 32-byte alignment--goodbye page faults...

Now, it seems to me that mmx is so standard that it could be used for "everyday" use without checking every time. However, it's host library start-up code should still abort if no mmx support exists.  Can we consider the P1 and PPro dead, or are there other non-mmx pentiums out there?
Title: Re: szLen optimize...
Post by: roticv on June 28, 2005, 02:04:04 PM
Here's my 2 cents.

It is not right to take things for granted. It is better to first check whether cpuid exists by checking EFLAGS, then call CPUID. After that, set the flag for MMX/SSE/SSE2/SSE3 and from then on just compare with the flag. We only need to figure out whether the processor supports a certain extension once, then we can proceed to using the correct instruction set.

There's a reason why MMX/SSE/SSE2/SSE3 instruction sets are invented  :toothy

Let's declare Jen's MMX variant of strlen the winner.
Title: Re: szLen optimize...
Post by: Codewarp on June 28, 2005, 05:56:59 PM
Quote from: roticv on June 28, 2005, 02:04:04 PM

It is not right to take things for granted. It is better to first check whether cpuid exist by checking the EFLAG, then call CPUID. After that, set the flag for MMX/SSE/SSE2/SSE3 and then from then on just compare with the flag. We only need to figure out whether the processor supports certain extenstion once, then we can proceed to using the correct instruction set.

I tend to agree, however, anything that destroys performance on string lengths of a few bytes is dead on arrival.  In "real" applications, the bulk of the clock cycles spent in strlen( ) is usually on the short strings--not on 1000 bytes+ strings.  Therefore, no cpuid and no eflags is ok with me.  Would you be doing all this real-time conditional coding in the memchr( ) and in the memmove( ), etc...?  No, this has to be performed at application start-up time, not inside these low level routines.  That way, the decision overhead is reduced to a single memory test instruction.

Title: Re: szLen optimize...
Post by: Vortex on June 28, 2005, 06:36:53 PM
Tested on a P4 2.66 GHz

Test routines for correctness:
0 byte misalignment
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_fast    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticvSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999
1 byte misalignment
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_fast    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticvSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999
2 byte misalignment
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_fast    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticvSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999
3 byte misalignment
szLength     0    1    2    3    5    8   13   22   39   55   89  144  239  999
Ratch        0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_fast    0    1    2    3    5    8   13   22   39   55   89  144  239  999
roticvSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
lszLenSSE    0    1    2    3    5    8   13   22   39   55   89  144  239  999
Jens_mmx2    0    1    2    3    5    8   13   22   39   55   89  144  239  999

Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  239  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

0 byte misalignment
szLength    12    3    2    3   17   12   15   17   48   55   92  138  225  777
Ratch       -1    1   35    8    9    8   15   21   50   58   97  134  231  881
Jens_fast   12    0    0    0   16    6   36   41   68   74   92  130  200  930
roticvSSE    8   20   17   15   26   19   20   23   40   68   57  132  184  623
lszLenSSE   53   12   11   11   22   16   15   19   25   51   53  129  183  609
Jens_mmx2    8   33   23   23   38   33   33   39   62   39   72   65  132  416

1 byte misalignment
szLength    10   20   10   13    9   26   20   21   40   60  123  139  232  783
Ratch        1   12    6    7    7   16   12   21   42   69  125  185  356 1132
Jens_fast    3   10    1   -1    4   16   35   40   49  109  163  162  243 1087
roticvSSE   -2   12    7    7   13   53   40   45   59   91  100  149  206  664
lszLenSSE   11   22   12   12   11   26   15   19   25  113  120  158  214  729
Jens_mmx2   -3   12    3    9   17   70   73   70   85   78  102  122  140  445

2 byte misalignment
szLength     9   10   24   10   11   14   28   28   38   49   97  137  210  838
Ratch        1    1    5    8    7    5   39   20   54   58  124  185  296 1107
Jens_fast    3    0   10    2    3    5   79   43   57   60  115  168  278 1100
roticvSSE   -3    1   14   10   13   33   35   41   47   52   65  106  200  625
lszLenSSE   13   11   22   13   11   15   60   21   24   37   84  159  224  726
Jens_mmx2   -3    1   14   11   17   94   62   69  113   62  102  114  136  430

3 byte misalignment
szLength    11   16    9   20   16   11   21   38   66   82  122  137  209  788
Ratch        1    1    5    8    7    5   15   30   48   59  124  211  332 1073
Jens_fast    3   -1   -1   10    4    5   36   42   86   92  114  180  262 1080
roticvSSE   -2    1    5   11    9   59   37   70   79   85  103  145  201  631
lszLenSSE   15   14   11   22   13   15   15   30   39  114   73  184  241  746
Jens_mmx2   -2    1    3   20   18   48   85   72   99   60   95  118  164  424
Title: Re: szLen optimize...
Post by: Phil on June 28, 2005, 06:46:52 PM
Tested on 996 MHz P3 taken from timelen5.zip above. Only the timings are included here. I visually verified the correctness section and excluded it from the results.
Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  239  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

0 byte misalignment
szLength    19   19   19   19   23   25   30   37   55   79  116  175  263 1023
Ratch       18   25   32   39   29   25   35   58   78   92  105  144  245  872
Jens_fast   12   12   12   12   15   18   19   38   48   63   88  130  199  871
roticvSSE    4   18   18   18   18   20   20   23   28   48   59   78  104  341
lszLenSSE   16   16   16   16   16   19   19   22   29   48   63   86  116  402
Jens_mmx2    4   31   30   31   35   38   44   63   73   45   85   60  121  287

1 byte misalignment
szLength    24   26   25   28   28   30   34   41   72   87  121  177  274 1033
Ratch       18   25   32   39   29   25   35   58   86  110  124  191  326 1149
Jens_fast   12   12   12   12   15   18   19   38   55   77   99  158  242  978
roticvSSE    4   10   13   16   22   48   49   50   59   78   93  108  143  387
lszLenSSE   16   16   16   16   16   19   19   22   32   60   92  117  185  650
Jens_mmx2    4    9   12   15   21   62   67   70  104   71  117  133  152  332

2 byte misalignment
szLength    25   25   28   28   28   31   35   45   72   87  119  177  274 1033
Ratch       18   25   32   39   29   25   47   81   85   97  135  183  303 1141
Jens_fast   12   12   12   12   15   18   30   62   57   66  107  148  231 1008
roticvSSE    4   10   13   16   22   46   46   50   57   75   87  106  133  374
lszLenSSE   16   16   16   15   17   26   26   40   36   54   82  114  171  630
Jens_mmx2    4    9   12   15   21   59   66   74  103   72  116  139  149  324

3 byte misalignment
szLength    25   28   28   28   30   31   39   45   88   89  128  178  272 1037
Ratch       18   25   32   39   29   25   35   58   86  110  125  192  327 1153
Jens_fast   12   12   12   12   15   18   19   42   55   77   99  158  243  982
roticvSSE    4   10   13   16   22   32   35   37   43   63   83   95  123  366
lszLenSSE   16   16   16   16   16   19   19   22   32   60   93  118  184  651
Jens_mmx2    4    9   12   15   21   45   56   59   88   55  101  115  136  305
Title: Re: szLen optimize...
Post by: roticv on July 15, 2005, 12:29:18 PM
Sorry to wake this dead thread but I just found another interesting strlen routine by r22.

; ---------------------------------------------------------------------------
; strLenAlign16SSE -- string length via SSE2, posted by r22.
; In:   [esp+4] = address of zero-terminated string; as the name implies it
;       MUST be 16-byte aligned (movdqa faults on a misaligned address).
; Out:  eax = length in bytes.
; Per-byte zero test: (b-1) & ~b has bit 7 set iff b == 0.
; The loop is unrolled 2 x 16 bytes, and reads ahead of the terminator, so
; the buffer must extend past the end of the string (see thread discussion).
; ---------------------------------------------------------------------------
align 16
strLenAlign16SSE:
        mov ecx,[esp+4]            ; ecx = string base (assumed 16-aligned)
        movdqa xmm2,dqword[filled] ; xmm2 = 16 bytes of 0FFh (-1 per byte)
        lea eax,[ecx+16]           ; eax = read pointer for the unrolled half
        movdqa xmm0,dqword[ecx]    ; preload first 16 bytes
    .lp:
        movdqa xmm1,xmm0
        pxor xmm0,xmm2     ;xmm0 = ~b  (xor -1)
        paddb xmm1,xmm2    ;xmm1 = b-1 (add -1 per byte)
        movdqa xmm3,[eax]  ;preload next block (used for unroll)
        pand xmm0,xmm1     ;(b-1) & ~b: bit 7 set in every zero byte
        pmovmskb edx,xmm0  ;gather the 16 per-byte sign bits into dx
        add eax,16
        test dx,-1 ;any zero byte in this block? (mask 1111 1111 1111 1111b)
        jnz .unrol
        movdqa xmm1,xmm3   ;second, unrolled half: same test on next block
        pxor xmm3,xmm2     ;xor -1
        paddb xmm1,xmm2    ;sub 1
        pand xmm3,xmm1
        movdqa xmm0,[eax]  ;back to first roll
        pmovmskb edx,xmm3
        add eax,16
        test dx,-1 ;1111 1111 1111 1111b
        jz .lp
     .unrol:
        add ecx,32         ; on both exit paths eax is exactly 32 bytes past
        sub eax,ecx        ; the matching block, so eax = block offset
        xor ecx,ecx
        sub ecx,edx        ; ecx = -edx
        and edx,ecx        ; edx &= -edx: isolate lowest set bit
        CVTSI2SD xmm0,edx  ; double's exponent now encodes the bit index
        PEXTRW edx,xmm0,3  ; extract the high word (exponent field)
        shr dx,4           ; dx = biased exponent = 3FFh + bit index
        add dx,0fc01h      ; remove the 3FFh bias: dx = byte index in block
        ;          bsf edx,edx replaced by crazy SSE version
        add eax,edx        ; length = block offset + byte index
        ret 4
align 16
filled dq 0FFFFFFFFFFFFFFFFh,0FFFFFFFFFFFFFFFFh
Title: Re: szLen optimize...
Post by: lingo on August 04, 2005, 03:45:32 PM
Victor,
"Sorry to wake this dead thread but I just found another interesting strlen routine by r22."

I'm wondering what is so interesting...  :bg
A lot of code in the main loop and slow exchange of the bsf...  :'(
It is not a big deal to create something faster with 128-bit  instructions

Here is the proof tested on my P4 3.6 GHz:

Proc/Byte  0   1   2   3   5   8  13  22  39  55  89 144 239  999
0 byte misalignment
szLength  16  15  14  15  20  22  24  32  56  79 144 245 250  861
Ratch     12  17  20  22  19  21  25  35  59  80 135 180 268 1046
Jens_fast 18  20  20  19  21  26  35  73  84  99 127 168 243 1097
roticvSSE  9  32  31  32  31  36  36  42  55  68 111 235 325  919
lszLenSSE 30  28  28  30  30  32  32  37  51  65 101 193 275  978
Jens_mmx2  9  48  45  45  50  51  54  62  79  64  98  90 152  460
slenLingo 14  14  15  15  15  15  15  22  27  36  48  72  96  405

1 byte misalignment
szLength  20  20  18  25  23  26  29  35  51  63 210 178 262  869
Ratch     13  15  18  20  19  19  25  37  59  91 171 257 420 1666
Jens_fast 19  19  19  18  20  24  29  72  85 131 155 207 322 1450
roticvSSE  7  13  16  19  24  57  57  62  79  94 122 169 319  967
lszLenSSE 29  29  29  29  28  32  32  42  74 113 116 230 329 1095
Jens_mmx2  9  13  17  21  30  76  82  88 113  97 134 147 179  564
slenLingo 15  15  15  15  15  15  15  22  27  37  49  71 101  407

2 byte misalignment
szLength  21  18  22  21  21  27  26  37  51  63 131 176 255  965
Ratch     13  17  19  22  21  22  36  34  59  79 166 246 422 1491
Jens_fast 19  23  18  20  22  23  88 101 119 101 156 211 330 1355
roticvSSE  9  13  16  18  24  56  55  63  76  92 169 260 299  939
lszLenSSE 29  29  29  28  28  33  45  38  56  69 115 221 330 1195
Jens_mmx2  9  14  17  21  32  77  77  88 118  89 120 151 183  502
slenLingo 15  15  15  15  14  15  15  22  28  37  49  71 102  407

3 byte misalignment
szLength  20  24  23  23  26  25  32  38  92 105 135 176 253  880
Ratch     13  18  20  22  21  20  24  34  72  95 169 318 410 1434
Jens_fast 20  21  19  20  22  24  27  74 119 135 159 237 331 1474
roticvSSE  8  14  16  18  24  51  56  60  72  86 114 160 296 1090
lszLenSSE 29  30  28  29  28  32  31  38  59  76 119 247 343 1124
Jens_mmx2  8  11  17  21  30  66  81 141 192  85 119 136 170  484
slenLingo 14  14  14  15  15  14  15  22  28  36  48  72 103  406

Press enter to exit...


Regards,
Lingo

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Codewarp on August 05, 2005, 12:35:58 AM
Lingo,

Very cool solution to the strlen( ) implementation. :clap: Your method illustrates a different strategy for search alignment than any of the other algorithms seen in this thread.  It avoids alignment complexity by scanning the first 16 bytes without regard for alignment, and only aligning for subsequent reads if necessary.   I do have a few comments about it:

(1)  The Prefetches appear to be unnecessary and removing them reduces clock count by a few points
       (on Athlon64).  Prefetches are good for multiple data streams, and/or enhancing random access.  However,
       this is a single sequential stream, and normal cpu prefetch for that is as good as it gets.

(2)  Unless I am mistaken, this code requires SSE2, because of the movdqa instruction, making it less
       widely applicable.

(3)  Strings < 16 bytes can be sped up a few cycles, with simplification and avoiding jmps.

(4)  An additional 12% cycle reduction resulted when I doubled-up the loop to process 32-bytes at a time,
       but I left it out because of its potential 31-byte overreach, past the end of the string, and off the end of the last page.

(5)  Your method and mine below can both access memory past the end of the string, and off the end of the last page,
       by as much as 15 bytes (causing a possible page-fault).  But it is a way too cool :8) algorithm to leave on the shelf...

My adaptation of your method is as follows:


            ; Codewarp's adaptation of Lingo's method (SSE2).
            ; In:  [esp+4] = address of zero-terminated string, any alignment
            ; Out: eax = length in bytes
            ; NOTE (acknowledged in the thread): the first unaligned 16-byte
            ; read can overreach the end of the string by up to 15 bytes and
            ; fault at the end of the last page.
            mov             eax, [esp + 4]  ; eax = base address to start search
            movdqu          xmm1, [eax]     ; load first 16 bytes, aligned or not
            pxor            xmm0, xmm0      ; xmm0 = 0's
            and             eax, -16        ; align eax down to base paragraph
            pcmpeqb         xmm1, xmm0      ; check 16 bytes for zeros
            pmovmskb        edx, xmm1       ; edx = one bit per zero byte found
            test            edx, edx        ; test edx for zero
            jz              again           ; branch ahead if no zero bytes found
            bsf             eax, edx        ; return the byte position as the length
            ret

            align           8
again:      movdqa          xmm1, [eax + 16] ; aligned read; may re-scan bytes
                                             ; already covered by the first read
            add             eax, 16         ; eax = address of 16-byte compare
            pcmpeqb         xmm1, xmm0      ; search the 16-bytes
            pmovmskb        edx, xmm1
            test            edx, edx
            jz              again
            sub             eax, [esp + 4]    ; subtract original base address
            bsf             edx, edx          ; get position of 1st zero byte
            add             eax, edx          ; add to base address
            ret

Title: Re: szLen optimize...
Post by: lingo on August 06, 2005, 04:09:54 AM
Codewarp, :bg

"2)  Unless I am mistaken, this code requires SSE2, because of the movdqa instruction, making it less
       widely applicable."


I have a new version with movups and movaps instead of movdqu and movdqa
Pls, reload the zip file and test it again
    
"(3)  Strings < 16 bytes can be sped up a few cycles, with simplification and avoiding jmps."
I agree

"5)  Your method and mine below can both access memory past the end of the string, and off the end of the last page,
   by as much as 15 bytes (causing a possible page-fault).  But it is a way too cool  algorithm to leave on the shelf..."


I disagree
What is  "the end of the string"?
We search the end of the string, hence the phrase "the end of the string" is undefined ?
Before the usage of the StrLen we need a buffer with ENOUGH memory
for our string, hence if someone uses my algo he needs to allocate ENOUGH + 32 bytes of memory

Example: The constant  _MAX_PATH ->(Maximum length of full path) is 260 bytes long
               If we search length of the string  of the current path with file name
              we need to allocate 260+32 bytes for buffer
   or ENOUGH+32 memory will be 292 bytes

"; align eax down to base paragraph"
Thanks for comments  :bg

Regards,
Lingo


Title: Re: szLen optimize...
Post by: Codewarp on August 06, 2005, 08:51:52 AM
Lingo,

The bare SSE support is much appreciated. :thumbu

Sadly, the overreach is real.  Suppose that you have a valid 2 byte string (3rd byte is zero), and these are the last three bytes at end of a 4k page, the last page in the block.  This algorithm will initially read 16 bytes from the misaligned address, trespassing 13 bytes into the non-existant next page.  The best way to handle this problem is to use only 16/aligned reads, then mask out the initial unwanted comparison bytes.  But that is hard to do as fast as the method you have here.
Title: Re: szLen optimize...
Post by: Ratch on August 06, 2005, 02:18:33 PM
Codewarp,
     Why not just put a ALIGN 16 after the buffer?  Then you would be guaranteed to be within a 16 byte boundary.  Ratch
Title: Re: szLen optimize...
Post by: Codewarp on August 06, 2005, 06:14:44 PM
Putting an Align 16 anywhere is irrelevant.  You are handed a pointer to 3 bytes, you read 16 bytes, you don't get to choose its alignment.  This will lead to overrun.  There are only three choices:  live with it, backup to read 16 bytes from the beginning, or use GPRs instead of SSE.
Title: Re: szLen optimize...
Post by: lingo on August 06, 2005, 11:58:32 PM
Codewarp, :bg

"The best way to handle this problem is to use only 16/aligned reads,
then mask out the initial unwanted comparison bytes.
But that is hard to do as fast as the method you have here.


For me it is not a big deal (Pls, reload the zip file and see Lingo2)
but it is slower because we have additional code  :(

Regards,
Lingo
Title: Re: szLen optimize...
Post by: Ratch on August 07, 2005, 01:40:56 AM
Codewarp,

QuoteYou are handed a pointer to 3 bytes, you read 16 bytes, you don't get to choose its alignment.

     Now you're setting conditions that I did not know about.

Quote
...backup to read 16 bytes from the beginning

     Do you mean back it up to a 16 byte boundary?  It becomes even more complicated if there is a '0' lurking within the backed up data area.  Ratch

Title: Re: szLen optimize...
Post by: Codewarp on August 07, 2005, 06:37:54 AM
I am not setting any conditions--I am merely reporting a perfectly normal case where the algorithm in question fails to meet desired objectives, i.e. where it reads past the end of its data, and off the end of the 4k page.  By doing so, I have shown the assertion to be irrefutable. 

Ratch, are you really suggesting that strlen( ) work only for 16-byte aligned strings? :eek Strlen( ) is a routine defined by the c-runtime library--it has no preconditions about alignment.  Its users expect it to return a correct string length from any byte address, as long as its terminating zero byte follows within valid memory.  We, as programmers, are bound to implement that, even if we don't like it.  Now, if you want to have two versions, strlen( ) and strlen16( ), where everyone knows the limitations, that's fine.  Or you can switch internally to a different method, upon detecting misaligned strings.  However expecting strings to all be 16-byte aligned is unreasonable.

By backing up to the previous 16-byte boundary, comparing, then masking off the unwanted part of the comparison, you can completely avoid unaligned reads AND avoid the overreach.  But that is more work than lingo's algorithm, and consequently slower, but perhaps more correct.
Title: Re: szLen optimize...
Post by: Ratch on August 09, 2005, 03:44:20 AM
Codewarp,
Quote
Ratch, are you really suggesting that strlen( ) work only for 16-byte aligned strings?

     Nope, I am saying that backing up and reading might pose difficulities with respect to time and logic.  I think it's better to go forward to the next 16-byte boundary instead.  See below.  Ratch

http://www.masmforum.com/simple/index.php?topic=2442.0
Title: Re: szLen optimize...
Post by: Codewarp on August 12, 2005, 06:09:32 AM
Ratch,
Your prediction of "logic and time" consequences doesn't hold up.  Here is a fully correct version that cannot ever overreach a string and its 4k page.  It required exactly three additional GPR instructions, plus it reads only aligned blocks, which can more than pay for the 3 instructions, for a net zero cost.  This is a practical sse version of Lingo's method for any application, and one that outperforms all GPR implementations:


;-----------------------------------------------------------------------------
; lensse -- strlen for any byte alignment, SSE, no page overreach.
; In:   [esp+4] = address of zero-terminated string
; Out:  eax = length in bytes
; All reads are 16-byte aligned: the first read is pulled DOWN to the
; enclosing paragraph and the pre-read bytes are shifted out of the result
; mask, so the routine never reads past the terminator's paragraph.
;-----------------------------------------------------------------------------
lensse:  mov            eax, [esp + 4]  ; eax = base address to start search
         mov            ecx, eax
         and            eax, not 15     ; pull down to aligned address
         and            ecx, 15         ; ecx = pre-read bytes to skip
         movaps         xmm1, [eax]     ; load first 16 bytes, aligned
         pxor           xmm0, xmm0      ; xmm0 = 0's
         pcmpeqb        xmm1, xmm0      ; check 16 bytes for zeros
         pmovmskb       edx, xmm1       ; edx holds a 1-bit for each zero byte (in the low 16 bits)
         shr            edx, cl         ; discard the pre-read
         test           edx, edx        ; test edx for zero
         jz             again           ; branch ahead if no zero bytes found
         bsf            eax, edx        ; return the bit position as the length
         ret

         align          8
again:   movaps         xmm1, [eax + 16] ; next aligned 16-byte block
         add            eax, 16         ; eax = address of 16-byte compare
         pcmpeqb        xmm1, xmm0      ; search the 16-bytes
         pmovmskb       edx, xmm1
         test           edx, edx
         jz             again
         bsf            edx, edx        ; get position of 1st zero byte
         sub            eax, [esp + 4]  ; subtract original base address
         add            eax, edx        ; add offset of zero byte within block
         ret

Title: Re: szLen optimize...
Post by: Biterider on August 12, 2005, 09:34:19 AM
Hi Codewarp
I was working on a similar approach to yours, but you came first. Since I had some problems compiling the movdqa instruction, I reduced the routine to xmm instructions and shuffled some instructions a little to obtain better performance. As expected, the performance is similar to Donkey's routine, but the real advantage comes out with misaligned strings.

StrLength proc pString:dword
; Biterider's MMX variant of the aligned-read strlen method.
; In:   pString ([esp+4], stdcall) = address of zero-terminated string
; Out:  eax = length in bytes
; Works on 8-byte blocks with MMX registers (movq/mm0/mm1) plus pmovmskb,
; avoiding the SSE2 movdqa requirement; emms is issued before every return.
.xmm
    mov eax, [esp + 4]    ; eax = base address to start search
    mov ecx, eax
    and eax, not 7        ; pull down to aligned address
    and ecx, 7            ; ecx = pre-read bytes to skip
    movq mm1, [eax]       ; load first 8 bytes, aligned
    pxor mm0, mm0         ; mm0 = 0's
    pcmpeqb mm1, mm0      ; check 8 bytes for zeros
    pmovmskb edx, mm1     ; edx holds a 1-bit for each zero byte (in the low 8 bits)
    shr edx, cl           ; discard the pre-read
    test edx, edx         ; test edx for zero
    jz more               ; branch ahead if no zero bytes found
    emms                  ; leave the FP/MMX state clean for the caller
    bsf eax, edx          ; return the bit position as the length
    ret 4
more:
    add eax, 8            ; step past the block just checked
again:
    movq mm1, [eax]       ; next aligned 8-byte block
    pcmpeqb mm1, mm0      ; search the 8-bytes
    add eax, 8            ; eax = address of 8-byte compare
    pmovmskb edx, mm1
    test edx, edx
    jz again
    bsf edx, edx          ; get position of 1st zero byte
    emms                  ; leave the FP/MMX state clean for the caller
    sub eax, [esp + 4]    ; subtract original base address
    lea eax, [eax + edx - 8]   ; undo the final +8 and add the byte offset
    ret 4
StrLength endp


Regards,

Biterider
Title: Re: szLen optimize...
Post by: Codewarp on August 12, 2005, 07:48:53 PM
Thank you, Biterider, your mmx adaptation will now replace the mmx version in my codebase (with some changes ::)).  I was particularly pleased with how effortlessly I could dispose of the pre-read.  You are quite correct about the misalignment performance, important now since the greater majority of strings we call this on are not 16-byte aligned.
Title: Re: szLen optimize...
Post by: Ratch on August 13, 2005, 03:49:33 AM
Codewarp,

Quote
This is a practical sse version of Lingo's method for any application, and one that outperforms all GPR implementations:

     It does not perform anything on my machine (AMD Athlon running Windows ME).  Evidently my CPU chokes on 128-bit registers referenced by MOVAPS XMM1,[EAX].  Not all machines have the latest MMX hardware and OS support.  That's one advantage of writing GPR code; it works on the 386 and up.  Anyway, congratulations on finding a solution to the alignment problem.  Ratch
Title: Re: szLen optimize...
Post by: Brett Kuntz on August 13, 2005, 04:01:06 AM
Hello, I was bored so I disasm'd Borlands version and tweaked it a bit. Could someone bench this against some of the other fast ones posted up to see where it stands? I ran it through Olly with a whole lot of different strings and it always returned the proper length.


   strlen proto :dword
.code
;##########################################################################
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

strlen proc xstr:dword
; size_t strlen(const char *xstr) -- stdcall GPR scanner (Borland-derived).
; In:   xstr ([esp+4]) = address of zero-terminated string
; Out:  eax = length in bytes
; Dword-at-a-time scan using the classic zero-in-dword test:
;   (v - 01010101h) & ~v & 80808080h  is nonzero iff some byte of v is 0.
; NOTE: reads up to 3 bytes past the terminator (dword granularity).

   mov eax, dword ptr [esp+4]
d1: mov edx, dword ptr [eax]
   mov ecx, edx                ; keep a copy of v for the ~v test
   add eax, 4
   sub edx, 01010101h          ; v - 01010101h      (was decimal 16843009)
   and edx, 80808080h          ; candidate zeros    (was decimal 2155905152)
   jz d1                       ; no candidate zero byte -> next dword
   not ecx
   and edx, ecx                ; & ~v: reject false hits from bytes >= 80h
   jz d1
   test dl, dl                 ; locate the zero byte within the dword;
   jnz d2                      ; the d2..d5 chain decrements eax once per
   test dh, dh                 ; byte position past the terminator
   jnz d3
   test edx, 00FF0000h         ; zero in third byte? (was decimal 16711680)
   jnz d4
   jmp d5
d2: dec eax
d3: dec eax
d4: dec eax
d5: dec eax
   sub eax, dword ptr [esp+4]  ; length = end pointer - base pointer
   ret 4                       ; proper stdcall return; replaces the original
                               ; "add esp,8 / jmp dword ptr [esp-8]", which
                               ; left the return address below esp where an
                               ; interrupt could overwrite it, and which
                               ; breaks CALL/RET return prediction

strlen endp

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
;##########################################################################
Title: Re: szLen optimize...
Post by: Jimg on August 13, 2005, 12:51:17 PM
That's the slowest one yet!


Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  239  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

0 byte misalignment
szLength     8    8    9    9   12   15   17   24   48   63   89  131  202  779
Ratch        9   11   12   15   14   14   20   31   64   78  100  143  231  855
Jens_fast   20   20   19   20   21   26   29   36   58   69   99  145  219  923
roticvSSE    4   28   29   28   28   32   32   35   39   53   79  112  160  576
lszLenSSE   25   25   25   25   25   28   28   32   38   47   84  116  166  591
Jens_mmx2    6   31   31   32   37   43   48   55   78   45   93   60  124  288
BiteRider   27   26   26   26   26   28   29   32   37   46   82  112  161  569
borland      9   10   11    9   14   18   22   29   58   74  109  162  249 1121

1 byte misalignment
szLength    13   13   14   16   15   19   20   26   56   67   91  134  206  781
Ratch        8   11   12   15   18   16   23   32   69   86  110  156  254  959
Jens_fast   20   20   21   20   24   28   32   41   64   75  105  154  243 1001
roticvSSE    3    7   10   12   18   53   48   51   60   73   94  125  176  589
lszLenSSE   32   28   28   28   31   30   30   37   45   55   92  126  178  623
Jens_mmx2    6   10   12   16   26   73   59   65   86   61   97  123  142  296
BiteRider   25   26   26   26   27   29   29   31   48   57   84  113  169  576
borland      8    9   11   10   15   20   25   36   71   90  135  205  323 1198

2 byte misalignment
szLength    14   14   16   16   15   20   20   31   41   51   92  136  208  787
Ratch        8   11   12   15   18   17   24   32   69   86  110  156  254  950
Jens_fast   20   20   20   19   24   28   32   40   62   75  104  153  237  997
roticvSSE    4    7    8   12   18   45   46   51   59   67   93  125  172  590
lszLenSSE   28   28   28   30   28   30   31   35   41   54   94  126  178  621
Jens_mmx2    7    8   12   17   26   59   54   58   84   59   96  120  141  295
BiteRider   26   26   26   26   24   29   29   33   48   57   84  112  164  575
borland      9   10   11    9   15   17   25   36   72   90  135  206  325 1203

3 byte misalignment
szLength    15   16   16   15   19   20   25   30   41   51   94  137  209  784
Ratch        9   11   12   15   18   16   23   32   70   85  108  156  253  947
Jens_fast   20   20   20   20   24   28   32   40   65   76  104  153  237  996
roticvSSE    6    7   10   35   18   43   46   49   53   66   89  119  172  583
lszLenSSE   28   28   29   28   28   30   31   32   41   57   91  124  176  624
Jens_mmx2    7    9   12   16   26   54   62   70   86   63  115  125  144  290
BiteRider   26   26   26   26   29   29   31   33   47   56   82  111  164  577
borland      8   10   12    9   15   19   25   36   69   91  134  207  325 1197


These are only the one that give the correct answers, the one's with movedqa don't work on my machine.
Title: Re: szLen optimize...
Post by: Ratch on August 13, 2005, 12:58:23 PM
 kunt0r,
Quote
I ran it through Olly with a whole lot of different strings and it always returned the proper length.

     It should.  It uses the same algo as most of the other GPR routines.  For instance, its decimal SUB constant 16843009 converts to 01010101H, and its AND constant 2155905152 converts to 080808080H.  It is sensitive to string alignment; 7575 ticks aligned, vs. 8856 ticks with 3 byte misalignment for a 10003 byte long string.  It has no provision to prevent over reading its memory.  Its code to search the last word for a zero byte is clunky, but that does not affect its speed too much, because it only executes once.  It uses two instructions instead of RET DWORD, and it leaves its return address vulnerable on the stack for a possible overwrite.  Its speed is comparable to my GPR routine, except my routine does not slow up for string misalignment on long strings.  Not that it matters much, but my code is also shorter.   Ratch
Title: Re: szLen optimize...
Post by: Brett Kuntz on August 13, 2005, 02:16:13 PM
ah I didn't know about "ret 4" doing the same thing as those two lines of code, that sped it up slightly.
Title: Re: szLen optimize...
Post by: Ratch on August 13, 2005, 03:46:00 PM
 kunt0r,

Quote
ah I didn't know about "ret 4" doing the same thing as those two lines of code, that sped it up slightly.

     Can't be much of a difference because it only gets executed once.  The search loop is where the subprogram spends most of its time.  As I mentioned before, it leaves its return address vulnerable to a possible overwrite.  One would expect something better from Borland. Ratch

Jimg,

     Did you use my latest STRLEN?  What was the switch set for, the 8-bit or 7-bit search?  Ratch

http://www.masmforum.com/simple/index.php?topic=2442.0
Title: Re: szLen optimize...
Post by: Brett Kuntz on August 13, 2005, 03:51:45 PM
(edit-Ratch, Borland didn't use that kind of stack stuff, I did it myself to shave a few cycles. Borlands actually has more code which checks for misaligned data with a test al, 3 line which leads to more code ect..I basically used Borlands as a basis and tried to make it faster then they did)

I updated my borland one and got these results:


Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  239  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
borland      9   11   10   10   11   12   15   21   49   61  87  129  200 774
szLength     8    8    9    9   12   15   17   24   48   63   89  131  202  779
Ratch        9   11   12   15   14   14   20   31   64   78  100  143  231  855
Jens_fast   20   20   19   20   21   26   29   36   58   69   99  145  219  923
roticvSSE    4   28   29   28   28   32   32   35   39   53   79  112  160  576
lszLenSSE   25   25   25   25   25   28   28   32   38   47   84  116  166  591
Jens_mmx2    6   31   31   32   37   43   48   55   78   45   93   60  124  288
BiteRider   27   26   26   26   26   28   29   32   37   46   82  112  161  569



    ; Minimal timing harness: measures one strlen implementation over a
    ; 999-byte string and reports the average cycle count in a message box.
    .686p
    .model flat, stdcall
    option casemap :none

    include \masm32\include\windows.inc
    include \masm32\include\kernel32.inc
    includelib \masm32\lib\kernel32.lib
    include \masm32\include\user32.inc
    includelib \masm32\lib\user32.lib

    include \masm32\kinc\strlen.inc     ; NOTE(review): nonstandard local path;
    include \masm32\kinc\timer.inc      ; presumably strlen + counter macros --
                                        ; confirm these exist before building

.data
    align 4
    teststr db 999 dup("a"), 0          ; 999-byte test string + terminator
    msgstr db "Cycles: %d", 0           ; wsprintf format for the result
.data?
    time dd ?
    buff db 64 dup(?)                   ; wsprintf output buffer
.code
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

    ; Average over 1,000,000 calls at realtime priority to keep scheduler
    ; noise out of the measurement; eax holds the cycle count afterwards.
    counter_begin 1000000, REALTIME_PRIORITY_CLASS
    invoke strlen, addr teststr
    counter_end

    invoke wsprintf, addr buff, addr msgstr, eax
    invoke MessageBox, 0, addr buff, 0, 0
    invoke ExitProcess, 0

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start



    strlen proto :dword
.code
;##########################################################################
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

strlen proc xstr:dword
; size_t strlen(const char *xstr) -- stdcall GPR scanner (Borland-derived),
; updated to use "ret 4" per the thread discussion.
; In:   xstr ([esp+4]) = address of zero-terminated string
; Out:  eax = length in bytes
; Dword-at-a-time scan: (v - 01010101h) & ~v & 80808080h is nonzero iff
; some byte of v is zero.  The decimal constants are 16843009 = 01010101h,
; 2155905152 = 80808080h, 16711680 = 00FF0000h.
; NOTE: reads up to 3 bytes past the terminator (dword granularity).

    mov eax, dword ptr [esp+4]
d1: mov edx, dword ptr [eax]    ; fetch next dword
    mov ecx, edx                ; keep a copy of v for the ~v test
    add eax, 4
    sub edx, 16843009           ; v - 01010101h
    and edx, 2155905152         ; & 80808080h: candidate zero bytes
    jz d1                       ; none -> next dword
    not ecx
    and edx, ecx                ; & ~v: reject false hits from bytes >= 80h
    jz d1
    test dl, dl                 ; locate the zero byte within the dword;
    jnz d2                      ; the d2..d5 chain decrements eax once per
    test dh, dh                 ; byte position past the terminator
    jnz d3
    test edx, 16711680          ; zero in the third byte? (00FF0000h)
    jnz d4
    jmp d5
d2: dec eax
d3: dec eax
d4: dec eax
d5: dec eax
    sub eax, dword ptr [esp+4]  ; length = end pointer - base pointer
    ret 4

strlen endp

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
;##########################################################################
Title: Re: szLen optimize...
Post by: hutch-- on August 16, 2005, 02:55:23 PM
If anyone has the time to test this out, it would be appreciated. I needed a DWORD type strlen algo that had auto aligning code at its beginning, so I added a front end to Agner Fog's strlen algo and it appears to be working OK at the moment. Some rough benchmarking with a mixed set of samples from a few bytes to 16k shows this hybrid to be about 2.5 times faster than a classic byte scanner.


; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

align 4

slen2 proc item:DWORD
; Hutch's hybrid strlen: a byte-scanning front end walks forward to a dword
; boundary, then Agner Fog's dword scanner handles the aligned remainder.
; In:   item = address of zero-terminated string ([esp+12] after two pushes)
; Out:  eax = length in bytes
; Preserves edi/esi (pushed/popped); the aligned loop reads up to 3 bytes
; past the terminator.

    push edi
    push esi

    mov eax, [esp+12]
    mov ecx, eax                ; copy EAX to ECX
    add ecx, 3                  ; align up by 4
    and ecx, -4
    sub ecx, eax                ; calculate any misalignment in ecx (0..3)
    mov esi, ecx                ; store ECX in ESI for the final fix-up
    jz proceed                  ; already dword aligned

    sub eax, 1
  @@:
    add eax, 1
    cmp BYTE PTR [eax], 0       ; scan for terminator for
    je quit                     ; up to the 1st 3 bytes
    sub ecx, 1
    jns @B                      ; note: loop runs ecx+1 times, so the first
    jmp proceed                 ; aligned byte is checked twice -- harmless

  quit:
    sub eax, [esp+12]           ; calculate length if terminator
    jmp outa_here               ; is found in 1st 3 bytes

  ; ----------------

  proceed:                      ; proceed with the rest (Agner Fog scanner)
    lea edx, [eax+3]            ; pointer+3 used in the end
  align 4
  @@:
    mov edi, [eax]              ; read next 4 bytes
    add eax, 4                  ; increment pointer
    lea ecx, [edi-01010101h]    ; subtract 1 from each byte
    not edi                     ; invert all bytes
    and ecx, edi                ; (v - 01010101h) & ~v ...
    and ecx, 80808080h          ; ... & 80808080h: nonzero iff a byte was 0
    jz @B                       ; no zero bytes, continue loop
    test ecx, 00008080h         ; test first two bytes
    jnz @F
    shr ecx, 16                 ; not in the first 2 bytes
    add eax, 2
  @@:
    shl cl, 1                   ; shift the zero-byte flag into CF so that
    sbb eax, edx                ; sbb computes the length without a branch
    add eax, esi                ; add back the misalignment scanned up front

  outa_here:
    pop esi
    pop edi

    ret 4

slen2 endp

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
Title: Re: szLen optimize...
Post by: lingo on August 16, 2005, 05:44:38 PM
Codewarp wrote: :P

" Re: Improved STRLEN
« Reply #1 on: August 08, 2005, 10:40:48 pm »

Ratch,
Isn't this a rehash of what we already covered in the szLen thread, ad nauseam?  Also back in that thread, I suggested a super optimization that you are almost doing in your routine above.

The idea is this: search 7-bit ascii, since it's faster than 8-bit.  But when you find a "zero", check it for bit7=1: 1=>resume the search for 8-bit ascii, 0=>you are done.  In other words use 7-bit search as far as you can take it, then ride the rest of the way using 8-bit search as needed.  In most text, the 8-bit part will never be needed, but when required, it covers 8-bit as well--the best of both worlds..."


I agree with bolded text and you can test the result (see timelen1.asm)
I improved my algos LingoSSE2 and  LingoMMX too.. (see timelen.asm)
Here is the results on my P4 3.6 GHz Prescott:



A. Timelen.asm -> test

Test routines for correctness:
0 byte misalignment
Borland   0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength  0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch     0 1 2 3 5 8 13 22 39 55 89 144 239 999
Hutch     0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Biterider 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoMMX  0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoSSE2 0 1 2 3 5 8 13 22 39 55 89 144 239 999

1 byte misalignment
Borland   0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength  0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch     0 1 2 3 5 8 13 22 39 55 89 144 239 999
Hutch     0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Biterider 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoMMX  0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoSSE2 0 1 2 3 5 8 13 22 39 55 89 144 239 999

2 byte misalignment
Borland   0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength  0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch     0 1 2 3 5 8 13 22 39 55 89 144 239 999
Hutch     0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Biterider 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoMMX  0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoSSE2 0 1 2 3 5 8 13 22 39 55 89 144 239 999

3 byte misalignment
Borland   0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength  0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch     0 1 2 3 5 8 13 22 39 55 89 144 239 999
Hutch     0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Biterider 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoMMX  0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoSSE2 0 1 2 3 5 8 13 22 39 55 89 144 239 999


Proc/Byte  0  1  2 3  5  8  13 22  39  55  89 144 239  999
===========================================================

0 byte misalignment
Borland   19 20 20 20 22 25 32 42  70 122 167 232 344 1988
szLength  15 15 16 16 19 22 24 31  52  72 128 174 248  866
Ratch     12 15 18 20 19 21 24 35  59  82 140 182 284  993
Hutch     21 21 23 23 24 30 33 40  56  81 149 187 279  989
Jens_fast 19 19 19 19 20 23 43 80  88 101 126 168 252  981
lszLenSSE 28 29 28 28 28 32 32 37  52  64 101 200 289  925
roticvSSE  8 13 19 18 25 36 36 41  59  73 109 159 275  924
Biterider 14 15 15 15 14 25 23 37  49  61 102 219 272  922
Jens_mmx2  8 44 45 45 50 53 57 67  78  63  99  90 155  466
LingoMMX  14 14 14 14 14 24 24 36  47  66 100 117 143  402
LingoSSE2 14 14 14 14 14 14 14 25  39  46  69  98 111  301

1 byte misalignment
Borland   19 20 20 20 22 25 31 42  69 165 189 281 437 2535
szLength  19 19 19 23 23 27 29 35  50  63 131 176 256  882
Ratch     12 15 18 20 19 20 25 34  59  92 170 247 397 1465
Hutch     20 21 24 23 38 40 42 45  60  83 154 187 298 1002
Jens_fast 20 20 19 20 23 32 27 72  85 131 162 215 321 1362
lszLenSSE 28 28 28 28 28 32 32 36  51  76 118 226 331 1084
roticvSSE 10 14 16 18 24 58 60 64  83  97 122 168 308  978
Biterider 14 14 14 14 14 24 24 37  55  72  97 165 283  932
Jens_mmx2  8 12 17 21 31 77 84 95 119 100 131 153 183  490
LingoMMX  14 14 14 14 14 24 24 36  51  67 101 121 145  403
LingoSSE2 14 14 14 14 14 14 14 25  39  49  69  98 117  304

2 byte misalignment
Borland   19 20 20 20 22 26 43 42  69 121 192 284 455 2418
szLength  19 18 21 21 21 24 27 37  51  63 131 176 253  872
Ratch     12 15 19 21 20 20 37 36  59  81 166 247 401 1472
Hutch     19 21 20 22 33 36 41 51  60  80 159 212 321 1006
Jens_fast 19 19 19 19 20 27 62 77  85  99 156 209 328 1366
lszLenSSE 29 29 28 28 28 32 44 37  52  64 124 253 343 1091
roticvSSE  8 12 16 18 25 55 54 64  76  90 117 164 297  956
Biterider 14 14 15 14 15 24 24 44  56  73  97 160 284  936
Jens_mmx2  8 14 17 20 33 76 77 92 106  89 123 144 177  497
LingoMMX  13 15 15 15 14 24 24 38  56  69  97 118 145  403
LingoSSE2 14 14 14 14 14 14 14 25  39  50  74 101 117  301

3 byte misalignment
Borland   20 23 23 23 23 26 31 45  83 150 196 328 443 2381
szLength  20 21 21 22 24 24 31 37  92 104 137 176 254  872
Ratch     13 15 17 20 20 20 24 35  72  95 167 282 405 1438
Hutch     19 19 22 28 32 33 40 65  70  94 158 264 367  998
Jens_fast 19 19 19 19 20 27 59 86 117 128 150 198 301 1377
lszLenSSE 30 28 29 28 28 32 34 40  59  76 111 245 354 1128
roticvSSE  8 12 16 19 24 51 55 60  72  89 115 161 293  962
Biterider 14 14 15 15 24 24 37 44  55  73  97 156 283  935
Jens_mmx2  8 12 17 20 30 66 74 82 101  82 118 142 180  490
LingoMMX  13 14 14 14 14 24 25 41  57  71  97 117 146  402
LingoSSE2 14 14 14 14 14 14 14 25  39  46  69  98 117  301

Press enter to exit...


B. Timelen1.asm -> test

Test routines for correctness:
0 byte misalignment
RatchN  0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999

1 byte misalignment
RatchN  0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999

2 byte misalignment
RatchN  0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999

3 byte misalignment
RatchN  0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999


Proc/Byte 0  1  2  3  5  8 13 22 39 55  89 144 239  999

========================================================

0 byte misalignment
RatchN   12 18 22 23 21 20 26 36 65 80 135 178 258 1392
Lingo32  10 14 17 19 18 17 20 28 39 55 111 141 198 1047

1 byte misalignment
RatchN    9 15 21 32 38 38 44 57 75 101 146 192 270 1513
Lingo32   9 14 18 22 27 27 31 32 46  53 113 146 203 1054

2 byte misalignment
RatchN    9 15 25 29 32 32 37 48 70 93 144 189 268 1493
Lingo32  10 15 18 20 26 26 27 33 46 54 113 147 206 1052

3 byte misalignment
RatchN    9 22 22 23 23 31 30 42 64 87 139 184 257 1411
Lingo32   9 15 17 19 17 28 28 34 47 55 112 148 200 1053

Press enter to exit...



Regards,
Lingo



[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Ratch on August 17, 2005, 01:36:52 PM
Lingo,
     Ok, let's talk about string length subroutines.  I am going to limit my remarks to GPR routines only.  Well written MMX/SSE code is going to beat the pants off any GPR implementation.  If MMX/SSE instructions did not, they would not exist. 

     The most important part of the subroutine is the core loop where most of the execution time is spent, especially on a long string.  The code before and after this core loop is either executed once, or only on special conditions such as aligning on a dword, locating the zero byte within the word and testing for a legitimate zero byte.  Let's define the core loops.

The following loop I will call the Agner Fog loop.  It is widely attributed to Agner, but I have my doubts.  I keep seeing it elsewhere, and I suspect it is old and was well known to computer science academics before Agner.  It is a 7 line loop. http://www.cl.cam.ac.uk/~am/progtricks.html


  .REPEAT                   ;searching string ....
    MOV EDX,[EAX]           ;next 4 byte gulp (DWORD)
    ADD EAX,DWORD           ;EAX=character pointer
    LEA ECX,[EDX-01010101H] ;propagate if byte is zero
    NOT EDX                 ;set up test pattern
    AND EDX,ECX             ;leftmost bit of zero byte should now be set
    AND EDX,080808080H      ;sieve out zero bytes
  .UNTIL !ZERO?             ;check the next DWORD


     This I will call the Lingo loop.  It is a 7 line loop.


  .REPEAT
    LEA EDX,[ECX+0FEFEFEFFH]
    NOT ECX
    AND ECX,080808080H
    ADD EAX,4
    AND EDX,ECX
    MOV ECX,[EAX]
  .UNTIL !ZERO?             ;check the next DWORD


     Below is the Ratch8 loop. It is a 6 line loop, and used for 8-bit extended ASCII.


  .REPEAT
    MOV EDX,[EAX]           ;next 4 byte gulp (DWORD)
    AND EDX,07F7F7F7FH      ;mask out bit 8
    ADD EAX,DWORD           ;EAX=character pointer
    SUB EDX,01010101H       ;make those zero bytes shine
    AND EDX,ECX             ;sieve out zero bytes
  .UNTIL !ZERO?             ;check the next DWORD


     And finally is the Ratch7 loop.  It is a 5 line loop used for 7-bit ASCII.


  .REPEAT
    MOV EDX,[EAX]           ;next 4 byte gulp (DWORD)
    ADD EAX,DWORD           ;EAX=character pointer
    SUB EDX,01010101H       ;make those zero bytes shine
    AND EDX,ECX             ;sieve out zero bytes
  .UNTIL !ZERO?             ;check the next DWORD


     Theoretically, the Ratch7 loop will execute the fastest, and according to my timings, it does.  If 8-bit is needed, my timings show the Ratch8, Lingo, and Agner loops in a dead heat.  I do not know why I do not get faster speeds on Ratch8 since it only has 6 lines in the loop vs. 7 for Lingo and Agner.  It appears that timing is a tricky thing and I do not pretend to understand it.  I executed your timelen1 routine on my 1 ghz AMD Athlon and as you can see, the results are different than yours were.  By the way, you did not test my 7-bit version, and you transcribed the MOV ECX 80808080H in my 8-bit version to the wrong spot in timelen1.  Also you used an old version of my STRLEN in timelen.


Test routines for correctness:
0 byte misalignment
RatchN       0    1    2    3    5    8   13   22   39   55   89  144  239  999
Lingo32      0    1    2    3    5    8   13   22   39   55   89  144  239  999
1 byte misalignment
RatchN       0    1    2    3    5    8   13   22   39   55   89  144  239  999
Lingo32      0    1    2    3    5    8   13   22   39   55   89  144  239  999
2 byte misalignment
RatchN       0    1    2    3    5    8   13   22   39   55   89  144  239  999
Lingo32      0    1    2    3    5    8   13   22   39   55   89  144  239  999
3 byte misalignment
RatchN       0    1    2    3    5    8   13   22   39   55   89  144  239  999
Lingo32      0    1    2    3    5    8   13   22   39   55   89  144  239  999

Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  239  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

0 byte misalignment
RatchN      10   11   14   17   14   15   19   30   61   73   95  136  220 1029
Lingo32      8    8    9   10   12   13   15   22   53   60   80  120  179 1066

1 byte misalignment
RatchN       4    7   12   18   26   26   33   42   62   76  108  152  227 1050
Lingo32      9   13   17   17   18   20   23   28   58   66   91  136  199 1061

2 byte misalignment
RatchN       4    8   14   18   26   25   32   31   63   75  109  148  222 1048
Lingo32      9   12   14   14   16   18   24   28   57   66   85  120  184 1062

3 byte misalignment
RatchN       5   12   15   18   14   24   20   30   63   76   96  150  219 1028
Lingo32      8   12   14   14   14   17   19   24   53   65   86  124  186 1055

Press enter to exit...


Quote
Ratch,
Isn't this a rehash of what we already covered in the szLen thread, ad nauseam?  Also back in that thread, I suggested a super optimization that you are almost doing in your routine above.

The idea is this: search 7-bit ascii, since it's faster than 8-bit.  But when you find a "zero", check it for bit7=1: 1=>resume the search for 8-bit ascii, 0=>you are done.  In other words use 7-bit search as far as you can take it, then ride the rest of the way using 8-bit search as needed.  In most text, the 8-bit part will never be needed, but when required, it covers 8-bit as well--the best of both worlds..."

I agree with bolded text and you can test the result (see timelen1.asm)

     If it was discussed before, I missed it.  There are presently 10 pages to this thread.  That's a lot of material to plow through.  I guess my contribution to that idea is a 6 line implementation, if that is any achievement.  Ratch
Title: Re: szLen optimize...
Post by: lingo on August 18, 2005, 02:26:57 AM
Ratch, :lol

"The following loop I will call the Agner Fog loop.  It is widely attributed to Agner, but I have my doubts.  I keep seeing it
elsewhere, and I suspect it is old and was well known to computer science academics before Agner.  It is a 7 line loop"


Who cares about the first "author"?
We just use it

If we talk about "doubts" just for example
what about your "new" strlen algo and the similar very old
algo of the Paul Hsieh here:
http://www.azillionmonkeys.com/qed/asmexample.html

Let's compare them:
A. "Update!

While discussing sprite data copying (see next example) I realized that there is a significant improvement for 32-bit x86's that have

slow branching (P-IIs and Athlon.) "
; by Paul Hsieh

    lea     ecx,[ebx-1]
l1: inc     ecx
    test    ecx,3
    jnz     l3
l2: mov     edx,[ecx]        ; U
    mov     eax,07F7F7F7Fh   ;   V
    and     eax,edx          ; U
    add     ecx,4            ;   V
    add     eax,07F7F7F7Fh   ; U
    or      eax,edx          ; U
    and     eax,080808080h   ; U
    cmp     eax,080808080h   ; U
    je      l2               ;   V +1brt
    sub     ecx,4
l3: cmp     byte ptr [ecx],0
    jne     l1
    sub     ecx,ebx

B.  final version of the improved STRLEN by Ratch:
http://www.masmforum.com/simple/index.php?topic=2442.0

"OK, here is my final version of STRLEN, unless someone finds a bug.  It can now detect the obscure 8-bit byte 080H.  It does

this by checking the byte for 080H at the end of the subroutine, and returning to the beginning of the subroutine if that value is

detected. If a lot of 080H bytes are present in the string, a performance penalty will be incurred.  Ratch"

   
004097D1 8B 44 24 04      mov     eax,dword ptr [esp+4]
004097D5 B9 80 80 80 80    mov     ecx,80808080h
Labe_0:
004097DA A8 03            test      al,3
004097DC 74 08            je         004097E6
004097DE F6 00 FF          test      byte ptr [eax],0FFh
004097E1 74 26            je         00409809
Labe_Begin:
004097E3 40                inc       eax 
004097E4 EB F4            jmp      004097DA
;Labe_1:
;mov         ecx, 80808080h -> From 1st version
Labe_2: 
004097E6 8B 10            mov      edx,dword ptr [eax]
004097E8 81E27F7F7F7F and      edx,7F7F7F7Fh
004097EE 83 C0 04          add      eax,4
004097F1 81EA01010101 sub      edx,1010101h
004097F7 23 D1            and      edx,ecx
004097F9 74 EB            je         004097E6  ; Labe_2 

004097FB 83 E8 05          sub      eax,5

Labe_3:
004097FE 40                inc       eax 
004097FF C1 CA 08          ror       edx,8
00409802 73 FA            jae       004097FE  ; Labe_3
00409804 F6 00 FF          test      byte ptr [eax],0FFh
00409807 78 DA            js         004097E3  ; Labe_Begin

Labe_4:
00409809 2B 44 24 04      sub      eax,dword ptr [esp+4]
0040980D C2 04 00          ret       4



"Theoretically, the Ratch7 loop will execute the fastest,
and according to my timings, it does"


Ratch7 is the buliaNaza's algo:
and some time ago I improved it:
http://board.win32asmcommunity.net/index.php?topic=8330.msg77056#msg77056

So "my 7-bit algo" is an improved "buliaNaza's algo" and it is faster than Ratch7 (buliaNaza)
Pls, try to replace inc eax with add eax,1 in your algo too...


"...and according to my timings, it does"
Pls, attach your test files (like me)    :lol

"By the way, you did not test my 7-bit version..."
Because I CAN'T...
It is the most important point in "my algo"
and it is the main reason for me to create the timelen1.asm file
It is Codewarp's point of view too, and I created the timelen1.asm file
just as an answer to him rather than to "offend" your algo:
"In other words use 7-bit search as far as you can take it,
then ride the rest of the way using 8-bit search as needed.
In most text, the 8-bit part will never be needed, but when required,
it covers 8-bit as well--the best of both worlds..." by Codewarp


"My algo" starts the job with the 7-bit code search part and, if it fails,
"automatically" switches to the 8-bit code search part till the end...

"Your algo" works with 7-bit code search OR with 8-bit code
search but can't switch "automatically" if we have a "mixed" string with
7- AND 8-bit symbols in it


...and you transcribed the MOV ECX 80808080H
in my 8-bit version to the wrong spot in timelen1 "



A. your 1st variant

STRLEN:                     ;it all begins here
  MOV EAX,[ESP+DWORD]       ;address of string

  .WHILE TRUE               ;check DWORD alignment

   TEST AL,DWORD-1          ;is DWORD aligned
   .BREAK .IF ZERO?         ;yes, DWORD aligned

   TEST BYTE PTR [EAX],0FFH ;not aligned, check  for zero byte
   JZ @F                    ;jmp if end of string

   INC EAX                  ;prepare to check next byte
  .ENDW                     ;around the horn

  MOV ECX,080808080H        ;sieve mask

  .REPEAT
.......

B. your 2nd variant

STRLEN:                     ;it all begins here
  MOV EAX,[ESP+DWORD]       ;address of string
  MOV ECX,80808080h
......


Ok, the technical error  is mine but it is not so
important here because it isn't in the main loop...
It is an obvious example how the macros "hide"
the pure code..


"Also you used an old version of my STRLEN in timelen."
I downloaded timelen and added
Biterider, Hutch, LingoMMX  and  LingoSSE2 algos ONLY!
I didn't touch your algo (with macros) there


"I guess my contribution to that idea is a 6 line implementation..."
I agree with you that it is your contribution and I used it
as an idea in my new timelen1.asm file...  :U

I preferred Codewarp's point of view about the 5&7 line implementation
rather than the 6 line implementation and just tried to prove it...

Right now I prefer the new 5&6 line implementation couple (see my new timelen1.asm)
rather than the 5&7 line implementation   :lol
   

In conclusion please:
- feel free to edit the test files and algos in your way
  and post them for us...(like me)  :lol
- use the pure code rather then macros (if you can)
- answer the question who will uses my or your GPR algos with
  5&7-5&6 or 6 line implementations if we have similar faster MMX/SSE/SSE2 algos  :lol
- try to optimize my new lingo32 algo  (if you can) (see my new timelen1.asm)   :lol

Here are new results:

Test routines for correctness:
0 byte misalignment
RatchN       0    1    2    3    5    8   13   22   39   55   89  144  239  999
Lingo32      0    1    2    3    5    8   13   22   39   55   89  144  239  999
1 byte misalignment
RatchN       0    1    2    3    5    8   13   22   39   55   89  144  239  999
Lingo32      0    1    2    3    5    8   13   22   39   55   89  144  239  999
2 byte misalignment
RatchN       0    1    2    3    5    8   13   22   39   55   89  144  239  999
Lingo32      0    1    2    3    5    8   13   22   39   55   89  144  239  999
3 byte misalignment
RatchN       0    1    2    3    5    8   13   22   39   55   89  144  239  999
Lingo32      0    1    2    3    5    8   13   22   39   55   89  144  239  999

Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  239  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

0 byte misalignment
RatchN      17   20   20   22   20   22   28   45   61   84  149  186  276 1546
Lingo32     12   15   16   20   17   21   21   31   43   59  128  148  226 1197

1 byte misalignment
RatchN      21   16   23   36   39   40   66   61   86  109  157  216  301 1650
Lingo32      9    15   18   23   24   28   27   39   49   67  128  179  234 1204

2 byte misalignment
RatchN      10   15   28   29   41   33   46   55   86   96  168  203  284 1676
Lingo32     12   15   20   22   26   29   32   32   51   55  125  174  235 1187

3 byte misalignment
RatchN       9   21   32   25   23   32   36   47   79   87  149  196  291 1550
Lingo32     10   17   18   19   18   28   26   31  43   54   125  157  225 1215

Press enter to exit...



Regards,
Lingo

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Ratch on August 18, 2005, 07:45:55 AM
Lingo,

Quote
If we talk about "doubts" just for example
what about your "new" strlen algo and the similar very old
algo of the Paul Hsieh here:
http://www.azillionmonkeys.com/qed/asmexample.html

     The code you present from Paul Hsieh's site is about sprites (whatever they are).  It is a spaghetti code of two or more internal loops that have no resemblance to my search loop.

Quote
Ratch7 is the buliaNaza's algo:
and some time ago I improved it:
http://board.win32asmcommunity.net/index.php?topic=8330.msg77056#msg77056

So "my 7-bit algo" is improved "buliaNaza's algo" and it is faster then Ratch7(buliaNaza)
Pls,try to change inc eax with add eax,1 in your algo too...

     I believe your code referred to by the link was written as a 8-bit algo, because it returns to the loop if a zero is not found.  And it uses a LEA instruction instead of a SUB like my 7-bit algo does, so it is not quite the same.  They are both 5 line core loops, so they should show equal times.  By the way, any byte over 081H will kick your code out of the core loop and slow it up greatly. 

Quote
"By the way, you did not test my 7-bit version..."
Because I CAN'T...
It is the most important point in "my algo"
and it is the main reason for me to create the timelen1.asm file
It is the Codewarp's point of view too and I created timelen1.asm file
just as an answer to him rather then to "offend" your algo:

     If you could test my 8-bit version, I don't see why you cannot test my 7-bit version also.  But no matter. 

Quote
"In other words use 7-bit search as far as you can take it,
then ride the rest of the way using 8-bit search as needed.
In most text, the 8-bit part will never be needed, but when required,
it covers 8-bit as well--the best of both worlds..." by Codewarp

"My algo" starts the job with 7-bit code search part and if it failed
"automatically" switch to 8-bit code search part and tiil to end...
"In other words use 7-bit search as far as you can take it,
then ride the rest of the way using 8-bit search as needed.
In most text, the 8-bit part will never be needed, but when required,
it covers 8-bit as well--the best of both worlds..." by Codewarp

"Your algo" works with 7-bit code search OR with 8-bit code
search but can't switch "automatically" if we have "mixed" string with
7 AND and 8-bit simbols in it

     If you are referring to your code in timelen1, that appears to be a 8-bit search code period.  My 7-bit code expects all characters to be 7-bit ASCII.  If they are not, an error in counting will occur.

Quote
"Your algo" works with 7-bit code search OR with 8-bit code
search but can't switch "automatically" if we have "mixed" string with
7 AND and 8-bit simbols in it

     That is absolutely wrong.  Eight bit code by definition can be from value 0 to 0FFH.  My program only goes out of the loop and returns when the value is 080H, which should be rare.  It can evaluate all the 8-bit values in any order.

Quote
Ok, the technical error  is mine but it is not so
important here because it isn't in the main loop...
It is an obvious example how the macros "hide"
the pure code..

     It becomes important only if there are a lot of 080H bytes in the string.

Quote
In conclusion please:
- feel free to edit the test files and algos in your way
  and post them for us...(like me) 
- use the pure code rather then macros (if you can)
- answer the question who will uses my or your GPR algos with
  5&7-5&6 or 6 line implementations if we have similar faster MMX/SSE/SSE2 algos 
- try to optimize my new lingo32 algo  (if you can) (see my new timelen1.asm)   

     Below are the results of the most recent timelen1.exe run on my machine.  I don't know why your algo shows a little better time than mine, because they both use a 6 line loop.  The difference is not as great as on your machine, however.  As I said previously, there is something about timing that is mysterious.  You refer to my using macros.  If you mean .REPEAT, .WHILE, etc., those are not macros.  They are built-in directives of MASM, but if they confuse you, I will translate them to jumps instead.  The GPR implementations of STRLENs are only good for old CPUs that do not have MMX/SSE instructions, or if MMX is not available for some reason.  Correct me if I am wrong, but doesn't MMX use the same registers as the FPU?  Ratch


Test routines for correctness:
0 byte misalignment
RatchN       0    1    2    3    5    8   13   22   39   55   89  144  239  999
Lingo32      0    1    2    3    5    8   13   22   39   55   89  144  239  999
1 byte misalignment
RatchN       0    1    2    3    5    8   13   22   39   55   89  144  239  999
Lingo32      0    1    2    3    5    8   13   22   39   55   89  144  239  999
2 byte misalignment
RatchN       0    1    2    3    5    8   13   22   39   55   89  144  239  999
Lingo32      0    1    2    3    5    8   13   22   39   55   89  144  239  999
3 byte misalignment
RatchN       0    1    2    3    5    8   13   22   39   55   89  144  239  999
Lingo32      0    1    2    3    5    8   13   22   39   55   89  144  239  999

Proc/Byte    0    1    2    3    5    8   13   22   39   55   89  144  239  999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

0 byte misalignment
RatchN      10   11   13   18   15   16   18   29   61   81   95  136  221 1000
Lingo32      7    7   10   13   12   12   15   22   48   56   77  118  175  883

1 byte misalignment
RatchN       5   10   15   17   23   24   29   39   60   72  106  145  216 1025
Lingo32      9   12   17   17   19   19   23   30   57   68   86  124  187  896

2 byte misalignment
RatchN       5   10   14   17   24   23   30   31   60   74  105  147  218 1021
Lingo32      9   13   13   13   19   18   22   23   54   66   93  123  187  919

3 byte misalignment
RatchN       5   12   14   17   15   24   21   29   62   74   96  148  220  999
Lingo32      8   11   12   13   13   16   19   23   53   65   86  121  185  900

Press enter to exit...

Title: Re: szLen optimize...
Post by: hutch-- on August 18, 2005, 09:01:25 AM
Guys,

Let me ask you this question, I know Lingo is developing on a very late PIV, what box are you using, Ratch?  I ask the question because you both appear not to agree on the benchmarking results and it just may be AMD/Intel differences.
Title: Re: szLen optimize...
Post by: lingo on August 18, 2005, 12:11:09 PM
Ratch,

To be honest I can't understand what is your problem
and what you expect from me...

Hutch,
" it just may be AMD/Intel differences.

f0dder was so kind to test the same files
on his box with  Socket 939 AMD64 3500+,
TWINX1024-3200C2 DDR-400 (2x512MB),
MSI K8N-NEO4-Platinum nForce4 chipset.

http://board.win32asmcommunity.net/index.php?topic=21565.0


Regards,
Lingo
Title: Re: szLen optimize...
Post by: Ratch on August 18, 2005, 03:11:58 PM
Lingo,

Quote
Ratch,

To be honest I can't understand what is your problem
and what you expect from me...

     Why do you perceive there to be a problem between us?  You first made a comment about my reply to Codewarp, which I answered.  Then you answered back and so on.  I do not expect anything from you.

Hutch,

Quote
Guys,

Let me ask you this question, I know Lingo is developing on a very late PIV, what box are you using, Ratch?  I ask the question because you both appear not to agree on the benchmarking results and it just may be AMD/Intel differences.

     Both our algos use a 6 line core loop.  I was wondering why there was so much discrepancy between the results when run on his and my machines, especially on long strings.  This is true even when I run Lingo's timelen1.exe test on my machine.  I fear there is something about timing that none of us know about, or worse, nothing we can do even if we did know.  I use a 1 Ghz AMD Athlon which is a few years old now.  But no matter how fast or slow the machine, the same length of loop should be somewhat the same timing.  Any comments or speculation would be appreciated.  Ratch
Title: Re: szLen optimize...
Post by: Jimg on August 18, 2005, 03:50:17 PM
Ratch, Lingo-

I love this stuff myself, but it has become painfully obvious that athlons and pentiums do not work the same.  You can optimize for one or the other.  I have an athlon myself, and I can optimize all day to save a few cycles based on some carefully selected instructions or strategically placed nop's, and a pentium user would find that the new code runs slower than the old.  It's really no use to argue with each other if we're not using the same cpu, just present your best and let the rest of us pick whichever we want.

That being said, I do think it's really important to have general purpose routines.  There are very few cpus left which don't have mmx capability, but the number that can do sse2 is still a small percentage of the total.  SSE2 is fun and very appealing, but I certainly wouldn't use it without checking if the cpu was capable first.  And checking if the cpu is capable takes longer than any savings in the code itself on these little gp routines.

Also, I agree, I really think our timing assumptions are seriously flawed.  Unfortunately, it's all we have.  There needs to be some way to find the real world performance of a routine as it is normally used.  We seldom call any of these routines over and over a million times in a tight loop.  The real timing test is how long does your code take when it is called just once and is not in the cache.  Does it take longer to execute a routine the first time because it is long and complicated than the time it saves by being tricky?  How do we test such a thing?  I've been playing with calling each routine once in sequence, then repeating the sequence and averaging the results, but I can't get any consistency at all.  Anyone have any ideas?

But all together, most of us are enjoying the competition, just keep it civil and keep those great ideas coming!
Title: Re: szLen optimize...
Post by: Mark Jones on August 18, 2005, 03:55:15 PM
Ratch, if I may butt in with my two cents, I've heard it mentioned on numerous occasions that trying to get a quantitative, definite value on CPU timings is a sure-fire way to drive yourself insane. (http://dukunbola.com/tagboard/e/spin.gif) This is due to the fact that virtually all processor architectures are different - some use more "pipelines" for concurrently executing or staging (or optimizing) instructions, others have a bigger/smaller L2 cache, some physically handle ALU/MMX/SSE differently (or not at all...) yadda, yadda, yadda. It's like measuring harmonic cabin vibrations of various models of Chrysler vehicles... :bg

The easiest solution is as Michael had said, simply time the proc on as many hardware(s) as you can, and just accept the results. :)

Of course if you're looking to optimize the code for a specific processor, check out Agner Fog's and Michael's optimization guides. Maybe if you compared Intel datasheets with AMD datasheets you might find a minimum and maximum execution time for a specific instruction, but even so, cache and optimization(s) are going to skew the actual timings. Keep It Simple. :)
Title: Re: szLen optimize...
Post by: Ratch on August 18, 2005, 03:58:52 PM
Jimg

Quote
It's really no use to argue with each other if we're not using the same cpu, just present your best and let the rest of us pick whichever we want.....

Also, I agree, I really think our timing assumptions are seriously flawed.  Unfortunately, it's all we have....


    Thanks Jimg, your observations are better than both Lingo's and mine put together.  Ratch
Title: Re: szLen optimize...
Post by: Ratch on August 18, 2005, 04:08:31 PM
Mark Jones,
Quote
The easiest solution is as Michael had said, simply time the proc on as many hardware(s) as you can, and just accept the results. :)

     I agree with just about everything you said in your last post.  Unfortunately I don't have the hardware or inclination to test my algo on a suite of platforms.  But no matter.  I usually try to just keep the highly used portions of the programs such as the loops as short as possible, and avoid the "bad" instructions like DIV, LOOP, REP SCASB, XLAT, etc.  Ratch
Title: Re: szLen optimize...
Post by: hutch-- on August 18, 2005, 08:55:39 PM
I worked on an algo recently with Jim and we both use different hardware, mine is a 2.8 gig PIV and Jim was testing on a late model AMD pre 64 bit processor. We were getting different times on different code techniques which displayed the hardware differences so together we produced a version that averaged the best across both processors.

This is basically the joys of writing mixed model code where if you want general purpose code, you must test across a number of different machines and correlate the results.
Title: Re: szLen optimize...
Post by: Codewarp on August 22, 2005, 08:51:26 PM
Quote from: Jimg on August 18, 2005, 03:50:17 PM

...I do think it's really important to have general purpose routines.  There are very few cpus left which don't have mmx capability, but the number that can do sse2 is still a small percentage of the total.  SSE2 is fun and very appealing, but I certainly wouldn't use it without checking if the cpu was capable first.  And checking if the cpu is capable takes longer than any savings in the code itself on these little gp routines.

Jimg,

As I have mentioned before in other postings here, none of these routines should be using cpuid directly--ever!  Rather, the library startup code containing the routines in question should use cpuid to set feature-DWORDs testable from anywhere using a single memory test instruction.  This method reduces the test overhead to a single cycle or less.  My overhead for some of this is down to zero--because my library simply refuses to load if mmx and conditional moves are not supported, so no tests for these features are necessary.  This affords me complete freedom to employ sse, sse2 and sse3 as I see fit anywhere in my supporting libraries.

Quote
Also, I agree, I really think our timing assumptions are seriously flawed.  Unfortunately, it's all we have.  There needs to be some way to find the real world performance of a routine as it is normally used.  We seldom call any of these routines over and over a million times in a tight loop.  The real timing test is how long does your code take when it is called just once and is not in the cache.  Does it take longer to execute a routine the first time because it is long and complicated than the time it saves by being tricky?  How do we test such a thing?  I've been playing with calling each routine once in sequence, then repeating the sequence and averaging the results, but I can't get any consistency at all.  Anyone have any ideas?

Clock cycles are the best measure we have of pure cpu cost of execution implied by a particular sequence of code.  If you muddy the waters by injecting the differences in cache contents, page faults, motherboard design, chipsets, cpu clock multipliers and financial resources for obtaining the fastest memory sticks, clock cycles will tell you little more a State of the Union address--next to nothing.

Your desire for a real-world test is legitimate and shared by many others, but I am afraid that clock cycle counts are not it.  If clock counts represent the best case of a run, why not set up the conditions for the worst case, call the routine exactly once, then report the cycles consumed?  Unfortunately, every machine would report something different--not all that useful.  Our testing is not flawed at all--you have requirements that go beyond this particular metric, that's all.

Furthermore, some aspects that greatly affect execution time--such as cache utilization and locality of reference--cannot be tested outside the context of an entire running application, nor tested with clock cycles.  Real-world performance testing--look elsewhere. 

One of my favorite techniques, is to determine the cost of a particular routine in a complete application context by weighting it down with additional known cost, then measuring the drag on overall performance in real time.  From this data you can compute the percentage of overall execution time taken by this routine, and hence the real time spent in that routine.  You can then compare the difference between your "idealized" clock performance and your real-time performance.  However don't be fooled--this difference will vary across machines and other factors.
Title: Re: szLen optimize...
Post by: Snouphruh on September 15, 2005, 07:12:40 AM
what do you think about this code? :

szBuffer            db 'sdkjahgkyugkuygfkljashdgvlkasgdfkluygqweoiugalsdkf', 0
...
                    push offset szBuffer
                    call myStrLength
...
; size_t myStrLength(const char *pString)
; Pops its own return address and argument, scans forward one byte at a
; time until the terminating 0, and returns the length in eax.
; NOTE(review): clobbers esi and edi without preserving them -- callers
; that rely on the usual Win32 callee-saved convention will break.
myStrLength:
                    pop edi                     ; edi = return address
                    pop esi                     ; esi = pointer to the string (arg consumed)
                    push edi                    ; put the return address back for retn
                    mov edx, esi                ; edx = start address, kept for the length subtraction
ll_Loop:
                    mov al, byte ptr [esi]      ; fetch current character
                    inc esi                     ; advance scan pointer
                    test al, al                 ; terminating 0 reached?
                    jnz ll_Loop                 ; no -> keep scanning
                    dec esi                     ; esi overshot by one; back up to the terminator
                    mov eax, esi                ; eax = address of the 0 byte
                    sub eax, edx                ; length = terminator - start
                    retn                        ; return address was re-pushed above; arg already popped
Title: Re: szLen optimize...
Post by: lingo on September 15, 2005, 10:29:46 PM
Snouphruh, :bg

"what do you think about this code?"

You can use eax and ecx registers (without preserving)
rather than edi and esi

; size_t myStrLength(const char *pString)  -- stdcall (ret 4 discards the arg)
; Lingo's variant: uses only eax/ecx (volatile registers), so nothing
; needs to be preserved.
; Trick: ecx = -pString-1; the loop exits with eax = pString+len+1,
; so eax+ecx = len.
myStrLength:
      or ecx, -1 ; ecx = -1
      mov eax, [esp+4] ; eax->szBuffer
      sub ecx, eax ; ecx->-szBuffer-1
ll_Loop:
              cmp byte ptr [eax], 0 ; terminator reached? (sets ZF)
              lea eax,[eax+1] ; advance pointer; lea preserves the flags set by cmp
              jnz ll_Loop ; not yet -> keep scanning
      add eax, ecx  ; eax = (szBuffer+len+1) + (-szBuffer-1) = len
              ret 4 ; stdcall return, pop the argument
   

Regards,
Lingo
Title: Re: szLen optimize...
Post by: Snouphruh on September 16, 2005, 08:38:34 AM
nice! very nice!
but I heard LEA is slow.
and MOV EAX, [ESP + 4] takes 4 bytes long.

what if...:

; size_t myStrLength(const char *pString)
; Snouphruh's variant: pops the return address into edx and the argument
; into eax, then "returns" with jmp edx.
; NOTE(review): jmp-as-return desynchronizes the CPU's return-address
; predictor, so subsequent rets in the caller chain may mispredict.
myStrLength:
                   or ecx, -1 ; ecx = -1
                   pop edx         ; edx = return address
                   pop eax         ; eax = pointer to the buffer
                   sub ecx, eax ; ecx = -buffer-1
ll_Loop:
                   cmp byte ptr [eax], 0 ; terminator reached? (sets ZF)
                   lea eax, [eax + 1] ; advance pointer; lea leaves the cmp flags untouched
                   jnz ll_Loop ; keep scanning until the 0 byte
                   add eax, ecx ; eax = (buffer+len+1) + (-buffer-1) = len
                   jmp edx ; "return" via the popped address


but my previous example has body loop:

                   mov al, byte ptr [esi]
                   inc esi
                   test al, al
                   jnz ll_Loop

which takes only 2 CPU clocks, 'cause MOV and INC are being paired as well as TEST and JNZ are.
Title: Re: szLen optimize...
Post by: jdoe on December 21, 2008, 07:44:23 AM

Sorry for that old topic revival  :P

I just would like to have some timing results of these functions. Please, don't forget to write your processor AMD/Intel.
These ones are much slower than the others, but I need to choose and I don't know which one is better for Intel.

Thanks a lot


AMD Athlon XP 1800+

Quote
lstrlenA return value     : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191

lstrlenA     : 439 cycles
AzmtStrLen1A : 210 cycles
AzmtStrLen2A : 203 cycles

Press any key to exit...



[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: jj2007 on December 21, 2008, 08:10:28 AM
Celeron M:

lstrlenA return value     : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191

lstrlenA     : 431 cycles
AzmtStrLen1A : 283 cycles
AzmtStrLen2A : 224 cycles
Title: Re: szLen optimize...
Post by: sinsi on December 21, 2008, 08:14:14 AM
q6600

lstrlenA     : 255 cycles
AzmtStrLen1A : 191 cycles
AzmtStrLen2A : 192 cycles

Title: Re: szLen optimize...
Post by: jdoe on December 21, 2008, 08:21:51 AM


Wow sinsi, I'm surprised that lstrlen does not perform worse than that.

Title: Re: szLen optimize...
Post by: Biterider on December 21, 2008, 09:12:01 AM
PIII 500

lstrlenA return value     : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191

lstrlenA     : 457 cycles
AzmtStrLen1A : 241 cycles
AzmtStrLen2A : 229 cycles

Biterider
Title: Re: szLen optimize...
Post by: Biterider on December 21, 2008, 09:25:22 AM
Hi
After looking into your code, the only problem i see is that if the string pointer is not aligned to 4 and the string is at the end of an allocated memory page, the algo can produce a GPF.
I suggest to check the lower 2 bits of the string pointer and jump according to them into to comparison chain. Previously you have to set the lower 2 bits to zero and load ecx with the content of an aligned address.

Biterider
Title: Re: szLen optimize...
Post by: jdoe on December 21, 2008, 09:59:03 AM

You're right, I completely forgot to make the string aligned on a 4-byte boundary.
This algo is made for aligned strings so I'm not planning to change it... but thanks anyway.

:U







[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: MichaelW on December 21, 2008, 11:27:51 AM
I added a (not very well tested) procedure that is essentially one posted by Mark Larson, roughly 3 years ago. I think my very early version of Windows 2000, even though it has the latest SP installed, has a slow version of lstrlen. This is running on a P3:

lstrlenA return value     : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
markl_szlen return value  : 191

lstrlenA      : 826 cycles
AzmtStrLen1A  : 238 cycles
AzmtStrLen2A  : 227 cycles
markl_szlen   : 177 cycles



[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: jdoe on December 21, 2008, 12:19:07 PM
Seeing Michael's results, there is no doubt that coding our own string length algo is worthwhile.

The problem with this algo of Mark Larson is that the string must be terminated by 4 null characters and not just one.


Title: Re: szLen optimize...
Post by: NightWare on December 22, 2008, 01:27:16 AM
if mmx isn't a problem, this version (short, quite fast and reduce branch mispredictions to the minimum) :
ALIGN 16
;
; get the real length of a zero-terminated string
; note: the string should be aligned on 4/8/16 bytes (for best speed)
;
; Syntax :
; mov esi,OFFSET {start address of the string}
; call StringLength_Mmx_Mini
;
; Return :
; eax = string length
;
; Clobbers: eax, MM0 (MMX state left dirty -- caller must handle emms)
; NOTE(review): reads 8 bytes at a time past the terminator, so an
; unaligned string ending right at a page boundary can fault (this is
; discussed later in the thread).
StringLength_Mmx_Mini PROC
push edx ;; save edx (the only other integer register used)

mov eax,esi ;; eax = scan pointer, starts at the string address
; nop ;; ) no alignment padding needed here for best throughput
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
pxor MM0,MM0 ;; clear MM0 (our all-zero comparison register)
Label1: pcmpeqb MM0,QWORD PTR [eax] ;; compare 8 bytes in parallel against 0
pmovmskb edx,MM0 ;; collect the per-byte compare mask into edx
add eax,8 ;; advance by the 8-byte stride
test edx,edx ;; any byte equal to 0 in this block?
jz Label1 ;; no -> scan the next 8-byte block
sub eax,esi ;; eax = bytes scanned (one full stride past the hit)
bsf edx,edx ;; edx = index of the first zero byte within the block
lea eax,[eax+edx-8] ;; length = scanned - 8 (overshoot) + offset in block

pop edx ;; restore edx
ret ;; done
StringLength_Mmx_Mini ENDP

hmm, i don't know the ratio of lamps i could have with this one, but it must be clearly good... (clearly, because of the numbers of lamps, of course...  :lol)
Title: Re: szLen optimize...
Post by: sinsi on December 22, 2008, 03:39:56 AM
That algo gets 47 cycles on mine - very fast! Is it only MMX though? I had to use ".xmm" to get it to compile.
Title: Re: szLen optimize...
Post by: NightWare on December 22, 2008, 03:49:14 AM
you're right, pmovmskb has been implemented only with sse...  :red
Title: Re: szLen optimize...
Post by: qWord on December 22, 2008, 03:52:37 AM
I've modified the testbed (variable alignment) and added an xmm-version:

on my Core2Duo:

lstrlenA return value     : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
markl_szlen return value  : 191
StringLength_Mmx_Min
(MMX/SSE2) return value   : 191
StrSizeA(SSE2) value      : 191

align 0

lstrlenA      :       252 cycles
AzmtStrLen1A  :       193 cycles
AzmtStrLen2A  :       193 cycles
markl_szlen   :       106 cycles
StringLength_Mmx_Min: 72 cycles
StrSizeA(SSE2):       38 cycles

align 1

lstrlenA      :       247 cycles
AzmtStrLen1A  :       216 cycles
AzmtStrLen2A  :       221 cycles
markl_szlen   :       152 cycles
StringLength_Mmx_Min: 103 cycles
StrSizeA(SSE2):       102 cycles

align 4

lstrlenA      :       244 cycles
AzmtStrLen1A  :       193 cycles
AzmtStrLen2A  :       192 cycles
markl_szlen   :       90 cycles
StringLength_Mmx_Min: 109 cycles
StrSizeA(SSE2):       92 cycles

align 7

lstrlenA      :       246 cycles
AzmtStrLen1A  :       203 cycles
AzmtStrLen2A  :       202 cycles
markl_szlen   :       126 cycles
StringLength_Mmx_Min: 96 cycles
StrSizeA(SSE2):       105 cycles

Press any key to exit...





[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: donkey on December 22, 2008, 10:25:48 AM
I wrote a similar one a few years ago for my string library...

; size_t lszLenMMX(const char *pString)  -- GoAsm syntax (FRAME/ENDF)
; Scans 8 bytes per iteration with MMX compares; returns the length in eax.
; The interleaved nops are deliberate code-size padding so the loop label
; lands on a 16-byte boundary.
; NOTE(review): quadword reads past the terminator -- an unaligned string
; ending at a page boundary could fault; assumes 8-byte-aligned strings.
lszLenMMX FRAME pString

mov eax,[pString] ; eax = scan pointer, starts at the string address
nop
nop ; fill in stack frame+mov to 8 bytes

pxor mm0,mm0 ; mm0 = scratch for the compare result
nop ; fill pxor to 4 bytes
pxor mm1,mm1 ; mm1 = all-zero comparison register
nop ; fill pxor to 4 bytes

: ; this is aligned to 16 bytes (GoAsm anonymous label)
movq mm0,[eax] ; load the next 8 bytes of the string
pcmpeqb mm0,mm1 ; per-byte compare against 0
add eax,8 ; advance by the 8-byte stride
pmovmskb ecx,mm0 ; ecx = mask of which bytes matched 0
or ecx,ecx ; any terminator in this block?
jz < ; no -> loop back to the anonymous label

sub eax,[pString] ; eax = bytes scanned (one stride past the hit)

bsf ecx,ecx ; ecx = offset of the first zero byte in the block
sub eax,8 ; undo the overshoot of the last stride
add eax,ecx ; add the in-block offset -> final length

emms ; clear MMX state so FPU code keeps working


   RET

ENDF
Title: Re: szLen optimize...
Post by: japheth on December 22, 2008, 12:34:26 PM

lstrlen() is known not to be the fastest algo.
AFAIK strlen() from MSVCRT is significantly faster. You probably should compare your routine with that version as well.
Title: Re: szLen optimize...
Post by: jdoe on December 22, 2008, 01:30:38 PM
Quote from: japheth on December 22, 2008, 12:34:26 PM

lstrlen() is known not to be the fastest algo.
AFAIK strlen() from MSVCRT is significantly faster. You probably should compare your routine with that version as well.


Yes, the strlen from the C runtime is quite fast but a little slower than a custom Agner Fog algo, though I think they use the same tricks. These functions gain their speed on long strings but are easy to beat on small strings. But it is a good point that if you don't want to code your own algo, the ANSI version of strlen from msvcrt is the best alternative to lstrlen. For the Unicode version, that's another story.

:U

Title: Re: szLen optimize...
Post by: NightWare on December 22, 2008, 10:35:44 PM
hmm, if it's sse due to pmovmskb, then a fully sse version  :toothy :
ALIGN 16
;
; get the real length of a zero-terminated string
; note: the string should be aligned on 4/8/16 bytes (for best speed)
;
; Syntax :
; mov esi,OFFSET {start address of the string}
; call StringLength_Sse
;
; Return :
; eax = string length
;
; Clobbers: eax, XMM0, XMM1 (ecx/edx preserved via push/pop)
; NOTE(review): pcmpeqb with a memory operand architecturally requires a
; 16-byte-aligned address, and the loop reads up to 32 bytes past the
; terminator -- TODO confirm the testbed guarantees both.
StringLength_Sse PROC
push ecx ;; save ecx
push edx ;; save edx

mov edx,esi ;; edx = scan pointer, starts at the string address
; nop ;; ) no alignment padding needed here for best throughput
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
pxor XMM0,XMM0 ;; ) clear XMM0 and XMM1 (our all-zero comparison registers)
pxor XMM1,XMM1 ;; )
; here we test a 32-character block to see whether it contains a 0
Label1: pcmpeqb XMM0,OWORD PTR [edx] ;; compare the 16 bytes at [edx] against 0
pcmpeqb XMM1,OWORD PTR [edx+16] ;; compare the 16 bytes at [edx+16] against 0
por XMM1,XMM0 ;; merge both compare results into XMM1
add edx,OWORD*2 ;; advance by the 32-byte stride
pmovmskb eax,XMM1 ;; collect the merged per-byte mask into eax
test eax,eax ;; any zero byte in this 32-byte block?
jz Label1 ;; none found -> next block
; here we locate the 0 inside the 32-character block
pmovmskb ecx,XMM0 ;; ecx = mask for the low 16 bytes only
shl eax,16 ;; merged mask moves to bits 16..31
or eax,ecx ;; low 16 bits exact; bsf finds a low-half hit first, so high-half false bits are harmless
sub edx,esi ;; edx = bytes scanned so far...
sub edx,OWORD*2 ;; ...minus the last stride's overshoot
bsf eax,eax ;; eax = index of the first zero byte in the block
add eax,edx ;; length = block start offset + in-block index

pop edx ;; restore edx
pop ecx ;; restore ecx
ret ;; done
StringLength_Sse ENDP
Title: Re: szLen optimize...
Post by: lingo on March 06, 2009, 09:01:12 AM
I use similar algo with masm64... :wink
My results->Windows Vista Ultimate 64bit  - SP1
CPU-DualCore Intel Core 2 Duo E8500, 3.16 GHz

C:\My Documents\ASM\strlen>strlena
lstrlenA return value     : 191
strlen64Lingo return value: 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
markl_szlen return value  : 191
StringLength_Mmx_Min
(MMX/SSE2) return value   : 191
StrSizeA(SSE2) value      : 191

align 0

lstrlenA      :       258 cycles
strlen64Lingo :       20 cycles
AzmtStrLen1A  :       191 cycles
AzmtStrLen2A  :       191 cycles
markl_szlen   :       100 cycles
StringLength_Mmx_Min: 54 cycles
StrSizeA(SSE2):       39 cycles

align 1

lstrlenA      :       254 cycles
strlen64Lingo :       20 cycles
AzmtStrLen1A  :       220 cycles
AzmtStrLen2A  :       218 cycles
markl_szlen   :       151 cycles
StringLength_Mmx_Min: 112 cycles
StrSizeA(SSE2):       97 cycles

align 4

lstrlenA      :       254 cycles
strlen64Lingo :       20 cycles
AzmtStrLen1A  :       191 cycles
AzmtStrLen2A  :       191 cycles
markl_szlen   :       89 cycles
StringLength_Mmx_Min: 114 cycles
StrSizeA(SSE2):       113 cycles

align 7

lstrlenA      :       254 cycles
strlen64Lingo :       20 cycles
AzmtStrLen1A  :       200 cycles
AzmtStrLen2A  :       200 cycles
markl_szlen   :       119 cycles
StringLength_Mmx_Min: 99 cycles
StrSizeA(SSE2):       109 cycles

Press any key to exit...




[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: jj2007 on March 06, 2009, 04:04:20 PM
Hi Lingo,
As usual, your algo beats the hell out of 'em... at least for long strings:

lstrlenA return value     : 1024
strlen64Lingo return value: 1024
AzmtStrLen1A return value : 1024
AzmtStrLen2A return value : 1024
markl_szlen return value  : 1024
StringLength_Mmx_Min
(MMX/SSE2) return value   : 1024
StrSizeA(SSE2) value      : 1024
_strlen return value      : 1024

align 1k

lstrlenA return value: 1024
lstrlenA      :       3032 cycle
strlen64Lingo :       370 cycles
AzmtStrLen1A  :       1407 cycle
AzmtStrLen2A  :       1407 cycle
markl_szlen   :       581 cycles
StringLength_Mmx_Min: 908 cycles
StrSizeA(SSE2):       600 cycles
_strlen (Agner Fog):  635 cycles

align 0

lstrlenA return value: 191
lstrlenA      :       630 cycles
strlen64Lingo :       369 cycles
AzmtStrLen1A  :       301 cycles
AzmtStrLen2A  :       307 cycles
markl_szlen   :       276 cycles
StringLength_Mmx_Min: 224 cycles
StrSizeA(SSE2):       114 cycles
_strlen (Agner Fog):  103 cycles

align 1

lstrlenA return value: 191
lstrlenA      :       632 cycles
strlen64Lingo :       370 cycles
AzmtStrLen1A  :       383 cycles
AzmtStrLen2A  :       382 cycles
markl_szlen   :       276 cycles
StringLength_Mmx_Min: 304 cycles
StrSizeA(SSE2):       138 cycles
_strlen (Agner Fog):  111 cycles

align 4

lstrlenA return value: 191
lstrlenA      :       628 cycles
strlen64Lingo :       371 cycles
AzmtStrLen1A  :       301 cycles
AzmtStrLen2A  :       304 cycles
markl_szlen   :       251 cycles
StringLength_Mmx_Min: 339 cycles
StrSizeA(SSE2):       142 cycles
_strlen (Agner Fog):  114 cycles

align 7

lstrlenA return value: 191
lstrlenA      :       628 cycles
strlen64Lingo :       369 cycles
AzmtStrLen1A  :       384 cycles
AzmtStrLen2A  :       387 cycles
markl_szlen   :       274 cycles
StringLength_Mmx_Min: 321 cycles
StrSizeA(SSE2):       144 cycles
_strlen (Agner Fog):  115 cycles


I added the last algo, see attachment.

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: lingo on March 06, 2009, 05:00:36 PM
Thanks, but I can't understand why one would use such big strings...
I included A.Fog's algo too, but it is slower... :wink


C:\My Documents\ASM\strlen>strlena
lstrlenA return value     : 191
strlen64Lingo return value: 191
A.Fog StrLen return value : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
markl_szlen return value  : 191
StringLength_Mmx_Min
(MMX/SSE2) return value   : 191
StrSizeA(SSE2) value      : 191

align 0

lstrlenA      :       259 cycles
strlen64Lingo :       20 cycles
A.Fog StrLen  :       36 cycles
AzmtStrLen1A  :       192 cycles
AzmtStrLen2A  :       191 cycles
markl_szlen   :       100 cycles
StringLength_Mmx_Min: 50 cycles
StrSizeA(SSE2):       39 cycles

align 1

lstrlenA      :       243 cycles
strlen64Lingo :       20 cycles
A.Fog StrLen  :       49 cycles
AzmtStrLen1A  :       222 cycles
AzmtStrLen2A  :       218 cycles
markl_szlen   :       151 cycles
StringLength_Mmx_Min: 113 cycles
StrSizeA(SSE2):       97 cycles

align 4

lstrlenA      :       254 cycles
strlen64Lingo :       20 cycles
A.Fog StrLen  :       44 cycles
AzmtStrLen1A  :       192 cycles
AzmtStrLen2A  :       191 cycles
markl_szlen   :       89 cycles
StringLength_Mmx_Min: 112 cycles
StrSizeA(SSE2):       110 cycles

align 7

lstrlenA      :       243 cycles
strlen64Lingo :       20 cycles
A.Fog StrLen  :       49 cycles
AzmtStrLen1A  :       200 cycles
AzmtStrLen2A  :       200 cycles
markl_szlen   :       119 cycles
StringLength_Mmx_Min: 99 cycles
StrSizeA(SSE2):       109 cycles

Press any key to exit...




[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: jj2007 on March 06, 2009, 05:38:54 PM
Quote from: lingo on March 06, 2009, 05:00:36 PM
Thanks, but I can't understand why to use so big strings...
I included A.Fog's algo too, but it is slower... :wink


Strange. Here are my Celeron M results, and the Agner Fog algo is a lot faster on non-aligned short strings... ::)

align 0

lstrlenA return value: 191
lstrlenA      :       429 cycles
strlen64Lingo :       198 cycles
AzmtStrLen1A  :       283 cycles
AzmtStrLen2A  :       223 cycles
markl_szlen   :       113 cycles
StringLength_Mmx_Min: 72 cycles
StrSizeA(SSE2):       72 cycles
_strlen (Agner Fog):  91 cycles

align 1

lstrlenA return value: 191
lstrlenA      :       422 cycles
strlen64Lingo :       198 cycles
AzmtStrLen1A  :       282 cycles
AzmtStrLen2A  :       230 cycles
markl_szlen   :       144 cycles
StringLength_Mmx_Min: 118 cycles
StrSizeA(SSE2):       87 cycles
_strlen (Agner Fog):  64 cycles
Title: Re: szLen optimize...
Post by: lingo on March 07, 2009, 04:20:22 PM
Nothing strange for me...My results are the same as the results of qWord...  :wink
On my old lapi with AMD Turion 64 ML-30, 1.6GHz
and Vista64bit Ultimate-SP1:

lstrlenA return value     : 191
strlen64Lingo return value: 191
A.Fog StrLen return value : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
markl_szlen return value  : 191
StringLength_Mmx_Min
(MMX/SSE2) return value   : 191
StrSizeA(SSE2) value      : 191

align 0

lstrlenA      :       425 cycles
strlen64Lingo :       55 cycles
A.Fog StrLen  :       223 cycles
AzmtStrLen1A  :       175 cycles
AzmtStrLen2A  :       174 cycles
markl_szlen   :       109 cycles
StringLength_Mmx_Min: 103 cycles
StrSizeA(SSE2):       101 cycles

align 1

lstrlenA      :       425 cycles
strlen64Lingo :       55 cycles
A.Fog StrLen  :       211 cycles
AzmtStrLen1A  :       213 cycles
AzmtStrLen2A  :       218 cycles
markl_szlen   :       111 cycles
StringLength_Mmx_Min: 112 cycles
StrSizeA(SSE2):       106 cycles

align 4

lstrlenA      :       426 cycles
strlen64Lingo :       55 cycles
A.Fog StrLen  :       198 cycles
AzmtStrLen1A  :       175 cycles
AzmtStrLen2A  :       175 cycles
markl_szlen   :       108 cycles
StringLength_Mmx_Min: 112 cycles
StrSizeA(SSE2):       106 cycles

align 7

lstrlenA      :       425 cycles
strlen64Lingo :       55 cycles
A.Fog StrLen  :       197 cycles
AzmtStrLen1A  :       192 cycles
AzmtStrLen2A  :       191 cycles
markl_szlen   :       113 cycles
StringLength_Mmx_Min: 110 cycles
StrSizeA(SSE2):       104 cycles

Press any key to exit...


Title: Re: szLen optimize...
Post by: jj2007 on March 07, 2009, 04:50:58 PM
Quote from: lingo on March 07, 2009, 04:20:22 PM
Nothing strange for me...My results are the same as the results of qWord...  :wink
On my old lapi with AMD Turion 64 ML-30, 1.6GHz
and Vista64bit Ultimate-SP1

Either there are dramatic differences between AMD and a Celeron M, or we are not talking about the same Agner Fog algo.
Title: Re: szLen optimize...
Post by: drizz on March 07, 2009, 05:24:12 PM
All of the routines, except Agner Fog's, fail on unaligned read beyond end of the buffer

invoke VirtualAlloc,0,1000h,MEM_COMMIT,PAGE_READWRITE
mov esi,eax
invoke RtlZeroMemory,esi,1000h

invoke AzmtStrLen1A,addr [esi+1000h-1]
invoke AzmtStrLen2A,addr [esi+1000h-1]
invoke markl_szlen,addr [esi+1000h-1]
invoke StringLength_Mmx_Min,addr [esi+1000h-1]
invoke StrSizeA,addr [esi+1000h-1]
lea ecx,[esi+1000h-1]
call strlen64

invoke _strlen,addr [esi+1000h-1]; *** working, not buggy

invoke VirtualFree,esi,0,MEM_RELEASE
   
zero should be returned, not access violation.


also markl_szlen function is buggy:

   .data
   teststr db 'ab',0,'a',0
   .code
   invoke markl_szlen,addr teststr

reports size 4
Title: Re: szLen optimize...
Post by: herge on March 08, 2009, 08:58:37 PM
 Hi jj2007:


lstrlenA return value     : 1024
strlen64Lingo return value: 1024
AzmtStrLen1A return value : 1024
AzmtStrLen2A return value : 1024
markl_szlen return value  : 1024
StringLength_Mmx_Min
(MMX/SSE2) return value   : 1024
StrSizeA(SSE2) value      : 1024
_strlen return value      : 1024

align 1k

lstrlenA return value: 1024
lstrlenA      :       1090 cycles
strlen64Lingo :       96 cycles
AzmtStrLen1A  :       1071 cycles
AzmtStrLen2A  :       1074 cycles
markl_szlen   :       416 cycles
StringLength_Mmx_Min: 245 cycles
StrSizeA(SSE2):       226 cycles
_strlen (Agner Fog):  195 cycles

align 0

lstrlenA return value: 191
lstrlenA      :       262 cycles
strlen64Lingo :       99 cycles
AzmtStrLen1A  :       195 cycles
AzmtStrLen2A  :       195 cycles
markl_szlen   :       107 cycles
StringLength_Mmx_Min: 49 cycles
StrSizeA(SSE2):       40 cycles
_strlen (Agner Fog):  51 cycles

align 1

lstrlenA return value: 191
lstrlenA      :       241 cycles
strlen64Lingo :       98 cycles
AzmtStrLen1A  :       204 cycles
AzmtStrLen2A  :       205 cycles
markl_szlen   :       154 cycles
StringLength_Mmx_Min: 104 cycles
StrSizeA(SSE2):       91 cycles
_strlen (Agner Fog):  40 cycles

align 4

lstrlenA return value: 191
lstrlenA      :       240 cycles
strlen64Lingo :       97 cycles
AzmtStrLen1A  :       195 cycles
AzmtStrLen2A  :       195 cycles
markl_szlen   :       106 cycles
StringLength_Mmx_Min: 108 cycles
StrSizeA(SSE2):       104 cycles
_strlen (Agner Fog):  45 cycles

align 7

lstrlenA return value: 191
lstrlenA      :       241 cycles
strlen64Lingo :       98 cycles
AzmtStrLen1A  :       213 cycles
AzmtStrLen2A  :       214 cycles
markl_szlen   :       99 cycles
StringLength_Mmx_Min: 105 cycles
StrSizeA(SSE2):       129 cycles
_strlen (Agner Fog):  40 cycles




Result on my computer.

regards herge
Title: Re: szLen optimize...
Post by: NightWare on March 09, 2009, 12:29:11 AM
Quote from: drizz on March 07, 2009, 05:24:12 PM
All of the routines, except Agner Fog's, fail on unaligned read beyond end of the buffer
...
zero should be returned, not access violation.
technically, -1 should be returned (0 for empty string, and obviously here it's not the case  :wink)
anyway, agner's algo isn't (really) safe, he has just displaced the problem from the end of the area to the beginning. so yes, here it works on current systems coz of microsoft functions, but there is no guarantee it will work in the future... anyway it's not the proper way to solve the problem; your functions dedicated to strings should simply contain a "safe area"...
Title: Re: szLen optimize...
Post by: drizz on March 09, 2009, 01:20:35 AM
What are you talking about?!?

Quote from: NightWare on March 09, 2009, 12:29:11 AM
technically, -1 should be returned (0 for empty string, and obviously here it's not the case  :wink)
but _it is_ an empty string

Quote from: NightWare on March 09, 2009, 12:29:11 AM
anyway, agner's algo isn't (really) safe, he has just displaced the problem from the end of the area to the beginning. so yes, here it works on current systems coz of microsoft functions, but there is no warrantry it will work in the futur...
"coz of microsoft functions"  ?!??

Which operating system allocates memory pages that are not 4kB or bigger?
Which operating system memory allocation functions return unaligned pointer?

facts please!

VirtualAlloc/NtAllocateMemory = 4kB aligned pointer
HeapAlloc/GlobalAlloc = 8-byte aligned on 32-bit platforms and 16-bytes on 64-bit


Windows, Linux, BSD or Mac, 32-bit x86
Quote;*************************  strlenSSE2.asm  **********************************
; Author:           Agner Fog
...
; Operating system: Windows, Linux, BSD or Mac, 32-bit x86

His function is safe, others are not!

AzmtStrLen1A - requires 4-byte aligned pointer
AzmtStrLen2A - requires 4-byte aligned pointer
markl_szlen - requires 16-byte aligned pointer
StringLength_Mmx_Min - requires 8-byte aligned pointer
StrSizeA - requires 16-byte aligned pointer
strlen64 - requires 32-byte aligned pointer
_strlen - no pointer alignment requirements
Title: Re: szLen optimize...
Post by: jj2007 on March 09, 2009, 02:02:54 AM
Quote from: NightWare on March 09, 2009, 12:29:11 AM
anyway, agner's algo isn't (really) safe, he has just displaced the problem from the end of the area to the beginning.

Agner's algo will fail, like all others, in the specific case more thoroughly described here (http://www.masm32.com/board/index.php?topic=10925.msg80375#msg80375). The issue is a non-issue for all real world applications; i.e. you must construct a test case where no null byte is being found near the page boundary, and the memory is allocated with VirtualAlloc. HeapAlloc'ed memory does not throw an exception if you go some bytes beyond the page boundary.

The trick with his algo is that he starts on a safe boundary, and then uses a bsf to eliminate false hits. While bsf is listed as a very slow opcode in opcodes.chm, with 6-42 cycles, my tests show that it is now down to 2 cycles. The AF algo is pretty fast, and it seems only Lingo's routine can beat it, and for longer strings only.
Title: Re: szLen optimize...
Post by: NightWare on March 09, 2009, 02:29:59 AM
Quote from: drizz on March 09, 2009, 01:20:35 AM
but _it is_ an empty string
0 must be returned only if the first byte is 0, and no other case, on a manipulated string area you can have no 0 (displacement of the 2nd part+insert) and the algo will fail... coz there is no size limit (+with this logic, you must take care of the possible fullfilled area for ALL of your string functions (copy, insert,...))

Quote from: drizz on March 09, 2009, 01:20:35 AM
"coz of microsoft functions"  ?!??
ok here i've misread agner's algo, it's safe until what's previously said
Title: Re: szLen optimize...
Post by: drizz on March 09, 2009, 02:47:48 AM
Quote from: NightWare0 for empty string, and obviously here it's not the case
Quote from: NightWare on March 09, 2009, 02:29:59 AM
0 must be returned only if the first byte is 0
the first byte IS 0  :dazzled:  :dazzled:  :dazzled:
I'm not talking nonsense situations like jj
Title: Re: szLen optimize...
Post by: NightWare on March 09, 2009, 03:02:18 AM
Quote from: NightWare on March 09, 2009, 02:29:59 AM
on a manipulated string area you can have no 0 (displacement of the 2nd part+insert) and the algo will fail... coz there is no size limit (+with this logic, you must take care of the possible fullfilled area for ALL of your string functions (copy, insert,...))
:red oops correction... here the non 0 is possible because I USE a safe area... otherwise you're right drizz, no access violation from agner's algo. but i will continue to use my security area...
Title: Re: szLen optimize...
Post by: lingo on March 09, 2009, 03:37:18 AM
jj2007, If I am not wrong may be there is an error in your test
program for my results:
strlen64Lingo :    370 cycles -> align 1k vs
strlen64Lingo :    369 cycles ->align 0      
and from herge's test:
strlen64Lingo :    96 cycles ->align 1k
strlen64Lingo :    99 cycles  ->align 0
What is this: nonsense or manipulation...  :lol


from your test:
align 1k

lstrlenA return value: 1024
lstrlenA      :       3032 cycle
strlen64Lingo :                        370 cycles
AzmtStrLen1A  :       1407 cycle
AzmtStrLen2A  :       1407 cycle
markl_szlen   :       581 cycles
StringLength_Mmx_Min: 908 cycles
StrSizeA(SSE2):       600 cycles
_strlen (Agner Fog):  635 cycles

align 0

lstrlenA return value: 191
lstrlenA      :       630 cycles
strlen64Lingo :                    369 cycles
AzmtStrLen1A  :       301 cycles
AzmtStrLen2A  :       307 cycles
markl_szlen   :       276 cycles
StringLength_Mmx_Min: 224 cycles
StrSizeA(SSE2):       114 cycles
_strlen (Agner Fog):  103 cycles







align 1k

lstrlenA return value: 1024
lstrlenA      :       1090 cycles
strlen64Lingo :       96 cycles
AzmtStrLen1A  :       1071 cycles
AzmtStrLen2A  :       1074 cycles
markl_szlen   :       416 cycles
StringLength_Mmx_Min: 245 cycles
StrSizeA(SSE2):       226 cycles
_strlen (Agner Fog):  195 cycles

align 0

lstrlenA return value: 191
lstrlenA      :       262 cycles
strlen64Lingo :       99 cycles
AzmtStrLen1A  :       195 cycles
AzmtStrLen2A  :       195 cycles
markl_szlen   :       107 cycles
StringLength_Mmx_Min: 49 cycles
StrSizeA(SSE2):       40 cycles
_strlen (Agner Fog):  51 cycles
Title: Re: szLen optimize...
Post by: sinsi on March 09, 2009, 03:56:29 AM
I get the same numbers as herge did - looks like herge has a new computer :bg
Title: Re: szLen optimize...
Post by: jj2007 on March 09, 2009, 07:44:22 AM
Quote from: drizz on March 09, 2009, 02:47:48 AM
I'm not talking nonsense situations like jj

It was precisely my intention to prove that such algos fail only in nonsense situations. There was an earlier thread where people argued that SSE2 is bad because it may read 16 bytes "beyond", except only "one harmless byte".

Quote from: lingo on March 09, 2009, 03:37:18 AM
jj2007, If I am not wrong may be there is an error in your test
..
What is this: nonsense or manipulation...  :lol

No manipulation. I posted the source above, so you can check. But the result is pretty odd indeed, worth investigating.
Title: Re: szLen optimize...
Post by: herge on March 09, 2009, 09:16:24 AM
 Hi All:

I am having trouble compiling StrLenaLingo.asm


C:\masm32\test>\masm32\bin\ml /c /coff /Zi /Zd /Fl "strlenalingo".asm 
Assembling: strlenalingo.asm
strlenalingo.asm(471) : error A2008: syntax error : xmm
strlenalingo.asm(485) : error A2008: syntax error : xmm
strlenalingo.asm(563) : error A2008: syntax error : movdqa

movdqa   xmm1, [eax]           ; read from nearest preceding boundary << 471

movdqa   xmm1, [eax]           ; read 16 bytes aligned << 485

@@: movdqa xmm0,OWORD ptr [edx]          ; << 563


I don't know much about xmm code.
So I don't know how to fix it.

Regards herge

Title: Re: szLen optimize...
Post by: lingo on March 09, 2009, 01:00:05 PM
drizz,
'strlen64 - requires 32-byte aligned pointer'
should be: strlen64 - requires 16-byte aligned pointer :wink
Title: Re: szLen optimize...
Post by: jj2007 on March 09, 2009, 01:37:32 PM
Quote from: jj2007 on March 09, 2009, 07:44:22 AM
Quote from: lingo on March 09, 2009, 03:37:18 AM
jj2007, If I am not wrong may be there is an error in your test
..
What is this: nonsense or manipulation...  :lol

No manipulation. I posted the source above, so you can check. But the result is pretty odd indeed, worth investigating.

I investigated, and Lingo is right, there was an error: His code was always called with the address of the 1024 bytes string, which was kind of unfair :red

Now I took the best of two worlds, i.e. Lingo's speed and Agner's brilliant alignment scheme, and threw them together. The result (shown as strlen32) is, ehm, how to put it: just about good enough for my own private library: :bg

EDIT: Bug fixed - there was a "hole" of 16 bytes between the two parts.

Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)

align 1k
lstrlenA return value: 1024
strlen32 return value: 1024
lstrlenA      :       3096 cycles
strlen32      :       347 cycles
strlen64Lingo :       371 cycles
StrSizeA(SSE2):       594 cycles
_strlen (Agner Fog):  633 cycles

align 0
lstrlenA return value: 191
strlen32 return value: 191
lstrlenA      :       637 cycles
strlen32      :       79 cycles
strlen64Lingo :       92 cycles
StrSizeA(SSE2):       122 cycles
_strlen (Agner Fog):  105 cycles

align 1
lstrlenA return value: 191
strlen32 return value: 191
lstrlenA      :       625 cycles
strlen32      :       78 cycles
strlen64Lingo :       not possible
StrSizeA(SSE2):       139 cycles
_strlen (Agner Fog):  112 cycles


Here is the algo, full code is attached.
@Herge: It won't compile with Masm v614 - use JWasm instead.


;------------------------------------------------------------------------
; strlen32 - SSE2 string length: Agner Fog's aligned-entry scheme for the
; first (possibly misaligned) block, then Lingo's unrolled 32-byte scan.
; In:    [esp+4] = src, pointer to zero-terminated string (any alignment)
; Out:   eax = length
; Clobb: ecx, edx, xmm0, xmm1, flags
; Safety: every load is 16-byte aligned, so no read crosses a page
; boundary that the string itself does not touch; up to 31 bytes past the
; terminator may be read, but only within the same aligned 32-byte block.
;------------------------------------------------------------------------
strlen32 proc src:DWORD ; jj 9 March 2009, 92 (down from 103) bytes
mov eax, [esp+4] ; get pointer to string: -- this part taken from Agner Fog --------
mov ecx, eax ; copy pointer
pxor xmm0, xmm0 ; set to zero for comparison
and eax, -16 ; align pointer down to a 16-byte boundary
and ecx, 15 ; cl = misalignment within the block (0..15)
pcmpeqb xmm0, [eax] ; read 16 bytes from nearest preceding boundary, compare with zero
pmovmskb edx, xmm0 ; one mask bit per byte (bit set = byte was zero)
shr edx, cl ; shift out false hits from bytes before the string start
shl edx, cl ; shift back again so bit positions match byte offsets
bsf edx, edx ; edx = index of first zero byte; ZF=1 if mask was empty
jnz fdr1 ; found in round 1 (bsf cleared ZF)

add eax, 16 ; step past the 16 bytes already tested above
pxor xmm0, xmm0 ; reset to zero; pcmpeqb of a zero register with non-zero
pxor xmm1, xmm1 ; bytes yields zero, so no reset is needed inside the loop
; align 16 ; no good, costs about one cycle extra
@@: pcmpeqb xmm0, [eax] ; -------------- this part taken from Lingo --------------
pcmpeqb xmm1, [eax+16] ; test the second 16-byte half of this 32-byte block
por xmm1, xmm0 ; combine: any zero byte in either half sets a lane
pmovmskb edx, xmm1 ; one bit per byte of the combined result
add eax, 32 ; len counter (moving up costs 3 cycles for the 191 byte string)
test edx, edx
jz @B ; keep scanning while no zero byte was seen

pmovmskb ecx, xmm0 ; mask of the first half only
shl edx, 16 ; combined mask into bits 16..31
or edx, ecx ; low 16 = first half, high 16 = combined -> bsf gives block offset
bsf edx, edx ; edx = offset of terminator within the 32-byte block
sub eax, 32 ; rewind eax to the start of the block just tested
fdr1: sub eax, [esp+4] ; eax = block base minus original pointer (may be negative)
add eax, edx ; plus in-block index = string length
ret 4
strlen32 endp

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: jj2007 on March 09, 2009, 04:42:39 PM
Probably a stupid question:

pxor xmm0, xmm0 ; reset to zero for comparisons below
pxor xmm1, xmm1
if 1 ; crashtest - some values will be incorrect
movdqa xmm1, Minus1
endif
; align 16 ; no good, costs about one cycle extra
@@: pcmpeqb xmm0, [eax] ; -------------- this part taken from Lingo --------------
pcmpeqb xmm1, [eax+16] ; ecx is pointer to initial string, 16-byte aligned
por xmm1, xmm0
pmovmskb edx, xmm1
add eax, 32 ; len counter (moving up costs 3 cycles for the 191 byte string)
test edx, edx
jz @B


Why is it apparently not necessary to reset xmm0 and xmm1 inside the loop? If I insert a movdqa xmm1, Minus1 before the loop, the algo will not work correctly for some strings; but although xmm1 changes a lot inside the loop, results seem not to be affected :dazzled:
Title: Re: szLen optimize...
Post by: jj2007 on March 09, 2009, 07:39:27 PM
0.183 cycles per byte seems quite acceptable - a factor 10 faster than lstrlen. In contrast to the P4, here Lingo's algo is a little bit faster for short strings.

Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (SSE3)
strlen32 codesize=92

align 4k
lstrlenA return value: 4096
strlen32 return value: 4096
strlen32      :       749 cycles
strlen64Lingo :       761 cycles
_strlen (Agner Fog):  1095 cycles

align 1k
lstrlenA return value: 1024
strlen32 return value: 1024
strlen32      :       199 cycles
strlen64Lingo :       200 cycles
_strlen (Agner Fog):  271 cycles

align 0
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       48 cycles
strlen64Lingo :       44 cycles
_strlen (Agner Fog):  91 cycles

align 1
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       48 cycles
strlen64Lingo :       not possible
_strlen (Agner Fog):  64 cycles

align 4
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       48 cycles
strlen64Lingo :       not possible
_strlen (Agner Fog):  64 cycles

align 7
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       49 cycles
strlen64Lingo :       not possible
_strlen (Agner Fog):  64 cycles
Title: Re: szLen optimize...
Post by: GregL on March 09, 2009, 07:45:10 PM
Herge,

You could use MASM 6.15 or later also.

Title: Re: szLen optimize...
Post by: herge on March 09, 2009, 10:01:38 PM

Hi Greg:

I tried the ML.EXE that you can get if you have c++ 2005 from Microsoft.
It compiles okay, but you get a C...5 error access violation and you
send a message to Microsoft when you run the EXE.

lstrlenA return value     : 1024
strlen64Lingo return value: 1024
AzmtStrLen1A return value : 1024
AzmtStrLen2A return value : 1024
markl_szlen return value  : 1024
StringLength_Mmx_Min
(MMX/SSE2) return value   : 1024
StrSizeA(SSE2) value      : 1024
_strlen return value      : 1024

align 1k

lstrlenA return value: 1024
lstrlenA      :       1082 cycles
strlen64Lingo :       84 cycles
AzmtStrLen1A  :       1061 cycles
AzmtStrLen2A  :       1061 cycles
markl_szlen   :       415 cycles
StringLength_Mmx_Min: 275 cycles
StrSizeA(SSE2):       168 cycles
_strlen (Agner Fog):  183 cycles

align 0

lstrlenA return value: 191
lstrlenA      :       264 cycles
strlen64Lingo :       85 cycles
AzmtStrLen1A  :       194 cycles
AzmtStrLen2A  :       194 cycles
markl_szlen   :       107 cycles
StringLength_Mmx_Min: 71 cycles
StrSizeA(SSE2):       27 cycles
_strlen (Agner Fog):  37 cycles

align 1

lstrlenA return value: 191
lstrlenA      :       240 cycles
strlen64Lingo :       86 cycles
AzmtStrLen1A  :       203 cycles
AzmtStrLen2A  :       203 cycles
markl_szlen   :       154 cycles
StringLength_Mmx_Min: 109 cycles
StrSizeA(SSE2):       ; It Blows up HERE!



Microsoft writes a report.
C:\DOCUME~1\User\LOCALS~1\Temp\a488_appcompat.txt
Which for reasons I don't understand I can't find.
It does a dump in a list box you can Not Copy.
Which I must Say is Most helpful!

I believe we get a C5 error access violation.

Attachments StrLenaLingo ASM OBJ EXE PDB

Regards herge




[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: jj2007 on March 09, 2009, 10:46:52 PM
Hi Herge,
There is a new version towards the bottom of page 13 of this thread, in this post (http://www.masm32.com/board/index.php?topic=1807.msg81053#msg81053). You have a previous one with a tiny bug:

;------------------------------------------------------------------------
; StrSizeA - SSE2 string length (earlier version; the surrounding post
; says it contains "a tiny bug" and was superseded by strlen32).
; In:    [esp+4] = lpStrA, pointer to zero-terminated string
; Out:   eax = length
; Clobb: ecx, edx, xmm0, xmm1, flags
; NOTE(review): movdqu scans 16 bytes per step starting at the *unaligned*
; string address, so up to 15 bytes past the terminator are read and,
; unlike an aligned scan, a read can cross a page boundary -- presumably
; the crash reported elsewhere in this thread; confirm before reuse.
;------------------------------------------------------------------------
StrSizeA proc lpStrA:DWORD
   
@@:   mov edx,DWORD ptr [esp+4] ; edx = string pointer (this @@ label is never jumped to)
   pxor xmm1,xmm1 ; xmm1 = all zero, the comparison reference
   mov ecx,edx ; ecx = pointer copy...
      neg ecx ; ...negated, so edx+ecx = 0 and later edx+ecx = bytes scanned
      align 16
@@:   movdqu xmm0,OWORD ptr [edx] ; load 16 bytes (unaligned) from current position
      lea edx,[edx+16]   ; advance pointer without touching flags
      pcmpeqb xmm0,xmm1 ; lanes become FFh where a byte was zero
      pmovmskb eax,xmm0 ; one mask bit per byte
   test eax,eax   
      jz @B ; loop until a zero byte appears in the block

@@:   lea ecx,[edx+ecx-16] ; ecx = offset of the block containing the terminator
      xor edx,edx ; redundant: bsf below always writes edx since eax != 0 here
      bsf edx,eax ; edx = index of terminator within the block
   lea eax,[ecx+edx] ; length = block offset + in-block index
   ret 4

StrSizeA endp

The new version strlen32 is faster and shorter and does not crash.
Title: Re: szLen optimize...
Post by: herge on March 09, 2009, 11:02:02 PM

Hi jj2007:

We Have Lift Off!


lstrlenA return value     : 1024
strlen64Lingo return value: 1024
AzmtStrLen1A return value : 1024
AzmtStrLen2A return value : 1024
markl_szlen return value  : 1024
StringLength_Mmx_Min

(MMX/SSE2) return value   : 1024
StrSizeA(SSE2) value      : 1024
_strlen return value      : 1024

align 1k


lstrlenA return value: 1024
lstrlenA      :       1077 cycles
strlen64Lingo :       84 cycles
AzmtStrLen1A  :       1056 cycles
AzmtStrLen2A  :       1056 cycles
markl_szlen   :       413 cycles
StringLength_Mmx_Min: 275 cycles
StrSizeA(SSE2):       224 cycles
_strlen (Agner Fog):  182 cycles

align 0


lstrlenA return value: 191
lstrlenA      :       259 cycles
strlen64Lingo :       83 cycles
AzmtStrLen1A  :       194 cycles
AzmtStrLen2A  :       193 cycles
markl_szlen   :       105 cycles
StringLength_Mmx_Min: 71 cycles
StrSizeA(SSE2):       38 cycles
_strlen (Agner Fog):  37 cycles

align 1


lstrlenA return value: 191
lstrlenA      :       238 cycles
strlen64Lingo :       84 cycles
AzmtStrLen1A  :       201 cycles
AzmtStrLen2A  :       202 cycles
markl_szlen   :       152 cycles
StringLength_Mmx_Min: 109 cycles
StrSizeA(SSE2):       91 cycles
_strlen (Agner Fog):  49 cycles

align 4


lstrlenA return value: 191
lstrlenA      :       239 cycles
strlen64Lingo :       84 cycles
AzmtStrLen1A  :       191 cycles
AzmtStrLen2A  :       191 cycles
markl_szlen   :       105 cycles
StringLength_Mmx_Min: 95 cycles
StrSizeA(SSE2):       104 cycles
_strlen (Agner Fog):  44 cycles

align 7


lstrlenA return value: 191
lstrlenA      :       235 cycles
strlen64Lingo :       84 cycles
AzmtStrLen1A  :       211 cycles
AzmtStrLen2A  :       210 cycles
markl_szlen   :       98 cycles
StringLength_Mmx_Min: 106 cycles
StrSizeA(SSE2):       138 cycles
_strlen (Agner Fog):  49 cycles


Thank you jj2007.

Regards herge
Title: Re: szLen optimize...
Post by: NightWare on March 10, 2009, 02:25:51 AM
Quote from: jj2007 on March 09, 2009, 04:42:39 PM
Why is it apparently not necessary to reset xmm0 and xmm1 inside the loop?
because xmm0 and xmm1 are defined as zero during the comparisons (until a 0 is found)

here a new one, but must be tested (i've just made few test during conception) :
ALIGN 16
;
; syntax :
; mov esi,OFFSET String
; call NWStrLen
;
; Return :
; eax = String Length
;
;------------------------------------------------------------------------
; NWStrLen - SSE2 string length (NightWare). Author notes it "must be
; tested" - only lightly verified at the time of posting.
; In:    esi = pointer to zero-terminated string (any alignment)
; Out:   eax = string length; ecx, edx preserved (saved/restored below)
; Clobb: xmm0, xmm1, xmm2, flags
; NOTE(review): the first-block path combines a mask taken from the
; *unaligned* load (xmm0/xmm2) with one from the aligned [edx+16] load,
; then shifts by the misalignment; the lane-to-address mapping of the two
; masks differs, so this path needs verification - TODO confirm.
;------------------------------------------------------------------------
NWStrLen PROC
push ecx ;; save ecx
push edx ;; save edx

mov edx,esi ;; copy the start address into edx
pxor XMM0,XMM0 ;; ) clear XMM0 and XMM1 (these are our comparison registers)
pxor XMM1,XMM1 ;; )
; here we test a first block of x characters (size depends on alignment) for a zero byte
movdqu XMM2,OWORD PTR [edx] ;; load the oword at [edx] (unaligned) into XMM2
and edx,0FFFFFFF0h ;; round edx down to the preceding 16-byte boundary
pcmpeqb XMM0,XMM2 ;; compare XMM2 with XMM0 (zero)
pcmpeqb XMM1,OWORD PTR [edx+16] ;; compare the oword at [edx+16] with XMM1
por XMM1,XMM0 ;; merge XMM1 and XMM0
pmovmskb eax,XMM1 ;; build the byte mask of XMM1 in eax
test eax,eax ;; set flags from eax
jz Label1 ;; if zero (no zero byte found), go to Label1
; here we locate the zero byte within the first block of x characters
shl eax,16 ;; shift eax (already holding the merged mask) left by half a dword
mov ecx,esi ;; put the original address into ecx
sub ecx,edx ;; subtract the preceding boundary: cl = misalignment
shr eax,cl ;; shift eax right by the misalignment
pmovmskb ecx,XMM0 ;; build the byte mask of XMM0 in ecx
or eax,ecx ;; merge ecx into eax
bsf eax,eax ;; scan eax for the first set bit from the right = length

pop edx ;; restore edx
pop ecx ;; restore ecx
ret ;; return (leave the procedure)

nop ;; ) padding needed to align the loop for better throughput
nop ;; )
nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; here we test a block of 32 characters for a zero byte
Label1: add edx,OWORD*2 ;; add 32 (our stride) to edx
pcmpeqb XMM0,OWORD PTR [edx] ;; compare the oword at [edx] with XMM0
pcmpeqb XMM1,OWORD PTR [edx+16] ;; compare the oword at [edx+16] with XMM1
por XMM1,XMM0 ;; merge XMM1 and XMM0
pmovmskb eax,XMM1 ;; build the byte mask of XMM1 in eax
test eax,eax ;; set flags from eax
jz Label1 ;; if zero (no zero byte found), keep looping
; here we locate the zero byte within the 32-character block
pmovmskb ecx,XMM0 ;; build the byte mask of XMM0 in ecx
shl eax,16 ;; shift eax (already holding the XMM1 mask) left by half a dword
or eax,ecx ;; merge ecx into eax: low 16 bits = first half of the block
sub edx,esi ;; subtract the start address from edx: edx = block offset
bsf eax,eax ;; scan eax for the first set bit from the right
add eax,edx ;; add edx to eax to obtain the final length

pop edx ;; restore edx
pop ecx ;; restore ecx
ret ;; return (leave the procedure)
NWStrLen ENDP
Title: Re: szLen optimize...
Post by: lingo on March 10, 2009, 05:34:16 AM
I modified a bit my strlen64 and created new strlen64A (Thanks to NightWare for movdqu idea)  :wink
I used jj's test program and have new results:

Intel(R) Core(TM)2 Duo CPU     E8500  @ 3.16GHz (SSE4)
strlen32 retval: 5, 10, 15, 20, 25, 30, 35, 40, 45
strlen32 retval: 5, 10, 15, 20, 25, 30, 35, 40, 45
lstrlenA return value     : 1024
strlen64Lingo return value: 1024
strlen32 return value:      1024
StrSizeA(SSE2) value      : 1024
_strlen return value      : 1024

strlen64A return value      : 1024

align 1k
lstrlenA return value: 1024
strlen32 return value: 1024
strlen32      :       105 cycles
strlen64Lingo :       84 cycles
strlen64LingoA:       83 cycles
_strlen (Agner Fog):  180 cycles

align 0
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       26 cycles
strlen64Lingo :       18 cycles
strlen64LingoA:       19 cycles
_strlen (Agner Fog):  40 cycles

align 1
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       26 cycles
strlen64Lingo :       not possible
strlen64LingoA:       22 cycles
_strlen (Agner Fog):  50 cycles

align 4
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       26 cycles
strlen64Lingo :       not possible
strlen64LingoA:       23 cycles
_strlen (Agner Fog):  50 cycles

align 7
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       26 cycles
strlen64Lingo :       not possible
strlen64LingoA:       23 cycles
_strlen (Agner Fog):  50 cycles

Press any key to exit...





[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: sinsi on March 10, 2009, 05:45:06 AM
This is getting good...

Intel(R) Core(TM)2 Quad CPU    Q6600  @ 2.40GHz (SSE4)
strlen32 retval: 5, 10, 15, 20, 25, 30, 35, 40, 45
strlen32 retval: 5, 10, 15, 20, 25, 30, 35, 40, 45
lstrlenA return value     : 1024
strlen64Lingo return value: 1024
strlen32 return value:      1024
StrSizeA(SSE2) value      : 1024
_strlen return value      : 1024

strlen64A return value      : 1024

align 1k
lstrlenA return value: 1024
strlen32 return value: 1024
strlen32      :       97 cycles
strlen64Lingo :       84 cycles
strlen64LingoA:       78 cycles
_strlen (Agner Fog):  178 cycles

align 0
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       24 cycles
strlen64Lingo :       19 cycles
strlen64LingoA:       20 cycles
_strlen (Agner Fog):  40 cycles

align 1
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       29 cycles
strlen64Lingo :       not possible
strlen64LingoA:       23 cycles
_strlen (Agner Fog):  49 cycles

align 4
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       25 cycles
strlen64Lingo :       not possible
strlen64LingoA:       23 cycles
_strlen (Agner Fog):  49 cycles

align 7
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       24 cycles
strlen64Lingo :       not possible
strlen64LingoA:       23 cycles
_strlen (Agner Fog):  49 cycles

Hey jj, the CPU identification is good, now add the Windows version to it as well.  :bg
Title: Re: szLen optimize...
Post by: jj2007 on March 10, 2009, 10:14:32 AM
Quote from: sinsi on March 10, 2009, 05:45:06 AM
This is getting good...
...
Hey jj, the CPU identification is good, now add the Windows version to it as well.  :bg

XP unless otherwise specified. Speedwise, it should not make any difference. You may check this thread (http://www.masm32.com/board/index.php?topic=8802.msg64219#msg64219), but warning, what M$ expects us to do to detect the version is no good for your mental health.

I have incorporated Lingo's new algo, and replaced lstrlen with crt_strlen because lstrlen is no longer a serious competitor for these algos.

              Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)
codesizes: strlen32=92, strlen64A=117, _strlen=66

-- test 16k           return values jj, Lingo, Agner: 16384, 16384, 16384
crt_strlen    :       16155 cycles
strlen32      :       4819 cycles
strlen64LingoA :      6208 cycles
_strlen (Agner Fog):  10044 cycles

-- test 4k            return values jj, Lingo, Agner: 4096, 4096, 4096
crt_strlen    :       3973 cycles
strlen32      :       1144 cycles
strlen64LingoA :      1137 cycles
_strlen (Agner Fog):  2308 cycles

-- test 1k            return values jj, Lingo, Agner: 1024, 1024, 1024
crt_strlen    :       1046 cycles
strlen32      :       362 cycles
strlen64LingoA :      357 cycles
_strlen (Agner Fog):  651 cycles

-- test 0             return values jj, Lingo, Agner: 191, 191, 191
crt_strlen    :       260 cycles
strlen32      :       73 cycles
strlen64LingoA :      78 cycles
_strlen (Agner Fog):  108 cycles

-- test 1             return values jj, Lingo, Agner: 191, 191, 191
crt_strlen    :       255 cycles
strlen32      :       84 cycles
strlen64LingoA :      91 cycles
_strlen (Agner Fog):  115 cycles

-- test 4             return values jj, Lingo, Agner: 191, 191, 191
crt_strlen    :       242 cycles
strlen32      :       78 cycles
strlen64LingoA :      80 cycles
_strlen (Agner Fog):  116 cycles

-- test 7             return values jj, Lingo, Agner: 191, 191, 191
crt_strlen    :       257 cycles
strlen32      :       79 cycles
strlen64LingoA :      80 cycles
_strlen (Agner Fog):  111 cycles

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: herge on March 10, 2009, 11:20:29 AM
 Hi jj2007:

Good Morning here are my results for
strlenSSE2.exe


Intel(R) Core(TM)2 Duo CPU     E4600  @ 2.40GHz (SSE4)
codesizes: strlen32=92, strlen64A=117, _strlen=66

-- test 16k       return values Lingo, jj, Agner: 16384, 16384, 16384
crt_strlen    :       9666 cycles
strlen32      :       1479 cycles
strlen64LingoA :      1139 cycles
_strlen (Agner Fog):  2817 cycles

-- test 4k       return values Lingo, jj, Agner: 4096, 4096, 4096
crt_strlen    :       2427 cycles
strlen32      :       405 cycles
strlen64LingoA :      333 cycles
_strlen (Agner Fog):  720 cycles

-- test 1k       return values Lingo, jj, Agner: 1024, 1024, 1024
crt_strlen    :       648 cycles
strlen32      :       101 cycles
strlen64LingoA :      98 cycles
_strlen (Agner Fog):  197 cycles

-- test 0       return values Lingo, jj, Agner: 191, 191, 191
crt_strlen    :       123 cycles
strlen32      :       26 cycles
strlen64LingoA :      20 cycles
_strlen (Agner Fog):  56 cycles

-- test 1       return values Lingo, jj, Agner: 191, 191, 191
crt_strlen    :       122 cycles
strlen32      :       26 cycles
strlen64LingoA :      33 cycles
_strlen (Agner Fog):  40 cycles

-- test 4       return values Lingo, jj, Agner: 191, 191, 191
crt_strlen    :       122 cycles
strlen32      :       26 cycles
strlen64LingoA :      23 cycles
_strlen (Agner Fog):  46 cycles

-- test 7       return values Lingo, jj, Agner: 191, 191, 191
crt_strlen    :       119 cycles
strlen32      :       26 cycles
strlen64LingoA :      23 cycles
_strlen (Agner Fog):  40 cycles

Press any key to exit...


Regards herge
Title: Re: szLen optimize...
Post by: lingo on March 10, 2009, 11:56:36 AM
On my old lapi with Vista64 Ultimate SP1:  :wink
AMD Turion(tm) 64 Mobile Technology ML-30 (SSE3)
strlen32 retval: 5, 10, 15, 20, 25, 30, 35, 40, 45
strlen32 retval: 5, 10, 15, 20, 25, 30, 35, 40, 45
lstrlenA return value     : 1024
strlen64Lingo return value: 1024
strlen32 return value:      1024
StrSizeA(SSE2) value      : 1024
_strlen return value      : 1024

strlen64A return value      : 1024

align 1k
lstrlenA return value: 1024
strlen32 return value: 1024
strlen32      :       285 cycles
strlen64Lingo :       236 cycles
strlen64LingoA:       236 cycles
_strlen (Agner Fog):  942 cycles

align 0
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       109 cycles
strlen64Lingo :       53 cycles
strlen64LingoA:       54 cycles
_strlen (Agner Fog):  223 cycles

align 1
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       74 cycles
strlen64Lingo :       not possible
strlen64LingoA:       64 cycles
_strlen (Agner Fog):  197 cycles

align 4
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       74 cycles
strlen64Lingo :       not possible
strlen64LingoA:       64 cycles
_strlen (Agner Fog):  198 cycles

align 7
lstrlenA return value: 191
strlen32 return value: 191
strlen32      :       74 cycles
strlen64Lingo :       not possible
strlen64LingoA:       64 cycles
_strlen (Agner Fog):  197 cycles

Press any key to exit...


AMD Turion(tm) 64 Mobile Technology ML-30 (SSE3)
codesizes: strlen32=92, strlen64A=117, _strlen=66

-- test 16k           return values Lingo, jj, Agner: 16384, 16384, 16384
crt_strlen    :       16537 cycles
strlen32      :       3182 cycles
strlen64LingoA :      3126 cycles
_strlen (Agner Fog):  14014 cycles

-- test 4k            return values Lingo, jj, Agner: 4096, 4096, 4096
crt_strlen    :       4132 cycles
strlen32      :       867 cycles
strlen64LingoA :      815 cycles
_strlen (Agner Fog):  3537 cycles

-- test 1k            return values Lingo, jj, Agner: 1024, 1024, 1024
crt_strlen    :       1051 cycles
strlen32      :       288 cycles
strlen64LingoA :      236 cycles
_strlen (Agner Fog):  939 cycles

-- test 0             return values Lingo, jj, Agner: 191, 191, 191
crt_strlen    :       222 cycles
strlen32      :       113 cycles
strlen64LingoA :      54 cycles
_strlen (Agner Fog):  225 cycles

-- test 1             return values Lingo, jj, Agner: 191, 191, 191
crt_strlen    :       217 cycles
strlen32      :       76 cycles
strlen64LingoA :      65 cycles
_strlen (Agner Fog):  197 cycles

-- test 4             return values Lingo, jj, Agner: 191, 191, 191
crt_strlen    :       214 cycles
strlen32      :       76 cycles
strlen64LingoA :      63 cycles
_strlen (Agner Fog):  197 cycles

-- test 7             return values Lingo, jj, Agner: 191, 191, 191
crt_strlen    :       211 cycles
strlen32      :       76 cycles
strlen64LingoA :      64 cycles
_strlen (Agner Fog):  198 cycles

Press any key to exit...


Title: Re: szLen optimize...
Post by: jj2007 on March 10, 2009, 12:53:23 PM
Thanks, very interesting. It seems the two algos are roughly equivalent, with Lingo's a bit stronger on AMD and Core2 (Herge) and mine stronger on P4's and (marginally) on Celeron M. In any case, Hutch faces a difficult choice for the next Masm32 version:

-- test 1k --
Masm32 lib szLen    : 2215 cycles
crt_strlen    :       1042 cycles
strlen32      :       354 cycles
strlen64LingoA :      354 cycles
_strlen (Agner Fog):  648 cycles

-- test aligned 1, 191 bytes --
Masm32 lib szLen :    515 cycles
crt_strlen    :       262 cycles
strlen32      :       73 cycles
strlen64LingoA :      105 cycles
_strlen (Agner Fog):  111 cycles


A factor 6-7 on one of the most popular functions is not so bad :green2
Title: Re: szLen optimize...
Post by: lingo on March 10, 2009, 02:03:40 PM
I can't understand what happen with your PC or with you... :lol
New nonsense about the same program and test:
'strlen64LingoA :      105 cycles !!!'
Pls, take a look of your previous messages about the same test and program..
Where is the true?

Title: Re: szLen optimize...
Post by: jj2007 on March 10, 2009, 02:34:43 PM
Quote from: lingo on March 10, 2009, 02:03:40 PM
I can't understand what happen with your PC or with you... :lol
New nonsense about the same program and test:
'strlen64LingoA :      105 cycles !!!'
Pls, take a look of your previous messages about the same test and program..
Where is the true?

The truth is that timings tend to be not 100% accurate, and that I have a P4 in office, and a Celeron M at home. Your algo is marginally slower than mine on a P4 for short unaligned strings... no need to panic, dear friend :thumbu

Here are some more timings with a higher LOOP_COUNT:
-- test 0             0=perfectly aligned on 16-byte boundary
crt_strlen    :       243 cycles
strlen32      :       74 cycles
strlen64LingoA :      71 cycles
_strlen (Agner Fog):  105 cycles

-- test 1             1=misaligned 1 byte
crt_strlen    :       247 cycles
strlen32      :       75 cycles
strlen64LingoA :      90 cycles
_strlen (Agner Fog):  111 cycles

-- test 4             return values
crt_strlen    :       240 cycles
strlen32      :       76 cycles
strlen64LingoA :      81 cycles
_strlen (Agner Fog):  130 cycles

-- test 7             return values
crt_strlen    :       243 cycles
strlen32      :       74 cycles
strlen64LingoA :      83 cycles
_strlen (Agner Fog):  114 cycles


Your algo seems faster on AMD and Core Duo. In any case, you should be proud of having found an algo that is 5 times as fast as the fastest M$ algo, and (for longer strings) twice as fast as the latest Agner Fog algo. My own one is a minor adaption of yours, so the credits go to you anyway :U
Title: Re: szLen optimize...
Post by: hutch-- on March 10, 2009, 02:38:50 PM
 :bg

> In any case, Hutch faces a difficult choice for the next Masm32 version:

Yeah ?

No I don't, I have been watching musical chairs on string length algos for at least the last 10 years, in about 99.9999999999999999999999999% of cases the slow byte scanner is more than fast enough and in the .0 --- 0001% of other cases Agner Fog's algo is even more than fast enough. Speed is great, but it must also deliver useful gains, and string length algos are rarely ever a big deal.

On a native 64 bit box it should be a toss between native 64 bit and emulated 128 bit SSE3/4/? on paragraph alignment, shame most string data is aligned to 1.
Title: Re: szLen optimize...
Post by: jj2007 on March 10, 2009, 02:43:45 PM
Quote from: hutch-- on March 10, 2009, 02:38:50 PM
:bg
... paragraph alignment, shame most string data is aligned to 1.

Shame you don't read the posts in your own forum. Both 'winner' algos have no problem with misalignment.
:(
Title: Re: szLen optimize...
Post by: jj2007 on March 10, 2009, 11:33:31 PM
I was not satisfied with the performance of my algo for short strings, so I fumbled together a variant, strlen32b. It is now almost on par with Lingo's algo for short strings, and about 2% faster for very long strings. Of course, AMD, Core2 and P4 might look different again - the Celeron M is "Core" but not "Core Duo" ::)

Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (SSE3)
codesizes: strlen32=92, strlen32b=114, strlen64A=117, _strlen=66

ERROR in strlen64A at ct 16: 14 bytes instead of 15
-- test 16k           return values Lingo, jj, Agner: 16384, 16384, 16384
strlen32      :       2881 cycles
strlen32b     :       2936 cycles
strlen64LingoA :      3024 cycles
_strlen (Agner Fog):  4250 cycles

-- test 4k            return values Lingo, jj, Agner: 4096, 4096, 4096
crt_strlen    :       3800 cycles
strlen32      :       743 cycles
strlen32b     :       744 cycles
strlen64LingoA :      774 cycles
_strlen (Agner Fog):  1103 cycles

-- test 0             return values Lingo, jj, Agner: 95, 95, 95
crt_strlen    :       101 cycles
strlen32      :       31 cycles
strlen32b     :       29 cycles
strlen64LingoA :      25 cycles
_strlen (Agner Fog):  30 cycles

-- test 1             return values Lingo, jj, Agner: 95, 95, 95
crt_strlen    :       111 cycles
strlen32      :       31 cycles
strlen32b     :       35 cycles
strlen64LingoA :      37 cycles
_strlen (Agner Fog):  34 cycles

-- test 3             return values Lingo, jj, Agner: 11, 14, 14
crt_strlen    :       23 cycles
strlen32      :       20 cycles
strlen32b     :       16 cycles
strlen64LingoA :      14 cycles
_strlen (Agner Fog):  14 cycles

-- test 15            return values Lingo, jj, Agner: -1, 14, 14
crt_strlen    :       23 cycles
strlen32      :       20 cycles
strlen32b     :       16 cycles
strlen64LingoA :      14 cycles
_strlen (Agner Fog):  14 cycles

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: PBrennick on March 11, 2009, 12:48:59 AM
JJ,

hmmm? No kidding? Misalignment of a 'one byte aligned' string... I do not know whether to laugh or cry. I do not think you 'got' the point Hutch was trying to make.

Paul
Title: Re: szLen optimize...
Post by: NightWare on March 11, 2009, 01:25:59 AM
hi lingo,
por xmm1, xmm0
pxor xmm2, xmm2 ; why ? why do you want to use xmm2 ?
pmovmskb edx, xmm1
pxor xmm1, xmm1 ; why ? if the mask in edx = 0 then both xmm0 and xmm1 = 0
test edx, edx
>>jnz<< Ex_1


hi jj, same thing for you in your algo, why do you want to clean the simd registers when it's not needed ?
plus, if you really want to take care of the speed for small strings, you should avoid the jump (it's a potential misprediction).
Title: Re: szLen optimize...
Post by: herge on March 11, 2009, 02:35:32 AM
 Hi jj2007:

Here is my latest results:


Intel(R) Core(TM)2 Duo CPU     E4600  @ 2.40GHz (SSE4)
codesizes: strlen32=92, strlen32b=114, strlen64A=117, _strlen=66

ERROR in TestAlgo at ct 16: 14 bytes instead of 15
-- test 16k       return values Lingo, jj, Agner: 16384, 16384, 16384
strlen32      :       1503 cycles
strlen32b     :       1512 cycles
strlen64LingoA :      1138 cycles
_strlen (Agner Fog):  2814 cycles

-- test 4k       return values Lingo, jj, Agner: 4096, 4096, 4096
crt_strlen    :       2425 cycles
strlen32      :       410 cycles
strlen32b     :       403 cycles
strlen64LingoA :      325 cycles
_strlen (Agner Fog):  716 cycles

-- test 0       return values Lingo, jj, Agner: 95, 95, 95
crt_strlen    :       61 cycles
strlen32      :       16 cycles
strlen32b     :       15 cycles
strlen64LingoA :      14 cycles
_strlen (Agner Fog):  19 cycles

-- test 1       return values Lingo, jj, Agner: 95, 95, 95
crt_strlen    :       61 cycles
strlen32      :       15 cycles
strlen32b     :       18 cycles
strlen64LingoA :      26 cycles
_strlen (Agner Fog):  20 cycles

-- test 3       return values Lingo, jj, Agner: 11, 14, 14
crt_strlen    :       14 cycles
strlen32      :       10 cycles
strlen32b     :       8 cycles
strlen64LingoA :      6 cycles
_strlen (Agner Fog):  7 cycles

-- test 15       return values Lingo, jj, Agner: -1, 14, 14
crt_strlen    :       14 cycles
strlen32      :       10 cycles
strlen32b     :       8 cycles
strlen64LingoA :      6 cycles
_strlen (Agner Fog):  7 cycles

Press any key to exit...



Regards herge
Title: Re: szLen optimize...
Post by: jj2007 on March 11, 2009, 06:54:55 AM
Quote from: PBrennick on March 11, 2009, 12:48:59 AM
JJ,

hmmm? No kidding? Misalignment of a 'one byte aligned' string... I do not know whether to laugh or cry. I do not think you 'got' the point Hutch was trying to make.

Paul


Explain, please. I don't get your point.
Title: Re: szLen optimize...
Post by: sinsi on March 11, 2009, 07:03:23 AM
Well, jj, put it this way...

ALIGN 1

Unless we can ALIGN 0.5, byte align is...everything...'get' it?
Title: Re: szLen optimize...
Post by: jj2007 on March 11, 2009, 07:05:49 AM
Quote from: NightWare on March 11, 2009, 01:25:59 AM
hi jj, same thing for you in your algo, why do you want to clean the simd registers when it's not needed ?
plus, if you really want to take care of the speed for small strings, you should avoid the jump (it's a potential misprediction).

Hi NightWare, thanks a lot for reading this thoroughly :thumbu
I have wondered myself whether clearing is not needed in some places, but was not sure. In fact, I took one out tonight, see below, ; pxor xmm0, xmm0. Could you please indicate where you consider it not needed?

As to the nullptr jump, I had to introduce it because it failed for null strings (and I admit I was too tired to analyse the reason; plus, my Olly version here does not display xmm registers :dazzled:).

;-----------------------------------------------------------------------
; strlen32b - length of a zero-terminated string, SSE2, 32 bytes/iteration
; C equivalent: size_t __stdcall strlen32b(const char *src)   (ret 4)
; In:      [esp+4] = src, pointer to zero-terminated string
; Out:     eax = length in bytes
; Clobbers: ecx, edx, xmm0, xmm1, flags
; NOTE(review): on the empty-string exit (nullptr) only al has been
;   written, so the upper 24 bits of eax keep the caller's old value -
;   confirm all callers treat the result as a full 32-bit length.
; NOTE(review): later posts in this thread report a failure case when
;   0/FF (255) bytes surround a misaligned string (tested and confirmed
;   by the author) - the strlen32s variant was written to fix this.
;-----------------------------------------------------------------------
strlen32b proc src:DWORD ; jj 9 March 2009, 92 (down from 103) bytes; 0.176 cycles/byte at 16k
mov ecx, [esp+4] ; get pointer to string -- alignment scheme taken from Agner Fog ----
mov al, [ecx] ; early-out: is the first byte already the terminator?
test al, al
je nullptr ; empty string (only al is zeroed here - see header note)
pxor xmm0, xmm0 ; xmm0 = 0, comparison reference
mov eax, ecx ; copy pointer
pxor xmm1, xmm1 ; xmm1 = 0, second comparison reference
and eax, -16 ; round pointer down to a 16-byte boundary
and ecx, 15 ; cl = misalignment within that paragraph (0..15)
pcmpeqb xmm1, [eax] ; compare 16 bytes from the preceding boundary with zero
; lea eax, [eax+16] ; (alternative form kept for timing comparison)
add eax, 16
pcmpeqb xmm0, [eax] ; second aligned 16-byte chunk
por xmm1, xmm0 ; merge the two zero-byte masks
pmovmskb edx, xmm1 ; one mask bit per byte (bit set = byte was 0)
shr edx, cl ; shift out false bits from bytes before the string ** compliments to Agner, **
shl edx, cl ; shift back so bit index = offset from the boundary ** this is called genius ;-) **
test edx, edx
jnz fdr1 ; terminator found within the first 32 aligned bytes
add eax, 16 ; advance past the bytes already examined above (lea exactly same cycles)
; pxor xmm0, xmm0 (must be 0) ; xmm0 needs no reset: edx was 0, so no match bytes are set
pxor xmm1, xmm1 ; reset: xmm1 may hold false FFs from bytes before the string (align 16 here costs ~1 cycle extra)

@@: pcmpeqb xmm0, [eax] ; ------ inner loop adapted from Lingo ------
pcmpeqb xmm1, [eax+16] ; scan two 16-byte chunks per iteration
por xmm1, xmm0 ; nonzero if either chunk contained the terminator
; add eax, 32 ; is marginally slower than lea
lea eax, [eax+32] ; advance scan pointer (lea leaves flags untouched)
pmovmskb edx, xmm1
test edx, edx
jz @B
sub eax, 32 ; undo the final advance of the loop

fdr1: pmovmskb ecx, xmm0 ; mask for the lower 16-byte chunk
shl edx, 16 ; combined mask into bits 16..31 (bswap works, too, but one cycle slower)
or edx, ecx ; 32-bit mask covering both chunks
bsf edx, edx ; index of the first terminator bit
add eax, edx ; pointer + scan index = address of the terminator
sub eax, [esp+4] ; length = terminator address - start address
nullptr: ret 4
strlen32b endp
Title: Re: szLen optimize...
Post by: jj2007 on March 11, 2009, 07:13:19 AM
Quote from: sinsi on March 11, 2009, 07:03:23 AM
Well, jj, put it this way...

ALIGN 1

Unless we can ALIGN 0.5, byte align is...everything...'get' it?

Yeah, of course :boohoo: I was desperately trying to find align 1 in the code I posted, but now I realise Paul means Hutch's statement. But again, this is deliberately trying to misunderstand him: What he meant (apparently - I also risk to misinterpret him) is that strings are in general not aligned on a paragraph or even dword border, and that algos for aligned strings are therefore pretty useless. Lingo's first version had that problem, but he fixed it (but still has a minor problem for very short strings). So both "fast" algos are general purpose, if you assume that the user has a modern CPU
Title: Re: szLen optimize...
Post by: sinsi on March 11, 2009, 07:26:47 AM
I think that strings > MAX_PATH are rare - most strings (that you need to get the length of) are short (filenames etc.) so there is no need for a 'long' string scanner.
Mind you, for a long string scanner, you have possibly opened up a small niche, and the replies re optimisation can often apply to other bits of code.

Thanks to you, I've been looking at SSEx instructions and broadening my asm horizons (at least I think 'thanks, but' since they're a bit hard atm  :bdg)
Title: Re: szLen optimize...
Post by: herge on March 11, 2009, 07:29:29 AM
 Hi jj2007:

Use windbg from Microsoft. It does display the xmm registers.

Regards herge
Title: Re: szLen optimize...
Post by: jj2007 on March 11, 2009, 08:13:58 AM
Quote from: sinsi on March 11, 2009, 07:26:47 AM
I think that strings > MAX_PATH are rare - most strings (that you need to get the length of) are short (filenames etc.) so there is no need for a 'long' string scanner.
Mind you, for a long string scanner, you have possibly opened up a small niche, and the replies re optimisation can often apply to other bits of code.

Thanks to you, I've been looking at SSEx instructions and broadening my asm horizons (at least I think 'thanks, but' since they're a bit hard atm  :bdg)

Long strings are rare, that's correct. But the new algos are a factor 3 faster than len(My$):

-- test 0             return values Lingo, jj, Agner: 62, 62, 62
Masm32 lib szLen :    88 cycles
crt_strlen    :       81 cycles
strlen32b     :       23 cycles
strlen64LingoA :      24 cycles
_strlen (Agner Fog):  26 cycles

-- test 1             return values Lingo, jj, Agner: 62, 62, 62
Masm32 lib szLen :    89 cycles
crt_strlen    :       81 cycles
strlen32b     :       23 cycles
strlen64LingoA :      23 cycles
_strlen (Agner Fog):  22 cycles


D:\masm32\examples\exampl10\timer_demos\unroll\unroll_test.exe has 62 bytes ;-)
Title: Re: szLen optimize...
Post by: hutch-- on March 11, 2009, 10:21:19 AM
 :bg

> Shame you don't read the posts in your own forum. Both 'winner' algos have no problem with misalignment.

Years of reading posts leave you with a reasonably good idea of the value of an "atom cracking" string length algo. Let me think, "As useful as a hip pocket in a singlet", what about the world's fastest "MessageBoxA" algo ? How about a hobbling horse in the Kentucky Derby ?  :P
Title: Re: szLen optimize...
Post by: jj2007 on March 11, 2009, 10:39:44 AM
Quote from: herge on March 11, 2009, 07:29:29 AM
Hi jj2007:

Use windbg from Microsoft. It does display the xmm registers.

Regards herge

Thanks, herge. I will look into it. Olly has the same capacity, but it is somewhat hidden in the options.
Title: Re: szLen optimize...
Post by: herge on March 11, 2009, 12:06:37 PM

Hi jj2007:

See attachment a picture of windbg in action.

Regards herge

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: PBrennick on March 11, 2009, 12:14:23 PM
Hutch,

... like a screen door on a submarine.

Paul
Title: Re: szLen optimize...
Post by: lingo on March 11, 2009, 07:12:51 PM
"why do you want to use xmm2 ?"
Thanks NightWare, it was from other similar algos...
IMO we may need several strlen algos to use in the application.
For example: strlenA for bigger strings and strlenB  for short strings...
Intel(R) Core(TM)2 Duo CPU     E8500  @ 3.16GHz (SSE4)
codesizes: strlen32=92, strlen32b=114, strlen64A=112, strlen64B=87, _strlen=66

-- test 16k           return values LingoA,LingoB, jj, Agner: 16384, 16384, 163
4, 16384
strlen32      :       1577 cycles
strlen32b     :       1585 cycles
strlen64LingoA :      1553 cycles
strlen64LingoB :      1604 cycles
_strlen (Agner Fog):  2793 cycles

-- test 4k            return values LingoA,LingoB, jj, Agner: 4096, 4096, 4096, 4096
crt_strlen    :       2727 cycles
strlen32      :       420 cycles
strlen32b     :       421 cycles
strlen64LingoA :      405 cycles
strlen64LingoB :      412 cycles
_strlen (Agner Fog):  716 cycles

-- test 0             return values LingoA,LingoB, jj, Agner: 95, 95, 95, 95
crt_strlen    :       77 cycles
strlen32      :       17 cycles
strlen32b     :       15 cycles
strlen64LingoA :      11 cycles
strlen64LingoB :      13 cycles
_strlen (Agner Fog):  19 cycles

-- test 1             return values LingoA,LingoB, jj, Agner: 95, 95, 95, 95
crt_strlen    :       79 cycles
strlen32      :       17 cycles
strlen32b     :       19 cycles
strlen64LingoA :      28 cycles
strlen64LingoB :      25 cycles
_strlen (Agner Fog):  20 cycles

-- test 3             return values LingoA,LingoB, jj, Agner: 14, 14, 14, 14
crt_strlen    :       17 cycles
strlen32      :       10 cycles
strlen32b     :       8 cycles
strlen64LingoA :      6 cycles
strlen64LingoB :      4 cycles
_strlen (Agner Fog):  7 cycles

-- test 15            return values LingoA,LingoB, jj, Agner: 14, 14, 14, 14
crt_strlen    :       16 cycles
strlen32      :       10 cycles
strlen32b     :       8 cycles
strlen64LingoA :      6 cycles
strlen64LingoB :      3 cycles
_strlen (Agner Fog):  7 cycles

Press any key to exit...





[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: askm on March 11, 2009, 07:16:48 PM
I imagine this because there are lots of timings posted on this and other topics.

Wouldnt it be real nice to be able to write

code and it is profiled as you're writing it... you'd get timings instantly!

Timings that would be identical to what you'd get as you do timings now, manually.

Or even written and profiled simultaneously

as if you're on a different processor altogether. Clusters ? Parallel ?

Code would be profiled by speed, security, or memory...

just daydreaming. I know this kind of editor

would have to be partially if not fully written

in assembler, and not in my lifetime ?  Open source ?

IT PROBABLY IS NOT AS DIFFICULT AS IT SEEMS, ON SOME LEVELS.

I know you think I am going toward 'the super optimizing compiler' direction.

More like 'the supervising optimizing compiler'.
Title: Re: szLen optimize...
Post by: NightWare on March 11, 2009, 10:43:53 PM
Quote from: jj2007 on March 11, 2009, 07:05:49 AM
I have wondered myself whether clearing is not needed in some places, but was not sure. In fact, I took one out tonight, see below, ; pxor xmm0, xmm0. Could you please indicate where you consider it not needed?
pxor xmm1,xmm1 just after is also useless coz you have jumped to fdr1 if it's not equal to 0.  :wink
Title: Re: szLen optimize...
Post by: herge on March 11, 2009, 10:52:31 PM
 Hi lingo:

Results from my computer.

Intel(R) Core(TM)2 Duo CPU     E4600  @ 2.40GHz (SSE4)
codesizes: strlen32=92, strlen32b=114, strlen64A=112, strlen64B=87, _strlen=66

-- test 16k       return values LingoA,LingoB, jj, Agner: 16384, 16384, 16384, 16384
strlen32      :       1491 cycles
strlen32b     :       1521 cycles
strlen64LingoA :      1140 cycles
strlen64LingoB :      1297 cycles
_strlen (Agner Fog):  2862 cycles

-- test 4k       return values LingoA,LingoB, jj, Agner: 4096, 4096, 4096, 4096
crt_strlen    :       2443 cycles
strlen32      :       401 cycles
strlen32b     :       410 cycles
strlen64LingoA :      353 cycles
strlen64LingoB :      325 cycles
_strlen (Agner Fog):  730 cycles

-- test 0       return values LingoA,LingoB, jj, Agner: 95, 95, 95, 95
crt_strlen    :       66 cycles
strlen32      :       17 cycles
strlen32b     :       14 cycles
strlen64LingoA :      12 cycles
strlen64LingoB :      14 cycles
_strlen (Agner Fog):  23 cycles

-- test 1       return values LingoA,LingoB, jj, Agner: 95, 95, 95, 95
crt_strlen    :       62 cycles
strlen32      :       18 cycles
strlen32b     :       18 cycles
strlen64LingoA :      31 cycles
strlen64LingoB :      25 cycles
_strlen (Agner Fog):  21 cycles

-- test 3       return values LingoA,LingoB, jj, Agner: 14, 14, 14, 14
crt_strlen    :       15 cycles
strlen32      :       11 cycles
strlen32b     :       10 cycles
strlen64LingoA :      6 cycles
strlen64LingoB :      2 cycles
_strlen (Agner Fog):  7 cycles

-- test 15       return values LingoA,LingoB, jj, Agner: 14, 14, 14, 14
crt_strlen    :       14 cycles
strlen32      :       10 cycles
strlen32b     :       8 cycles
strlen64LingoA :      6 cycles
strlen64LingoB :      3 cycles
_strlen (Agner Fog):  7 cycles

Press any key to exit...


Regards herge
Title: Re: szLen optimize...
Post by: jj2007 on March 11, 2009, 11:09:04 PM
Quote from: NightWare on March 11, 2009, 10:43:53 PM
Quote from: jj2007 on March 11, 2009, 07:05:49 AM
I have wondered myself whether clearing is not needed in some places, but was not sure. In fact, I took one out tonight, see below, ; pxor xmm0, xmm0. Could you please indicate where you consider it not needed?
pxor xmm1,xmm1 just after is also useless coz you have jumped to fdr1 if it's not equal to 0.  :wink

I thought so, too. But the shr edx, cl (shift out false bits) trick has one nasty side effect: You might have an FF somewhere in xmm1 because there was a zero byte before your misaligned string:
      align 16
      db 15 dup (0)
      szTest_Fail db "my other brother darryl my other brother darryl"
      db 255, 255, 255, 0

Now one might argue that no sane person has a string with FF/255 bytes. But it fails exactly for this case (I tested it) :wink
Title: Re: szLen optimize...
Post by: NightWare on March 11, 2009, 11:49:15 PM
Quote from: jj2007 on March 11, 2009, 11:09:04 PM
I thought so, too. But the shr edx, cl (shift out false bits) trick has one nasty side effect: You might have an FF somewhere in xmm1 because there was a zero byte before your misaligned string:

hmm, for example you could use (in your strlen32 algo) :

pxor xmm0,xmm0
movdqu xmm1,[eax]
pcmpeqb xmm1,xmm0 ; <- here you will have the same result as pxor xmm1,xmm1 if there is no 0
and eax,0FFFFFFF0h
pmovmskb edx,xmm1
...


and no need for shr/shl edx,cl
Title: Re: szLen optimize...
Post by: jj2007 on March 12, 2009, 12:49:21 AM
Quote from: NightWare on March 11, 2009, 11:49:15 PM
Quote from: jj2007 on March 11, 2009, 11:09:04 PM
I thought so, too. But the shr edx, cl (shift out false bits) trick has one nasty side effect: You might have an FF somewhere in xmm1 because there was a zero byte before your misaligned string:

hmm, for example you could use (in your strlen32 algo) :

pxor xmm0,xmm0
movdqu xmm1,[eax]
pcmpeqb xmm1,xmm0 ; <- here you will have the same result as pxor xmm1,xmm1 if there is no 0
and eax,0FFFFFFF0h
pmovmskb edx,xmm1
...


and no need for shr/shl edx,cl


Thanks, NightWare. In the meantime, I had found a different way to overcome this, a repeated pcmpeqb xmm0, [eax]:

;-----------------------------------------------------------------------
; strlen32s - length of a zero-terminated string, SSE2, unaligned head
; C equivalent: size_t __stdcall strlen32s(const char *src)   (ret 4)
; In:      [esp+4] = src, pointer to zero-terminated string
; Out:     eax = length in bytes
; Clobbers: ecx, edx, xmm0, xmm1, flags (this version does NOT preserve
;   ecx/edx; the later revision pushes/pops them)
; NOTE(review): movups reads 16 bytes starting at src regardless of the
;   string length - presumably safe in practice, but could touch an
;   unmapped page if the string ends within 15 bytes of one; confirm.
;-----------------------------------------------------------------------
strlen32s proc src:DWORD ; jj 12 March 2009, 89 bytes; 0.176 cycles/byte at 16k
mov ecx, [esp+4] ; get pointer to string: -- this part taken from Agner Fog ----
pxor xmm0, xmm0 ; zero for comparison
movups xmm1, [ecx] ; move 16 bytes into xmm1, unaligned (adapted from Lingo)
pcmpeqb xmm1, xmm0 ; set bytes in xmm1 to FF if nullbytes found in xmm1
pmovmskb edx, xmm1 ; one mask bit per byte
bsf eax, edx ; bit scan forward; ZF=0 iff a bit was set
jne Le16 ; terminator in first 16 bytes: eax = index = length
mov eax, ecx ; copy pointer
and eax, -16 ; align pointer by 16
pxor xmm1, xmm1 ; zero for comparison
and ecx, 15 ; lower 4 bits indicate misalignment
je @F ; already aligned: skip the fixup (jumping is a few cycles faster)
pcmpeqb xmm0, [eax] ; force FF's into false positives (the SSE2 equivalent to Agner's shr/shl trick)

@@: pcmpeqb xmm0, [eax] ; ------ this part taken from Lingo, with adaptions ------
pcmpeqb xmm1, [eax+16] ; scan two 16-byte chunks per iteration, 16-byte aligned
por xmm1, xmm0 ; nonzero if either chunk contained the terminator
lea eax, [eax+32] ; len counter (moving up lea or add costs 3 cycles for the 191 byte string)
pmovmskb edx, xmm1
test edx, edx
jz @B

pmovmskb ecx, xmm0 ; mask for the lower 16-byte chunk
shl edx, 16 ; combined mask into bits 16..31 (bswap works, too, but one cycle slower)
or edx, ecx ; 32-bit mask covering both chunks
bsf edx, edx ; index of the first terminator bit
lea eax, [eax+edx-32] ; add scan index, subtract the loop's final +32
sub eax, [esp+4] ; length = terminator address - start address
Le16: ret 4
strlen32s endp


New Timings:
Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (SSE3)
codesizes: strlen32s=89, strlen64B=87, _strlen=66

-- test 16k           return values Lingo, jj, Agner: 16384, 16384, 16384
crt_strlen    :       15288 cycles
strlen32s     :       2890 cycles
strlen64LingoB :      2904 cycles
_strlen (Agner Fog):  4253 cycles

-- test 1k            return values Lingo, jj, Agner: 1024, 1024, 1024
crt_strlen    :       977 cycles
strlen32s     :       199 cycles
strlen64LingoB :      193 cycles
_strlen (Agner Fog):  272 cycles

-- test 0             return values Lingo, jj, Agner: 95, 95, 95
crt_strlen    :       101 cycles
strlen32s     :       29 cycles
strlen64LingoB :      28 cycles
_strlen (Agner Fog):  30 cycles

-- test 1             return values Lingo, jj, Agner: 95, 95, 95
crt_strlen    :       112 cycles
strlen32s     :       40 cycles
strlen64LingoB :      33 cycles
_strlen (Agner Fog):  34 cycles

-- test 3             return values Lingo, jj, Agner: 15, 15, 15
crt_strlen    :       25 cycles
strlen32s     :       5 cycles
strlen64LingoB :      6 cycles
_strlen (Agner Fog):  14 cycles

-- test 15            return values Lingo, jj, Agner: 15, 15, 15
crt_strlen    :       24 cycles
strlen32s     :       5 cycles
strlen64LingoB :      6 cycles
_strlen (Agner Fog):  14 cycles


The new version includes also a correctness test for all algos. My new favourite is strlen32s: For long strings, it is 14 cycles faster than No. 2, strlen64LingoB, while for very short strings it is a whopping 16% faster than the latter. Lingo, you have a challenge!

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: herge on March 12, 2009, 01:29:15 AM
 Hi jj2007:

Even More Results from herge.


Intel(R) Core(TM)2 Duo CPU     E4600  @ 2.40GHz (SSE4)
codesizes: strlen32s=89, strlen64B=87, _strlen=66

-- test 16k       return values Lingo, jj, Agner: 16384, 16384, 16384
crt_strlen    :       9628 cycles
strlen32s     :       1489 cycles
strlen64LingoB :      1185 cycles
_strlen (Agner Fog):  2854 cycles

-- test 1k       return values Lingo, jj, Agner: 1024, 1024, 1024
crt_strlen    :       649 cycles
strlen32s     :       101 cycles
strlen64LingoB :      99 cycles
_strlen (Agner Fog):  193 cycles

-- test 0       return values Lingo, jj, Agner: 95, 95, 95
crt_strlen    :       64 cycles
strlen32s     :       15 cycles
strlen64LingoB :      14 cycles
_strlen (Agner Fog):  19 cycles

-- test 1       return values Lingo, jj, Agner: 95, 95, 95
crt_strlen    :       91 cycles
strlen32s     :       31 cycles
strlen64LingoB :      25 cycles
_strlen (Agner Fog):  20 cycles

-- test 3       return values Lingo, jj, Agner: 15, 15, 15
crt_strlen    :       17 cycles
strlen32s     :       3 cycles
strlen64LingoB :      3 cycles
_strlen (Agner Fog):  7 cycles

-- test 15       return values Lingo, jj, Agner: 15, 15, 15
crt_strlen    :       15 cycles
strlen32s     :       2 cycles
strlen64LingoB :      3 cycles
_strlen (Agner Fog):  7 cycles

Press any key to exit...


Regards herge

Title: Re: szLen optimize...
Post by: NightWare on March 12, 2009, 02:37:16 AM
?
   mov eax, ecx         ; copy pointer why ?
   and eax, -16         ; align pointer by 16
   pxor xmm1, xmm1         ; zero for comparison why ?
you don't need the following lines anymore... whith movups the possible 0 before can't exist...
   and ecx, 15         ; lower 4 bits indicate misalignment
   je @F            ; jumping is a few cycles faster
   pcmpeqb xmm0, [eax]      ; force FF's into false positives (the SSE2 equivalent to Agner's shr/shl trick)

you just need to modify the end of the algo to obtain the correct result...

EDIT :
Quote from: jj2007 on March 12, 2009, 12:49:21 AM
for very short strings it is a whopping 16% faster than the latter. Lingo, you have a challenge!
:bg, but i remember you there is a jump, so a (certainly) branch misprediction, and
QuoteThe cost of a branch misprediction ranges from 12 to more than 50 clock cycles, depending on the length of the pipeline and other details of the microarchitecture.
(taken from Agner Fog's last optimizations pdf file). so 50 cycles... it could be 1000% slower...  :bg
Title: Re: szLen optimize...
Post by: lingo on March 12, 2009, 03:44:43 AM
jj,
Let's see what you "have":  :wink
1. strlen32 - it is 1st half of code from A.Fog end the rest from Lingo - just the name strlen32 is from you
Proof:
"Now I took the best of two worlds, i.e. Lingo's speed and Agner's brilliant alignment scheme, and threw them together. The result (shown as strlen32) is, ehm, how to put it: just about good enough for my own private library: "

2. strlens32s - it is your top of the ice cream... :lol
It is code without nothing from A.Fog and 100 % from Lingo's strlenLingoB code...What happen with "Agner's brilliant alignment scheme"?  :lol
Of course the new name- strlens32s and the test program is from you again.
Proof:"My new favorite is strlen32s: bla,blah,bla..."  and Lingo's code insight  :lol

3. "Lingo, you have a challenge!"

Actually you don't have  your own code or ideas to "compete" here and I am not interested to fight with myself ...so there is no challenge for me to try to continue...
against my own code and ideas.
Proof:  I have new faster strlen algo based on the new Nehalem string instructions but it is other story and challenge.
Hence, don't hurry up and read and think about NightWare notes carefully because I don't want to publish it yet...   :lol

Title: Re: szLen optimize...
Post by: jj2007 on March 12, 2009, 08:07:42 AM
Quote from: lingo on March 12, 2009, 03:44:43 AM
Proof:
"Now I took the best of two worlds, i.e. Lingo's speed and Agner's brilliant alignment scheme, and threw them together. The result (shown as strlen32) is, ehm, how to put it: just about good enough for my own private library: "


Lingo, you don't have to prove something that is openly stated. This code has evolved over time, and you, Nightware and myself, we have produced the two fastest algos ever, despite of certain trolls pretending that a fast len algo is a waste of time (but argue endlessly elsewhere about bad practices wasting cycles and damaging registers etc.). We are here because assembler can produce lean and mean code, and because it's fun testing the limits. You are excellent in testing these limits, and therefore your name does appear twice in the 30 lines of my current favourite called strlen32s. And if I find the time today, Nightware's corrections will also be tested, and his name will be added somewhere. Take it easy :U
Title: Re: szLen optimize...
Post by: jj2007 on March 12, 2009, 09:54:45 AM
Quote from: NightWare on March 12, 2009, 02:37:16 AM
Quote from: jj2007 on March 12, 2009, 12:49:21 AM
for very short strings it is a whopping 16% faster than the latter. Lingo, you have a challenge!
:bg, but i remember you there is a jump, so a (certainly) branch misprediction, and
QuoteThe cost of a branch misprediction ranges from 12 to more than 50 clock cycles, depending on the length of the pipeline and other details of the microarchitecture.
(taken from Agner Fog's last optimizations pdf file). so 50 cycles... it could be 1000% slower...  :bg

:bg Thanks for your hints, it's now shorter and a bit faster. But Lingo's algo is equally good. New testbed attached below.

;-----------------------------------------------------------------------
; strlen32s (final revision) - SSE2 string length, register-preserving
; C equivalent: size_t __stdcall strlen32s(const char *src)   (ret 4)
; In:      [esp+4] = src, pointer to zero-terminated string
; Out:     eax = length in bytes
; Clobbers: xmm0, xmm1, flags only; ecx/edx are saved and restored
; NOTE(review): movups reads 16 bytes at src unconditionally - could
;   touch an unmapped page if the string ends near one; confirm.
;-----------------------------------------------------------------------
align 16 ; jj2007, 12 March 2009, 85 bytes; 0.176 cycles/byte at 16k on Celeron M (0.3 on P4)
strlen32s proc src:DWORD ; with lots of inspiration from Lingo, NightWare and Agner Fog
mov eax, [esp+4] ; get pointer to string
movups xmm1, [eax] ; move 16 bytes into xmm1, unaligned (adapted from Lingo/NightWare)
pxor xmm0, xmm0 ; zero for comparison (no longer needed for xmm1 - thanks, NightWare)
pcmpeqb xmm1, xmm0 ; set bytes in xmm1 to FF if nullbytes found in xmm1
pmovmskb eax, xmm1 ; set byte mask in eax
bsf eax, eax ; bit scan forward; ZF=0 iff a bit was set
jne Lt16 ; less than 16 bytes: eax = index = length, return directly

@@: push ecx ; all registers preserved, except eax = return value
push edx ; after these pushes the argument sits at [esp+12]
mov ecx, [esp+12] ; get pointer to string
and ecx, -16 ; align initial pointer to 16-byte boundary
lea eax, [ecx+16] ; aligned pointer + 16 (first 0..15 dealt with by movups above)

@@: pcmpeqb xmm0, [eax] ; ---- inner loop inspired by Lingo, with adaptions -----
pcmpeqb xmm1, [eax+16] ; compare packed bytes in [m128] and xmm1 for equality
por xmm1, xmm0 ; or them: one of the mem locations may contain a nullbyte
lea eax, [eax+32] ; len counter (moving up lea or add costs 3 cycles for the 191 byte string)
pmovmskb edx, xmm1 ; set byte mask in edx
test edx, edx
jz @B

pmovmskb ecx, xmm0 ; set byte mask in ecx (has to be repeated, sorry)
shl edx, 16 ; create space for the ecx bytes
or edx, ecx ; combine xmm0 and xmm1 results
bsf edx, edx ; bit scan for the index
lea eax, [eax+edx-32] ; add scan index, subtract initial bytes
pop edx ; restore edx; argument now at [esp+8]
sub eax, [esp+8] ; length = terminator address - start address
pop ecx ; restore ecx
Lt16: ret 4
strlen32s endp


Timings:


              Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)
ERROR in strlen64A at ebx=11: 16 bytes instead of 11

codesizes: strlen32s=77, strlen64A=120, strlen64B=87, _strlen=66

-- test 16k, misaligned 0, 16384 bytes
strlen32s     :       4634 cycles
strlen64LingoB :      4978 cycles
_strlen (Agner Fog):  10152 cycles

-- test 4k, misaligned 11, 4096 bytes
crt_strlen    :       3955 cycles
strlen32s     :       1130 cycles
strlen64LingoB :      1126 cycles
_strlen (Agner Fog):  2235 cycles

-- test 1k, misaligned 0, 1024 bytes
strlen32s     :       345 cycles
strlen64LingoB :      349 cycles
_strlen (Agner Fog):  636 cycles

-- test 0, misaligned 0, 95 bytes
crt_strlen    :       231 cycles
strlen32s     :       80 cycles
strlen64LingoB :      80 cycles
_strlen (Agner Fog):  100 cycles

-- test 1, misaligned 1, 95 bytes
crt_strlen    :       203 cycles
strlen32s     :       91 cycles
strlen64LingoB :      58 cycles
_strlen (Agner Fog):  64 cycles

-- test 3, misaligned 3, 15 bytes
crt_strlen    :       35 cycles
strlen32s     :       11 cycles
strlen64LingoB :      13 cycles
_strlen (Agner Fog):  23 cycles

-- test 15, misaligned 15, 15 bytes
crt_strlen    :       32 cycles
strlen32s     :       12 cycles
strlen64LingoB :      14 cycles
_strlen (Agner Fog):  23 cycles


EDIT: Attached new version with minor modifications.

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: herge on March 12, 2009, 10:32:41 AM
 Hi JJ2007:

The latest results from herge.


Intel(R) Core(TM)2 Duo CPU     E4600  @ 2.40GHz (SSE4)
ERROR in strlen64A at ebx=11: 16 bytes instead of 11

codesizes: strlen32s=77, strlen64A=120, strlen64B=87, _strlen=66

-- test 16k, misaligned 0, 16384 bytes
strlen32s     :       1457 cycles
strlen64LingoB :      1260 cycles
_strlen (Agner Fog):  2797 cycles

-- test 4k, misaligned 11, 4096 bytes
crt_strlen    :       2401 cycles
strlen32s     :       387 cycles
strlen64LingoB :      340 cycles
_strlen (Agner Fog):  731 cycles

-- test 1k, misaligned 0, 1024 bytes
strlen32s     :       97 cycles
strlen64LingoB :      95 cycles
_strlen (Agner Fog):  178 cycles

-- test 0, misaligned 0, 95 bytes
crt_strlen    :       60 cycles
strlen32s     :       20 cycles
strlen64LingoB :      14 cycles
_strlen (Agner Fog):  18 cycles

-- test 1, misaligned 1, 95 bytes
crt_strlen    :       63 cycles
strlen32s     :       32 cycles
strlen64LingoB :      25 cycles
_strlen (Agner Fog):  20 cycles

-- test 3, misaligned 3, 15 bytes
crt_strlen    :       15 cycles
strlen32s     :       4 cycles
strlen64LingoB :      3 cycles
_strlen (Agner Fog):  7 cycles

-- test 15, misaligned 15, 15 bytes
crt_strlen    :       15 cycles
strlen32s     :       4 cycles
strlen64LingoB :      3 cycles
_strlen (Agner Fog):  7 cycles

Press any key to exit...



Regards herge
Title: Re: szLen optimize...
Post by: Mark Jones on March 12, 2009, 03:39:16 PM
Here's my compulsatory submission for the latest evolution.


AMD Athlon(tm) 64 X2 Dual Core Processor 4000+ (SSE3)
ERROR in strlen64A at ebx=11: 16 bytes instead of 11

codesizes: strlen32s=85, strlen64A=120, strlen64B=87, _strlen=66

-- test 16k, misaligned 0, 16384 bytes
crt_strlen    :       12338 cycles
strlen32s     :       3135 cycles
strlen64LingoB :      3120 cycles
_strlen (Agner Fog):  13916 cycles

-- test 4k, misaligned 11, 4096 bytes
crt_strlen    :       3229 cycles
strlen32s     :       828 cycles
strlen64LingoB :      814 cycles
_strlen (Agner Fog):  3496 cycles

-- test 1k, misaligned 15, 1024 bytes
crt_strlen    :       826 cycles
strlen32s     :       252 cycles
strlen64LingoB :      237 cycles
_strlen (Agner Fog):  900 cycles

-- test 0, misaligned 0, 95 bytes
crt_strlen    :       93 cycles
strlen32s     :       57 cycles
strlen64LingoB :      40 cycles
_strlen (Agner Fog):  122 cycles

-- test 1, misaligned 1, 95 bytes
crt_strlen    :       102 cycles
strlen32s     :       59 cycles
strlen64LingoB :      43 cycles
_strlen (Agner Fog):  101 cycles

-- test 3, misaligned 3, 15 bytes
crt_strlen    :       20 cycles
strlen32s     :       20 cycles
strlen64LingoB :      20 cycles
_strlen (Agner Fog):  34 cycles

-- test 15, misaligned 15, 15 bytes
crt_strlen    :       20 cycles
strlen32s     :       20 cycles
strlen64LingoB :      20 cycles
_strlen (Agner Fog):  34 cycles


Ya know, tools such as these should also show the OS version and bit-width. It could be assumed erroneously that this box is running 64-bit XP when in fact it is running 32-bit XP. (Wasteful, perhaps, but I cannot afford to upgrade in the foreseeable future.)
Title: Re: szLen optimize...
Post by: jj2007 on March 12, 2009, 03:56:29 PM
Quote from: Mark Jones on March 12, 2009, 03:39:16 PM
Here's my compulsatory submission for the latest evolution.


AMD Athlon(tm) 64 X2 Dual Core Processor 4000+ (SSE3)

-- test 0, misaligned 0, 95 bytes
crt_strlen    :       93 cycles
strlen32s     :       57 cycles
strlen64LingoB :      40 cycles
_strlen (Agner Fog):  122 cycles

:bg Thanxalot. It seems Lingo has a little edge here. Interesting that Agner's algo gets beaten by crt_strlen, though.

Quote
Ya know, tools such as these should also show the OS version and bit-width. It could be assumed erroniously that this box is running 64-bit XP when in fact it is running 32-bit XP. (Wasteful, perhaps, but I cannot afford to upgrade in the foreseeable future.)

Good idea in principle, but showing the OS with GetVersionEx is so hilariously clumsy that I get an allergy when I even think of it :red
Title: Re: szLen optimize...
Post by: lingo on March 13, 2009, 06:01:51 AM
"It seems Lingo has a little edge here."
madjj is never tired to convert my code in lame code... :lol
but, 'Lingo has a BIG edge here' :lol....
Intel(R) Core(TM)2 Duo CPU     E8500  @ 3.16GHz (SSE4)
100000000 bytes allocated

codesizes: strlen32sLAME=85, strlen64A=120, strlen64B=84, _strlen=66

-- test 16k, misaligned 0, 16384 bytes
strlen32sLAME        1575 cycles
strlen64LingoB       1532 cycles
_strlen (Agner Fog)  2761 cycles

-- test 4k, misaligned 11, 4096 bytes
strlen32sLAME        427 cycles
strlen64LingoB       404 cycles
_strlen (Agner Fog)  708 cycles

-- test 1k, misaligned 15, 1024 bytes
strlen32sLAME        100 cycles
strlen64LingoB       78 cycles
_strlen (Agner Fog)  193 cycles

-- test 0, misaligned 0, 95 bytes
  Masm32 lib szLen   99 cycles
  crt strlen         75 cycles
strlen32sLAME        19 cycles
strlen64LingoB       10 cycles
_strlen (Agner Fog)  19 cycles

-- test 1, misaligned 1, 95 bytes
  Masm32 lib szLen   99 cycles
  crt strlen         79 cycles
strlen32sLAME        19 cycles
strlen64LingoB       10 cycles
_strlen (Agner Fog)  20 cycles

-- test 3, misaligned 3, 15 bytes
  Masm32 lib szLen   19 cycles
  crt strlen         17 cycles
strlen32sLAME        3 cycles
strlen64LingoB       1 cycles
_strlen (Agner Fog)  7 cycles

-- test 15, misaligned 15, 15 bytes
  Masm32 lib szLen   19 cycles
  crt strlen         19 cycles
strlen32sLAME        20 cycles
strlen64LingoB       17 cycles
_strlen (Agner Fog)  7 cycles

Press any key to exit...




[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: sinsi on March 13, 2009, 06:34:06 AM

Intel(R) Core(TM)2 Quad CPU    Q6600  @ 2.40GHz (SSE4)
100000000 bytes allocated

codesizes: strlen32sLAME=85, strlen64A=120, strlen64B=84, _strlen=66

-- test 16k, misaligned 0, 16384 bytes
strlen32sLAME        1454 cycles
strlen64LingoB       1183 cycles
_strlen (Agner Fog)  2759 cycles

-- test 4k, misaligned 11, 4096 bytes
strlen32sLAME        393 cycles
strlen64LingoB       330 cycles
_strlen (Agner Fog)  707 cycles

-- test 1k, misaligned 15, 1024 bytes
strlen32sLAME        101 cycles
strlen64LingoB       78 cycles
_strlen (Agner Fog)  193 cycles

-- test 0, misaligned 0, 95 bytes
  Masm32 lib szLen   99 cycles
  crt strlen         75 cycles
strlen32sLAME        19 cycles
strlen64LingoB       13 cycles
_strlen (Agner Fog)  19 cycles

-- test 1, misaligned 1, 95 bytes
  Masm32 lib szLen   99 cycles
  crt strlen         80 cycles
strlen32sLAME        19 cycles
strlen64LingoB       11 cycles
_strlen (Agner Fog)  21 cycles

-- test 3, misaligned 3, 15 bytes
  Masm32 lib szLen   19 cycles
  crt strlen         19 cycles
strlen32sLAME        3 cycles
strlen64LingoB       1 cycles
_strlen (Agner Fog)  7 cycles

-- test 15, misaligned 15, 15 bytes
  Masm32 lib szLen   19 cycles
  crt strlen         19 cycles
strlen32sLAME        21 cycles
strlen64LingoB       18 cycles
_strlen (Agner Fog)  7 cycles

Press any key to exit...

This is getting ridiculous. "1 cycles" - now we need logic to print "1 cycle"  :P

Since we are on the cutting-edge here, can we have some native 64-bit code for 64-bit windows please? I guess we'll need 'timers64.inc' or something...

horse>die>flog  :bg

edit: what horrible code in the .asm - most doesn't even get used, what a nightmare to follow...
Quote; this is a comment with trailing blanks
is that zen? :bdg
Title: Re: szLen optimize...
Post by: lingo on March 13, 2009, 07:22:48 AM
"can we have some native 64-bit code for 64-bit windows please?"
Thanks, it works for me fine...: :wink
Usage:
lea rax, szBuffer
call strlen64


;-----------------------------------------------------------------------
; strlen64 - SSE2 string length, 64-bit, NONSTANDARD calling convention
; Usage (as posted):   lea rax, szBuffer  /  call strlen64
; In:      rax = pointer to zero-terminated string
; Out:     rax = length in bytes
; Clobbers: rcx, rdx, xmm0, xmm1, xmm2, flags (does NOT follow the
;   Win64 or SysV ABI - rax carries both argument and result)
; Return is done by popping the return address into rcx at entry and
; ending with "jmp rcx" instead of ret.
; NOTE(review): "mov rcx, [rsp-8]" near the end re-reads the already-
;   popped return-address slot BELOW rsp; Win64 has no red zone, so this
;   presumably relies on nothing (APC/exception dispatch) clobbering
;   that memory in between - confirm this is acceptable for your target.
; NOTE(review): the db line looks like multi-byte padding so the entry
;   or loop lands on a 16-byte boundary - confirm the intended decode.
;-----------------------------------------------------------------------
align 16
db 8Dh,0A4h,24h,0,0,0,0,8Dh,48h,0,10h
strlen64:
pop rcx ; rcx = return address (stack now balanced for jmp rcx)
movdqu         xmm2, [rax] ; first 16 bytes, unaligned
pxor xmm0, xmm0 ; zero for comparison
pcmpeqb         xmm2, xmm0 ; FF per byte where a nullbyte was found
pxor xmm1, xmm1 ; zero for comparison in the loop below
pmovmskb edx, xmm2 ; one mask bit per byte
test edx, edx
jz @f ; no terminator in the first 16 bytes: enter the loop
bsf eax, edx ; short string: eax = bit index = length
jmp rcx ; return (rcx = popped return address)
@@:
lea rcx, [rax+16] ; rcx = original pointer + 16 (base for the length calc)
and rax, -16 ; align scan pointer down to 16 bytes
@@:
pcmpeqb         xmm0, [rax+16] ; scan two 16-byte chunks per iteration
pcmpeqb         xmm1, [rax+32]
por xmm1, xmm0 ; nonzero if either chunk held the terminator
add rax, 32
pmovmskb edx, xmm1
test edx, edx
jz @b
shl edx, 16 ; combined mask into bits 16..31
sub rax, rcx ; bytes scanned relative to start+16
pmovmskb ecx, xmm0 ; mask for the lower chunk
or edx, ecx ; 32-bit mask covering both chunks
mov rcx, [rsp-8] ; reload return address from below rsp (see header note)
bsf edx, edx ; index of the first terminator bit
add rax, rdx ; final length
jmp rcx ; return

Title: Re: szLen optimize...
Post by: jj2007 on March 13, 2009, 08:12:35 AM
Quote from: sinsi on March 13, 2009, 06:34:06 AM

edit: what horrible code in the .asm - most doesn't even get used, what a nightmare to follow...


You are perfectly right. I took over some "organically grown" code, and certainly have not added to its readability. But the purpose was not beauty but rather to find those 88 bytes or so that would be considered good enough to replace len() for the next ten years. I know it is a bad habit to reopen old threads, but this one was unfinished business - now we have two algos that do the job. Lingo's is an edge faster, but I hate paying royalties to people who call me madjj :toothy

Quote; this is a comment with trailing blanks
is that zen? :bdg

Well, kind of. It is a leftover of my attempt to teach RichMasm to autoformat code that comes along with spaces and varying tab sizes. Example:
strlen64:
pop rcx
movdqu         xmm2, [rax]
pxor xmm0, xmm0
pcmpeqb         xmm2, xmm0
pxor xmm1, xmm1
pmovmskb edx, xmm2
test edx, edx
jz @f
bsf eax, edx
jmp rcx

This is actually nicely formatted, in comparison to many other snippets I have seen, but it forces the eye to jump a lot from left to right. Here is the autoformatted version:
strlen64:
pop rcx
movdqu xmm2, [rax]
pxor xmm0, xmm0
pcmpeqb xmm2, xmm0
pxor xmm1, xmm1
pmovmskb edx, xmm2
test edx, edx
jz @f
bsf eax, edx
jmp rcx


The first one looks more beautiful, the second one is less tiresome for the eyes. A matter of taste, I guess.
Title: Re: szLen optimize...
Post by: sinsi on March 13, 2009, 08:51:02 AM
jj, you are a mad bastard mate  :bg

Switch to using ml64, no worries about cpu types - don't they all support sse3 at least?
Opening old threads? I have no problem with that if it's relevant (I do it with threads I've started - saves remembering) and saves getting 6 million search results...

lingo, the trouble is building a native pe64 and getting some timings from it. If MichaelW doesn't mind, maybe someone can change the timers.asm...when I'm sober I might try (oops then it will never happen).
Title: Re: szLen optimize...
Post by: jj2007 on March 13, 2009, 09:21:43 AM
Quote from: lingo on March 13, 2009, 06:01:51 AM
"It seems Lingo has a little edge here."
madjj is never tired to convert my code in lame code... :lol

The only "conversion" I have ever made to your code is to pass the source over the stack, to make the benchmarks comparable to the others.

Quote
but, 'Lingo has a BIG edge here' :lol....

Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)

-- test 16k, misaligned 0, 16384 bytes
strlen32sLAME        4797 cycles   <**********
strlen64LingoB       5045 cycles   <**********
_strlen (Agner Fog)  9599 cycles
...
-- test 3, misaligned 3, 15 bytes
  Masm32 lib szLen   47 cycles
  crt strlen         35 cycles
strlen32sLAME        11 cycles   <**********
strlen64LingoB       15 cycles   <**********
_strlen (Agner Fog)  23 cycles


Ever heard about hardware differences?
:lol
Title: Re: szLen optimize...
Post by: sinsi on March 13, 2009, 09:33:48 AM
I fix a lot of computers, and most of them are as the following:

AMD Athlon(tm) XP  2600+ (SSE1)
100000000 bytes allocated
ERROR in StrSizeA at ebx=4096: 4101 bytes instead of 4096
ERROR in strlen32c at ebx=4096: 4101 bytes instead of 4096
ERROR in strlen32sLAME at ebx=4096: 4101 bytes instead of 4096
ERROR in strlen64B at ebx=4096: 4101 bytes instead of 4096

codesizes: strlen32sLAME=85, strlen64A=120, strlen64B=84, _strlen=66

-- test 16k, misaligned 0, 16384 bytes
strlen32sLAME        22 cycles
strlen64LingoB       2758 cycles
_strlen (Agner Fog)  22683 cycles

-- test 4k, misaligned 11, 4096 bytes
strlen32sLAME        26 cycles
strlen64LingoB       729 cycles
_strlen (Agner Fog)  5713 cycles

-- test 1k, misaligned 15, 1024 bytes
strlen32sLAME        240 cycles
strlen64LingoB       220 cycles
_strlen (Agner Fog)  1476 cycles

-- test 0, misaligned 0, 95 bytes
  Masm32 lib szLen   158 cycles
  crt strlen         107 cycles
strlen32sLAME        68 cycles
strlen64LingoB       40 cycles
_strlen (Agner Fog)  192 cycles

-- test 1, misaligned 1, 95 bytes
  Masm32 lib szLen   158 cycles
  crt strlen         114 cycles
strlen32sLAME        71 cycles
strlen64LingoB       39 cycles
_strlen (Agner Fog)  161 cycles

-- test 3, misaligned 3, 15 bytes
  Masm32 lib szLen   26 cycles
  crt strlen         25 cycles
strlen32sLAME        60 cycles
strlen64LingoB       32 cycles
_strlen (Agner Fog)  50 cycles

-- test 15, misaligned 15, 15 bytes
  Masm32 lib szLen   26 cycles
  crt strlen         25 cycles
strlen32sLAME        26 cycles
strlen64LingoB       33 cycles
_strlen (Agner Fog)  73 cycles

Press any key to exit...


I realise that we're looking at sse2+ for ourselves, but most of them I fix are like this (p4's even without hypershite, athlons/durons).
Curious, why do I not get c0000097 (invalid opcode) running on the athlon?
Title: Re: szLen optimize...
Post by: jj2007 on March 13, 2009, 09:43:23 AM
Quote from: sinsi on March 13, 2009, 09:33:48 AM
I fix a lot of computers, and most of them are as the following:

AMD Athlon(tm) XP  2600+ (SSE1)


I realise that we're looking at sse2+ for ourselves, but most of them I fix are like this (p4's even without hypershite, athlons/durons).
Curious, why do I not get c0000097 (invalid opcode) running on the athlon?
Good question indeed. A library version should include a check for the SSE version.

For the madmen, here a new version. It includes now the pretty competitive algo posted by Nightware, 10.03.2009 (http://www.masm32.com/board/index.php?topic=1807.msg81075#msg81075), and Lingo's latest strlen64B with that cute pop ecx, jmp ecx trick. What a pity that it is a bit lame on an ordinary P4 :bg

EDIT: See below for current version.

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: herge on March 13, 2009, 10:09:03 AM
 
Hi jj2007:

The latest results from herge.


Intel(R) Core(TM)2 Duo CPU     E4600  @ 2.40GHz (SSE4)
codesizes: strlen32s=85, strlen64A=120, strlen64B=84, _strlen=66

-- test 16k, misaligned 0, 16384 bytes
strlen32s            1516 cycles
strlen64LingoB       1221 cycles
NWStrLen             1193 cycles
_strlen (Agner Fog)  2804 cycles

-- test 4k, misaligned 11, 4096 bytes
strlen32s            424 cycles
strlen64LingoB       322 cycles
NWStrLen             333 cycles
_strlen (Agner Fog)  722 cycles

-- test 1k, misaligned 15, 1024 bytes
strlen32s            122 cycles
strlen64LingoB       79 cycles
NWStrLen             102 cycles
_strlen (Agner Fog)  196 cycles

-- test 0, misaligned 0, 95 bytes
  Masm32 lib szLen   101 cycles
  crt strlen         62 cycles
strlen32s            43 cycles
strlen64LingoB       11 cycles
NWStrLen             15 cycles
_strlen (Agner Fog)  19 cycles

-- test 1, misaligned 1, 95 bytes
  Masm32 lib szLen   99 cycles
  crt strlen         100 cycles
strlen32s            19 cycles
strlen64LingoB       11 cycles
NWStrLen             21 cycles
_strlen (Agner Fog)  20 cycles

-- test 7, misaligned 7, 15 bytes
  Masm32 lib szLen   19 cycles
  crt strlen         15 cycles
strlen32s            21 cycles
strlen64LingoB       18 cycles
NWStrLen             25 cycles
_strlen (Agner Fog)  6 cycles

-- test 15, misaligned 15, 15 bytes
  Masm32 lib szLen   20 cycles
  crt strlen         15 cycles
strlen32s            2 cycles
strlen64LingoB       2 cycles
NWStrLen             9 cycles
_strlen (Agner Fog)  7 cycles

Press any key to exit...


Regards herge

Title: Re: szLen optimize...
Post by: sinsi on March 13, 2009, 10:10:47 AM
Sorry jj but this .asm is getting way too complex and hard to follow. How about using .inc's for each algo, then we can get rid of the old, slow ones.
Cmon man, there are 16(?) procs in there and we are testing 4 of them...a 64k source file is too much. My brain gets lost in the labyrinth  :bdg
Title: Re: szLen optimize...
Post by: jj2007 on March 13, 2009, 01:49:14 PM
Quote from: sinsi on March 13, 2009, 10:10:47 AM
Sorry jj but this .asm is getting way too complex and hard to follow. How about using .inc's for each algo, then we can get rid of the old, slow ones.
Cmon man, there are 16(?) procs in there and we are testing 4 of them...a 64k source file is too much. My brain gets lost in the labyrinth  :bdg

OK, I put an effort into simplifying it. 27k instead of 64...
For the legal department: My latest version contains ideas ruthlessly stolen from Lingo:
strlen32s proc src:DWORD ; with lots of inspiration from Lingo, NightWare and Agner Fog
pop eax ; trash the return address to pop...
pop eax ; ...the src pointer
...
jmp dword ptr [esp-8] ; Lingo style equivalent to ret 4 ;-)
strlen32s endp


Although I don't quite understand why he wastes opcodes:

mov ecx, [esp-8]  < not needed
bsf edx, edx
add eax, edx
jmp [esp-8]         ; ecx < not needed
strlen64B endp


EDIT: Replaced attachment, with very minor modifications (2 bytes less for strlen32s :bg).
EDIT(2): New version - 80 bytes, now as fast as Lingo's algo even on modern hardware... :green

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: lingo on March 13, 2009, 02:07:30 PM
Quote"It seems Lingo has a little edge here."
madjj is never tired to convert my code in lame code...

The only "conversion" I have ever made to your code is to pass the source over the stack, to make the benchmarks comparable to the others.

Really?
Do you want to read together?..."to make it comparable to the others"  :lol

Lingo's code:

;-----------------------------------------------------------------------
; strlen64xB - 32-bit SSE2 string length (Lingo, old 85-byte version).
; In:    [esp+4] = pointer to zero-terminated string (stdcall, ret 4)
; Out:   eax = length in bytes
; Clobb: ecx, edx, xmm0-xmm2, flags
; Note:  may read up to 31 bytes past the terminator (over-read).
;-----------------------------------------------------------------------
align 16
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ; align 15
strlen64xB proc szBuffer : dword ; old 85 bytes version, before [url=http://www.masm32.com/board/index.php?topic=1807.msg81266#msg81266]this post[/url]; commented by jj
;db 0cch
mov eax, [esp+4] ; pointer to src
movdqu xmm2, [eax] ; move 16 bytes into xmm2
pxor xmm0, xmm0 ; zero for comparison
pcmpeqb xmm2, xmm0 ; set bytes in xmm2 to FF if nullbytes found in xmm2
pxor xmm1, xmm1 ; zero for comparison
pmovmskb edx, xmm2 ; set byte mask in edx
lea ecx, [eax+16] ; ecx=eax+16
test edx, edx
jnz Ex_0 ; nullbyte in the first 16 bytes -> fast exit
and eax, -16 ; align eax to para
@@: ; main loop: scan 32 bytes per iteration, aligned loads
add eax, 32
pcmpeqb xmm0, [eax-16] ; compare first 16-byte chunk against zero
pcmpeqb xmm1, [eax] ; compare second 16-byte chunk against zero
por xmm1, xmm0 ; nonzero if either chunk held the terminator
pmovmskb edx, xmm1 ; combined byte mask
test edx, edx
jz @B

shl edx, 16 ; second chunk's mask -> high 16 bits
sub eax, ecx ; subtract biased start (src+16) -> bytes before final pair
pmovmskb ecx, xmm0 ; first chunk's mask -> low 16 bits
;sub eax, [esp+4]
or edx, ecx ; full 32-bit nullbyte mask
;sub eax, 16
bsf edx, edx ; bit index of first nullbyte within the pair
add eax, edx ; total length
ret 4
align 16
Ex_0: ; terminator was in the first 16 bytes
bsf eax, edx ; index of first nullbyte = length (< 16)
ret 4
strlen64xB endp



and LAME code:



;-----------------------------------------------------------------------
; strlen32s - 32-bit SSE2 string length (jj2007, 85-byte version).
; In:    [esp+4] = pointer to zero-terminated string (stdcall, ret 4)
; Out:   eax = length in bytes; ecx and edx preserved
; Note:  may read up to 31 bytes past the terminator (over-read).
;-----------------------------------------------------------------------
align 16 ; jj2007, 12 March 2009, 85 bytes; 0.176 cycles/byte at 16k on Celeron M (0.3 on P4)
strlen32s proc src:DWORD ; with lots of inspiration from Lingo, NightWare and Agner Fog
mov eax, [esp+4] ; get pointer to string
movups xmm1, [eax] ; move 16 bytes into xmm1, unaligned (adapted from Lingo/NightWare)
pxor xmm0, xmm0 ; zero for comparison (no longer needed for xmm1 - thanks, NightWare)
pcmpeqb xmm1, xmm0 ; set bytes in xmm1 to FF if nullbytes found in xmm1
pmovmskb eax, xmm1 ; set byte mask in eax
bsf eax, eax ; bit scan forward (sets ZF=1 if mask was zero, i.e. no nullbyte yet)
jne Lt16 ; less than 16 bytes, we can return the index in eax

@@:
push ecx ; all registers preserved, except eax = return value
push edx ; eax will be pointer to initial string, 16-byte aligned

mov ecx, [esp+12] ; get pointer to string (arg moved by the two pushes above)
and ecx, -16 ; align initial pointer to 16-byte boundary
lea eax, [ecx+16] ; aligned pointer + 16 (first 0..15 dealt with by movups above)

@@: pcmpeqb xmm0, [eax] ; ---- inner loop inspired by Lingo, with adaptions -----
pcmpeqb xmm1, [eax+16] ; compare packed bytes in [m128] and xmm1 for equality
por xmm1, xmm0 ; or them: one of the mem locations may contain a nullbyte
lea eax, [eax+32] ; len counter (moving up lea or add costs 3 cycles for the 191 byte string)
pmovmskb edx, xmm1 ; set byte mask in edx
test edx, edx
jz @B

pmovmskb ecx, xmm0 ; set byte mask in ecx (has to be repeated, sorry)
shl edx, 16 ; create space for the ecx bytes
or edx, ecx ; combine xmm0 and xmm1 results
bsf edx, edx ; bit scan for the index
lea eax, [eax+edx-32] ; add scan index, subtract initial bytes
pop edx ; restore caller's edx

sub eax, [esp+8] ; subtract original src pointer -> length

pop ecx ; restore caller's ecx

Lt16: ret 4
strlen32s endp



" ; all registers preserved, except eax = return value"
Why except eax ?   You must preserve and eax too...to make the diagnosis pretty clear  :lol

"Although I don't quite understand why he wastes opcodes:"

I like your RichMasm editor and want to see more functionality in it..for instance: asm code highlighting,
so IMO will be better to spend your efforts to get there rather than to "understand why he wastes opcodes"... :lol


and later again...: :lol

Lingo's code:

;-----------------------------------------------------------------------
; strlen64B - 32-bit SSE2 string length (Lingo, 84-byte version).
; In:    [esp+4] = pointer to zero-terminated string
; Out:   eax = length in bytes
; Clobb: ecx, edx, xmm0-xmm2, flags
; Notes: non-standard return - pops return address and src, then exits
;        via "jmp ecx" (Lingo's equivalent of ret 4); may read up to 31
;        bytes past the terminator (over-read).
;-----------------------------------------------------------------------
align 16
db 8Dh,0A4h,24h,0,0,0,0,8Dh,48h,0,10h ; 11 filler bytes after align 16 - presumably alignment padding; TODO confirm intent
strlen64B proc szBuffer : dword
pop ecx                 ; ecx = return address
pop eax                 ; eax = pointer to src (argument removed from stack)
movdqu xmm2, [eax]      ; first 16 bytes, unaligned load
pxor xmm0, xmm0         ; xmm0 = 0 (zero pattern, reused in main loop)
pcmpeqb xmm2, xmm0      ; bytes -> FFh where a nullbyte was found
pxor xmm1, xmm1         ; xmm1 = 0 (second compare register for the loop)
pmovmskb edx, xmm2      ; edx = 16-bit mask of nullbyte positions
test edx, edx
jz @f                   ; no terminator in first 16 bytes -> main loop
bsf eax, edx            ; index of first nullbyte = length (< 16)
jmp ecx                 ; return (replaces ret; stack already cleaned)
@@:
lea ecx, [eax+16]       ; ecx = src + 16, bias for the final subtraction
and eax, -16            ; align pointer down to a 16-byte boundary
@@:                     ; main loop: 32 bytes per iteration, aligned loads
pcmpeqb xmm0, [eax+16]  ; compare first 16-byte chunk against zero
pcmpeqb xmm1, [eax+32]  ; compare second 16-byte chunk against zero
por xmm1, xmm0          ; nonzero if either chunk held the terminator
add eax, 32
pmovmskb edx, xmm1      ; combined mask
test edx, edx
jz @B
shl edx, 16             ; second chunk's mask -> high 16 bits
sub eax, ecx            ; eax = bytes scanned before the final 32-byte pair
pmovmskb ecx, xmm0      ; first chunk's mask -> low 16 bits
or edx, ecx             ; full 32-bit nullbyte mask
mov ecx, [esp-8]        ; reload return address from below esp - NOTE(review): relies on below-esp memory staying intact
bsf edx, edx            ; bit index of first nullbyte within the pair
add eax, edx            ; total length
jmp ecx                 ; return
strlen64B endp



and LAME code:



;-----------------------------------------------------------------------
; strlen32s - 32-bit SSE2 string length (jj2007, 82-byte version).
; In:    [esp+4] = pointer to zero-terminated string
; Out:   eax = length in bytes; ecx preserved, edx trashed
; Notes: pops return address and src, exits via "jmp dword ptr [esp-8]"
;        (Lingo-style ret 4); reads below esp - NOTE(review): below-esp
;        memory is volatile on Win32 if an interrupt/APC hits; may read
;        up to 31 bytes past the terminator (over-read).
;-----------------------------------------------------------------------
align 16 ; jj2007, 13 March 2009, 82 bytes; 0.176 cycles/byte at 16k on Celeron M (0.3 on P4)
strlen32s proc src:DWORD ; with lots of inspiration from Lingo, NightWare and Agner Fog
pop eax ; trash the return address to pop...
pop eax ; ...the src pointer
movups xmm1, [eax] ; move 16 bytes into xmm1, unaligned (adapted from Lingo/NightWare)
pxor xmm0, xmm0 ; zero for comparison (no longer needed for xmm1 - thanks, NightWare)
pcmpeqb xmm1, xmm0 ; set bytes in xmm1 to FF if nullbytes found in xmm1
pmovmskb eax, xmm1 ; set byte mask in eax
bsf eax, eax ; bit scan forward (ZF=1 if no nullbyte in the first 16 bytes)
jne Lt16 ; less than 16 bytes, we are done

mov edx, [esp-4] ; get pointer to string (src popped above still sits below esp)
and edx, -16 ; align initial pointer to 16-byte boundary
lea eax, [edx+16] ; aligned pointer + 16 (first 0..15 dealt with by movups above)

@@: pcmpeqb xmm0, [eax] ; ---- inner loop inspired by Lingo, with adaptions -----
pcmpeqb xmm1, [eax+16] ; compare packed bytes in [m128] and xmm1 for equality
por xmm1, xmm0 ; or them: one of the mem locations may contain a nullbyte
lea eax, [eax+32] ; len counter (moving up lea or add costs 3 cycles for the 191 byte string)
pmovmskb edx, xmm1 ; set byte mask in edx
test edx, edx
jz @B
        sub eax, [esp-4] ; subtract original src pointer
push ecx ; all registers preserved, except edx and eax = return value
shl edx, 16 ; create space for the ecx bytes
pmovmskb ecx, xmm0 ; set byte mask in ecx (has to be repeated, sorry)
or edx, ecx ; combine xmm0 and xmm1 results
bsf edx, edx ; bit scan for the index
pop ecx ; restore caller's ecx
lea eax, [eax+edx-32] ; add scan index
Lt16:
jmp dword ptr [esp-8] ; Lingo style equivalent to ret 4 ;-)
strlen32s endp




"We don't live in a perfect world so we cannot garrantee that the forum will always be "idiot free"
but we do our best to keep it friendly..."
by Hutch  :lol







Title: Re: szLen optimize...
Post by: jj2007 on March 13, 2009, 02:51:19 PM
Quote from: lingo on March 13, 2009, 02:07:30 PM
Quote
madjj is never tired to convert my code in lame code...

The only "conversion" I have ever made to your code is to pass the source over the stack, to make the benchmarks comparable to the others.

Really?
Do you want to read together?..."to make it comparable to the others"

Lingo, I had understood that you had accused me of modifying your algos to make them slower. Misunderstanding, sorry. I have never denied that I stole ideas from you. Where else should I get good ideas? :bg
(although some of your code looks suspiciously similar to what NightWare posted a long time ago ::))

Quote
" ; all registers preserved, except eax = return value"
Why except eax ?   You must preserve and eax too...to make the diagnosis pretty clear  :lol

Finally a glimpse of humour, voilà! NightWare preserves ecx and edx, too. But I dropped support for "safe edx". IMHO, ecx should be preserved because it is so often used as a counter.

Quote
"Although I don't quite understand why he wastes opcodes:"

I like your RichMasm editor and want to see more functionality in it..for instance: asm code highlighting,
so IMO will be better to spend your efforts to get there rather than to "understand why he wastes opcodes"... :lol


Automatic highlighting is a matter of taste. I prefer to see (mostly) plain text, so that I can highlight myself the few areas where I still have a problem to solve. And then, it seems as if there is a conflict between automatic highlighting and making full use of the RichEdit features, such as hyperlinks ("before this post (http://www.masm32.com/board/index.php?topic=1807.msg81266#msg81266); ", see below)

As to the wasted opcodes, check yourself (even with Notepad :wink); two bytes less, and one or two cycles faster.

(http://www.webalice.it/jj2006/pics/strlen64.png)
Title: Re: szLen optimize...
Post by: PBrennick on March 13, 2009, 03:02:11 PM
Okay, Lingo,

A little help here. Who is the better coder, Lingo or Lingo?

Paul
Title: Re: szLen optimize...
Post by: lingo on March 13, 2009, 04:03:00 PM
"NightWare preserves ecx and edx, too..."
but Hutch doesn't... :lol

"And then, it seems as if there is a conflict between automatic highlighting and making full use of the RichEdit features, such as hyperlinks .."
It seems...  you prefer to steal other's ideas and algos (for example: from lesson 35, Iczelion) rather than to  use your own automatic highlighting algo for .RTF  files to resolve the problems. :wink


"and one or two cycles faster."
Read A.Fog: Which one is faster - jump to register or jump to memory

For me:  mov ecx, [esp-8] ; this instruction is for free!!! :lol
       .......   
               jmp  ecx

is faster than
   jmp dword ptr [esp-8]

If you disagree just ask herge or sinsi to make tests for you (due to archaic type of your  CPUs).  :lol
Theirs CPUs are OK.
Title: Re: szLen optimize...
Post by: jj2007 on March 13, 2009, 05:17:17 PM
Quote from: lingo on March 13, 2009, 04:03:00 PM
It seems...  you prefer to steal other's ideas and algos (for example: from lesson 35, Iczelion) rather than to  use your own automatic highlighting algo for .RTF  files to resolve the problems. :wink

Tut 35 has 1265 lines, my RichMasm source has over 9500. No need to steal. Besides, I also wrote already that I don't like Xmas trees. I am beyond that age :bg

Quote
"and one or two cycles faster."
Read A.Fog: Which one is faster - jump to register or jump to memory

For me:  mov ecx, [esp-8] ; this instruction is for free!!! :lol
       .......   
               jmp  ecx

is faster than
   jmp dword ptr [esp-8]

If you disagree just ask herge or sinsi to make tests for you (due to archaic type of your  CPUs).  :lol
Theirs CPUs are OK.

I am a fan of Agner, but I am an even greater fan of MichaelW's timer.asm :U
47      cycles jmp directly
48      cycles mov+jmp

47      cycles jmp directly
48      cycles mov+jmp

47      cycles jmp directly
48      cycles mov+jmp


Divide by ten. And before you shoot from the hip again: I never said that jumping directly is much faster.

EDIT: Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (SSE3): 40 for both of them, i.e. 4.00 cycles
Title: Re: szLen optimize...
Post by: NightWare on March 14, 2009, 04:12:12 AM
hmm... someone here said once "reading asm related posts, is better than smoking marijuna" or something like that... seriously he was under the truth...

just few things, before it degenerate more :

concerning the algos, even if 1000+ instructions in x86, the simd instructions to compare byte are not numberous, so using it is quite logical. since we generally use the same programming schemes, it seams quite logical to obtain a similar conception/algo/result... nothing "strange".

plus, if we post an algo we give to others the possibility to improve it... we implicitly encourage the "copy/use" of the original algo as basement... (it's the purpose of the laboratory, no ?).

to finish, like said by someone else, the benefit of this sort of algo is quite limited... nothing serious to fight for...

Title: Re: szLen optimize...
Post by: jj2007 on March 14, 2009, 10:32:43 AM
Quote from: NightWare on March 14, 2009, 04:12:12 AM
concerning the algos, even if 1000+ instructions in x86, the simd instructions to compare byte are not numberous, so using it is quite logical. since we generally use the same programming schemes, it seams quite logical to obtain a similar conception/algo/result... nothing "strange".
Indeed, I was not hinting at any copyleft issues :wink - it was merely an observation that apparently we (you, Lingo, myself) have pushed the CPU to its limits; so our algos must look almost identical. Tonight, I managed to squeeze out a few cycles by moving a line up or down, and then had the bright idea to unroll the inner loop, but nope, not a single cycle less, this is the limit. What counts in the end is a factor 5 improvement on szLen and crt_strlen, and a factor 10 on lstrlenA. For my part, this thread can be closed peacefully.
Title: Re: szLen optimize...
Post by: herge on March 14, 2009, 10:43:17 AM
 Hi jj2007:

We can't close yet, we haven't got to 20 yet?
We at 18 we can do it!

Regards herge
Title: Re: szLen optimize...
Post by: lingo on March 14, 2009, 02:16:44 PM
"so using it is quite logical. since we generally use the same programming schemes, it seams quite logical to obtain a similar conception/algo/result... nothing "strange".
plus, if we post an algo we give to others the possibility to improve it... we implicitly encourage the "copy/use" of the original algo as basement... (it's the purpose of the laboratory, no ?)."


I implicitly encourage everyone to improve it too, but not to make it bad to worse.. :wink
In this case as a free human being I have a human right to tell my opinion too... :lol

What about criteria who is right or wrong?
The results!
But we have different results on different CPUs
as a jj respectfully stated "Ever heard about hardware differences?" :wink
Who makes code optimization for archaic CPUs?  IMO sick people... :lol
Who preserves ecx and edx registers in "this sort of algo"? IMO lame people...
I can continue with who and IMO... :wink


"the benefit of this sort of algo is quite limited ..."

A lot of people have similar opinion but fortunately some people from Intel
created new faster instructions exactly for "this sort of algo"...
The speed is never enough.

"nothing serious to fight for..."
As  an engineer I believe in numbers rather than in emotions and empty words as a serious,
unserious, etc...

Title: Re: szLen optimize...
Post by: jj2007 on March 14, 2009, 08:32:43 PM
For those who have followed this thread, here finally a "library package". All you really need is to extract the file slenSSE2.inc to \masm32\include\slenSSE2.inc

Here is the most basic usage example:

include \masm32\include\masm32rt.inc   ; standard Masm32 runtime (print, chr$, len, ...)
include \masm32\include\slenSSE2.inc   ; overrides len() with the fast SSE2 strlen

.code
ShortString db "My short string", 0    ; zero-terminated test string

start:
print offset ShortString, " has "                  ; print the string itself
print str$(len(offset ShortString)), " bytes"      ; len() now uses the SSE2 algo

print chr$(13, 10, 10, "-- hit any key --")        ; CRLF + blank line + prompt

getkey                                 ; wait for a keypress
exit                                   ; ExitProcess

end start


If you use the len macro in your code, then the only difference to ordinary Masm32 code is line 2, i.e. you can make entire projects a bit faster just by adding this line.
By default, my own strlen32s algo will be used for len. Lingo's and Nightware's algos can be forced by adding...
SlenUseAlgos = 2 ; Lingo
SlenUseAlgos = 4 ; NightWare
...before the include (see strlenSSE2.asm for more detail, and benchmarks comparing all three).
These two are equally fast; however, only the default algo (strlen32s) has a check if the CPU allows SSE2 code. If that check fails, len will revert to crt_strlen - slow but still a factor 2 faster than the standard Masm32lib szLen.

Cheers, jj


EDIT: I removed the attachment in favour of the new version posted on page 19 (http://www.masm32.com/board/index.php?topic=1807.msg81400#msg81400). See remarks on preserving edx.
Title: Re: szLen optimize...
Post by: mitchi on March 14, 2009, 10:28:31 PM
Nice work, bit artisans  :bg
Title: Re: szLen optimize...
Post by: herge on March 14, 2009, 11:20:47 PM
 Hi jj2207:

Eh "Houston we Have Liftoff!".

Great work jj2007.

I almost used the wrong assembler, you have to
use the assembler that comes with VC2005 Express.

Regards herge
Title: Re: szLen optimize...
Post by: jj2007 on March 15, 2009, 08:34:38 AM
Quote from: herge on March 14, 2009, 11:20:47 PM
Hi jj2207:

Eh "Houston we Have Liftoff!".

Great work jj2007.

I almost used the wrong assembler, you have to
use the assembler that comes with VC2005 Express.

Regards herge

Thanxalot, herge. The credits go also to NightWare and Lingo, of course, whose algos can be activated easily as shown above.
@NightWare & Lingo: If you consider adding the CheckSSE2 to your algos, please let me know. The check costs only about one cycle (see below, bottom of tests: 5 instead of 4 cycles for the 15 byte string), and makes sure that code works fine on whatever archaic CPU the user runs :green

Re VC2005 Express: SSE2 code should also work with Masm 6.15, and it definitely works fine with JWasm (http://www.japheth.de/JWasm.html).

Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (SSE3)
codesizes: strlen32s=124, strlen64B=84, NWStrLen=118, _strlen=66 bytes

-- test 16k, misaligned 0, 16434 bytes
strlen32s            2918 cycles
strlen64LingoB       2921 cycles
NWStrLen             2935 cycles
_strlen (Agner Fog)  4264 cycles

-- test 4k, misaligned 11, 4096 bytes
strlen32s            753 cycles
strlen64LingoB       740 cycles
NWStrLen             757 cycles
_strlen (Agner Fog)  1096 cycles

-- test 1k, misaligned 15, 1024 bytes
  Masm32 lib szLen   1308 cycles
  crt strlen         971 cycles
strlen32s            198 cycles
strlen64LingoB       192 cycles
NWStrLen             208 cycles
_strlen (Agner Fog)  272 cycles

-- test 0, misaligned 0, 100 bytes
  Masm32 lib szLen   132 cycles
  crt strlen         110 cycles
strlen32s            27 cycles
strlen64LingoB       25 cycles
NWStrLen             32 cycles
_strlen (Agner Fog)  34 cycles

-- test 1, misaligned 1, 100 bytes
  Masm32 lib szLen   132 cycles
  crt strlen         132 cycles
strlen32s            28 cycles
strlen64LingoB       25 cycles
NWStrLen             32 cycles
_strlen (Agner Fog)  34 cycles

-- test 5, misaligned 5, 15 bytes
  Masm32 lib szLen   24 cycles
  crt strlen         28 cycles
strlen32s            5 cycles
strlen64LingoB       4 cycles
NWStrLen             15 cycles
_strlen (Agner Fog)  14 cycles

-- test 15, misaligned 15, 15 bytes
  Masm32 lib szLen   25 cycles
  crt strlen         25 cycles
strlen32s            5 cycles
strlen64LingoB       4 cycles
NWStrLen             15 cycles
_strlen (Agner Fog)  14 cycles
Title: Re: szLen optimize...
Post by: ToutEnMasm on March 15, 2009, 09:19:00 AM
Hello,
That's a very good work,in masm syntax.
just a few words about compiled it.
It need a masm32rt_586.inc ,the one in masm32 is .486.
Write "   include slenSSE2.inc  ;include it in your masm32\include directory " in the lensse2.asm,avoid to search it.
Compile it in a console application with at least ml 7.0
That's all.
Title: Re: szLen optimize...
Post by: herge on March 15, 2009, 11:06:10 AM

Hi jj2007:

I seem to have problems debugging it in windbg.
The EXE works great from dos. But I don't think
windbg likes CPUID for some reason.


strslensse2!start+0x1ab [C:\Program Files\Microsoft Visual Studio 8\VC\bin\strslensse2.asm @ 235]:
00401330 33c0            xor     eax,eax
00401332 0fa2            cpuid
00401334 0f31            rdtsc
00401336 52              push    edx
00401337 50              push    eax
00401338 c705dcb7400020a10700 mov dword ptr [strslensse2!__counter__loop__counter__ (0040b7dc)],7A120h
00401342 33c0            xor     eax,eax
00401344 0fa2            cpuid


It's not your code it's windbg acting up?
It's screwing up on either a T or P ?
C:\Documents and Settings\User\My Documents\My Pictures\401332.zip

See attachment JPG EXE ASM

Regards herge



[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: jj2007 on March 15, 2009, 11:14:07 AM
Quote from: herge on March 15, 2009, 11:06:10 AM

Hi jj2007:

I seem to have problems debugging it in windbg.
The EXE works great from dos. But I don't think
windbg likes CPUID for some reason.


I remember having the same problem with OllyDbg, but right now I can't reproduce it. Any Olly experts around who could explain what's going on?
Title: Re: szLen optimize...
Post by: herge on March 15, 2009, 11:17:02 AM
 Hi jj2007:

We got the wrong EXE there oops!

Attachment EXE ASM

Regards herge

Title: Re: szLen optimize...
Post by: herge on March 15, 2009, 11:23:48 AM
hi jj2007:

Will try that agan

Let me know if you got it.

I am not having much luck with

Winrar today ir's a pain in the butt.

Regards herge

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: herge on March 15, 2009, 02:34:14 PM
 Hi jj2007:

I am making some progress but I could be going backwards?

Application popup: windbg.exe - Application Error : The instruction at "0x65e36abb" referenced memory at "0x00106130".
The memory could not be "read".

Click on OK to terminate the program

Application popup: windbg.exe - Application Error : The instruction at "0x65e36abb" referenced memory at "0x000fe190".
The memory could not be "read".

And when you put a breakpoint on 65e36abb you get a 299 error.

which gets you this.


Details
Product: Windows Operating System
ID: 26
Source: Application Popup
Version: 5.2
Symbolic Name: STATUS_LOG_HARD_ERROR
Message: Application popup: %1 : %2
   
Explanation
The program could not load a driver because the program user doesn't have sufficient privileges to access
the driver or because the drive is missing or corrupt.

   
User Action
To correct this problem:

Ensure that the program user has sufficient privileges to access the directory in which the driver is installed.
Reinstall the program to restore the driver to the correct location.
If these solutions do not work, contact Product Support Services.

//
// MessageId: STATUS_SHARED_POLICY
//
// MessageText:
//
// The policy object is shared and can only be modified at the root
//
#define STATUS_SHARED_POLICY             ((NTSTATUS)0xC0000299L)

Unable to insert breakpoint 10000 at 65e36abb, Win32 error 0n299
    "Only part of a ReadProcessMemory or WriteProcessMemory request was completed."
The breakpoint was set with BP.  If you want breakpoints
to track module load/unload state you must use BU.
go bp10000 at 65e36abb failed


I will keep you posted if we can get help from Microsoft,
but I won't hold my breath.

Regards herge
Title: Re: szLen optimize...
Post by: BlackVortex on March 15, 2009, 07:20:26 PM
Runs fine under Olly for me.
Title: Re: szLen optimize...
Post by: jj2007 on March 15, 2009, 10:33:08 PM
Good and bad news:

First, the bad news: The "fast len() with SSE2" package attached below will not work with the ml.exe version 6.14 that gets installed when you download the Masm32 package. The reason is simply that the old Masm 6.14 (Copyright (C) Microsoft Corp 1981-1997) does not yet understand SSE2.

Now the good news:

1. It will work perfectly with JWasm (http://www.japheth.de/JWasm.html) (freeware), and with any later Masm version that comes along with the various VC express etc. downloads (see masm 6.14 or 6.15? (http://www.masm32.com/board/index.php?topic=10863.msg79596#msg79596) - I have tested it only on ml.exe versions 6.15 and 9.0).

2. The default algo is now fully compatible with the Masm32lib len() macro. This means in practice that you can speed up existing projects that use len() simply by adding the include line:

include \masm32\include\masm32rt.inc
include \masm32\include\slenSSE2.inc

I should explain why I put now in red. There was an exchange of views between Lingo and myself on the value of preserving edx and ecx (Lingo: Who preserves ecx and edx registers in "this sort of algo"? (http://www.masm32.com/board/index.php?topic=1807.msg81342#msg81342)). In the end, I kept saving ecx (a valuable counter register) and trashed edx. And, bang, my RichMasm project misbehaved. Intense bug chasing revealed that I had previously and unwillingly relied on a non-documented feature of the Masm32lib szLen routine - the one that is behind the len() macro. It does preserve ecx and edx. Therefore, the new version attached below does the same, in order not to break existing code: ecx and edx are preserved. The same applies to NightWare's version (SlenUseAlgos = 4) but not for Lingo's version (SlenUseAlgos = 2).

Enjoy,
jj2007

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: hutch-- on March 16, 2009, 01:36:56 PM
JJ,

The szLen algo is correct in its register usage. It only uses EAX and the stack pointer. If you have had problems using it with RichMASM it is because your register usage is non standard.


; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

    .486
    .model flat, stdcall  ; 32 bit memory model
    option casemap :none  ; case sensitive

    .code

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

align 4

;-----------------------------------------------------------------------
; szLen - return the length of a zero-terminated ASCII string.
; C equivalent: size_t szLen(const char *src)
; ABI:   32-bit stdcall; built with OPTION PROLOGUE:NONE, so the
;        argument is read raw from [esp+4]
; In:    src (stack) = address of the string
; Out:   EAX = number of bytes before the terminating zero
; Uses:  EAX only - ECX, EDX and all other registers are untouched
; Note:  advances four bytes per loop iteration but tests each byte
;        individually and exits at the first zero, so it never reads
;        past the terminator.
;-----------------------------------------------------------------------
szLen proc src:DWORD

    mov eax, [esp+4]            ; EAX = pointer to start of string
    sub eax, 4                  ; pre-bias so the first ADD below restores it

  @@:
    add eax, 4                  ; step to the next 4-byte group
    cmp BYTE PTR [eax], 0       ; terminator at offset 0 of this group?
    je lb1
    cmp BYTE PTR [eax+1], 0     ; ... at offset 1?
    je lb2
    cmp BYTE PTR [eax+2], 0     ; ... at offset 2?
    je lb3
    cmp BYTE PTR [eax+3], 0     ; ... at offset 3? if not, keep scanning
    jne @B

    sub eax, [esp+4]            ; zero at offset 3: len = (eax - src) + 3
    add eax, 3
    ret 4
  lb3:
    sub eax, [esp+4]            ; zero at offset 2: len = (eax - src) + 2
    add eax, 2
    ret 4
  lb2:
    sub eax, [esp+4]            ; zero at offset 1: len = (eax - src) + 1
    add eax, 1
    ret 4
  lb1:
    sub eax, [esp+4]            ; zero at offset 0: len = eax - src
    ret 4

szLen endp

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

end
Title: Re: szLen optimize...
Post by: jj2007 on March 16, 2009, 03:08:56 PM
Quote from: hutch-- on March 16, 2009, 01:36:56 PM
JJ,

The szLen algo is correct in its register usage. It only uses EAX and the stack pointer. If you have had problems using it with RichMASM it is because your register usage is non standard.


Hutch,

1. My register usage is standard,

2. I did not say that szLen was incorrect:

Quote from: jj2007 on March 15, 2009, 10:33:08 PM
I had previously and unwillingly relied on a non-documented feature of the Masm32lib szLen routine - the one that is behind the len() macro. It does preserve ecx and edx.

In contrast to most other Masm32lib functions, len() does preserve ecx and edx. But it is not documented. And when I wrote "unwillingly", it means that I had forgotten (in only one of 55 uses of len) to follow the ABI convention saying you must preserve ecx and edx yourself if you need them after an API or library call. It also means that, not knowing that len preserves ecx and edx, I reflected 54 times unnecessarily whether I needed to preserve them myself :(

"Feature" is a positive word, and there was no irony involved. For version 11, you might consider mentioning this in the documentation. It's good that a function so frequently used does preserve the registers, that's why in the end I chose to do the same in my implementation of len().

So can you please accept my friendly clap on the shoulder?
:U
Title: Re: szLen optimize...
Post by: PBrennick on March 16, 2009, 03:29:04 PM
JJ,

I am trying to understand so please help. Do you mean that szLen preserves ECX and EDX by virtue of the fact it does not use them? I am a little confused here.

Paul
Title: Re: szLen optimize...
Post by: ToutEnMasm on March 16, 2009, 03:40:46 PM

There is only one rule,esi edi and ebx must be preserved when a proc used them.That's all.
If your code use others registers than this one ,you must preserve them ,before a call to a subroutine.
If he don't made this,modify your code ,not the subroutine.It's a bad practice.
Title: Re: szLen optimize...
Post by: jj2007 on March 16, 2009, 03:47:09 PM
Quote from: ToutEnMasm on March 16, 2009, 03:40:46 PM

There is only one rule,esi edi and ebx must be preserved when a proc used them.That's all.
If your code use others registers than this one ,you must preserve them ,before a call to a subroutine.
If he don't made this,modify your code ,not the subroutine.It's a bad practice.


You are right, in principle. However, since I wrote code that claims to be a replacement for len() aka invoke szLen, offset My$, and since there a lots of newbies and oldbies around who might have written code that relies on this undocumented feature of szLen, I think it's better to modify the subroutine rather than the code. I have added include \masm32\include\slenSSE2.inc as line 2 of my 9,500 lines of RichMasm source, and it works perfectly. That was the goal: give SSE2 speed to an existing application without rewriting it.
Title: Re: szLen optimize...
Post by: MichaelW on March 16, 2009, 04:14:01 PM
JJ,

There are multiple procedures in the MASM32 library that like szLen alter only EAX. Why should they be documented as preserving ECX and EDX when they are following the documented register-preservation conventions? If your code is depending on EAX, ECX, or EDX to be preserved, then your register usage is non-standard by the conventions of the mainstream 32-bit x86 world.

Title: Re: szLen optimize...
Post by: jj2007 on March 16, 2009, 04:55:39 PM
Quote from: MichaelW on March 16, 2009, 04:14:01 PM
JJ,

There are multiple procedures in the MASM32 library that like szLen alter only EAX. Why should they be documented as preserving ECX and EDX when they are following the documented register-preservation conventions? If your code is depending on EAX, ECX, or EDX to be preserved, then your register usage is non-standard by the conventions of the mainstream 32-bit x86 world.


Michael,

You are right. However, my normal register usage is standard. I had a bug in my source, but I would never had noticed it if my new version of len() had not trashed edx.

However, my goal was to be compatible with the current len() implementation, and be sure that it won't break any existing code.
I invite everybody who uses the len() macro to add a few lines at the top of their biggest source:

; Test version of the len() macro: calls szLen and then deliberately
; trashes ECX and EDX - the two registers a caller must not expect to
; survive a call under the convention discussed in this thread.  Code
; that truly follows the convention keeps working unchanged.
len MACRO ptr
  invoke szLen, ptr
  xor ecx, ecx  ; trash two registers that can be legally trashed
  xor edx, edx  ; according to the convention
  EXITM <eax>   ;; macro expands to the length returned in EAX
ENDM

According to the convention, nobody should experience any problems :boohoo:

P.S.: In \masm32\include\slenSSE2.inc, I added a TestMasmVersion for those who try to assemble with ml 614 (it would assemble with 614, but the code may fail unexpectedly, so I decided to throw an error).

New code attached above.
Title: Re: szLen optimize...
Post by: ToutEnMasm on March 16, 2009, 05:13:58 PM

A little publicite for my ide,If someone had too much trouble modifying a few lines,there is a tool in my ide who can help with this.
His name is cherche (search in English). For example, he can find a word in each header file of the SDK and give a result with the name of the file and the line(s) where he found the word. A right click on the named file is enough to view the file with notepad and modify it.
The search take about,30 seconds.
There is about 1200 header files in the sdk and i haven't make a count of the lines.

Title: Re: szLen optimize...
Post by: herge on March 17, 2009, 03:20:03 PM
 Hi There:

Some interesting results:

Intel(R) Core(TM)2 Duo CPU     E4600  @ 2.40GHz (SSE4)
codesizes: strlen32s=124strlen64B=84NWStrLen=118, _strlen=66 bytes

-- test 16k, misaligned 0, 16434 bytes
strlen32s            1467 cycles
strlen64LingoB       1213 cycles
NWStrLen             1323 cycles
_strlen (Agner Fog)  2804 cycles

-- test 4k, misaligned 11, 4096 bytes
strlen32s            394 cycles
strlen64LingoB       321 cycles
NWStrLen             342 cycles
_strlen (Agner Fog)  712 cycles

-- test 1k, misaligned 15, 1024 bytes
  Masm32 lib szLen   1055 cycles
  crt strlen         618 cycles
strlen32s            114 cycles
strlen64LingoB       85 cycles
NWStrLen             113 cycles
_strlen (Agner Fog)  197 cycles

-- test 0, misaligned 0, 100 bytes
  Masm32 lib szLen   106 cycles
  crt strlen         69 cycles
strlen32s            17 cycles
strlen64LingoB       11 cycles
NWStrLen             20 cycles
_strlen (Agner Fog)  21 cycles

-- test 1, misaligned 1, 100 bytes
  Masm32 lib szLen   106 cycles
  crt strlen         105 cycles
strlen32s            17 cycles
strlen64LingoB       11 cycles
NWStrLen             18 cycles
_strlen (Agner Fog)  21 cycles

-- test 5, misaligned 5, 15 bytes
  Masm32 lib szLen   19 cycles
  crt strlen         17 cycles
strlen32s            4 cycles
strlen64LingoB       1 cycles
NWStrLen             9 cycles
_strlen (Agner Fog)  7 cycles

-- test 15, misaligned 15, 15 bytes
  Masm32 lib szLen   19 cycles
  crt strlen         16 cycles
strlen32s            3 cycles
strlen64LingoB       2 cycles
NWStrLen             10 cycles
_strlen (Agner Fog)  7 cycles
-- hit any key --


And Under Windbg I can't wait till it finishes?

See Attachment.

It's VERY, VERY SLOW!

Regards herge



[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: Mark Jones on March 17, 2009, 03:29:12 PM
Latest:

AMD Athlon(tm) 64 X2 Dual Core Processor 4000+ (SSE3)
codesizes: strlen32s=132strlen64B=84NWStrLen=118, _strlen=66 bytes

-- test 16k, misaligned 0, 16434 bytes
strlen32s            3206 cycles
strlen64LingoB       3188 cycles
NWStrLen             3198 cycles
_strlen (Agner Fog)  14239 cycles

-- test 4k, misaligned 11, 4096 bytes
strlen32s            842 cycles
strlen64LingoB       826 cycles
NWStrLen             842 cycles
_strlen (Agner Fog)  3560 cycles

-- test 1k, misaligned 15, 1024 bytes
  Masm32 lib szLen   1240 cycles
  crt strlen         843 cycles
strlen32s            254 cycles
strlen64LingoB       240 cycles
NWStrLen             255 cycles
_strlen (Agner Fog)  917 cycles

-- test 0, misaligned 0, 100 bytes
  Masm32 lib szLen   140 cycles
  crt strlen         99 cycles
strlen32s            55 cycles
strlen64LingoB       40 cycles
NWStrLen             53 cycles
_strlen (Agner Fog)  139 cycles

-- test 1, misaligned 1, 100 bytes
  Masm32 lib szLen   140 cycles
  crt strlen         109 cycles
strlen32s            58 cycles
strlen64LingoB       43 cycles
NWStrLen             56 cycles
_strlen (Agner Fog)  103 cycles

-- test 5, misaligned 5, 15 bytes
  Masm32 lib szLen   22 cycles
  crt strlen         26 cycles
strlen32s            25 cycles
strlen64LingoB       22 cycles
NWStrLen             38 cycles
_strlen (Agner Fog)  36 cycles

-- test 15, misaligned 15, 15 bytes
  Masm32 lib szLen   23 cycles
  crt strlen         21 cycles
strlen32s            24 cycles
strlen64LingoB       21 cycles
NWStrLen             40 cycles
_strlen (Agner Fog)  35 cycles
Title: Re: szLen optimize...
Post by: Jimg on March 17, 2009, 04:29:48 PM
AMD Athlon(tm) XP 3000+ (SSE1)
ERROR in StrSizeA at ebx=4096: 20535 bytes instead of 4096
ERROR in strlen32c at ebx=4096: 0 bytes instead of 4096
ERROR in strlen64B at ebx=4096: 20535 bytes instead of 4096
ERROR in NWStrLen at ebx=4096: 20535 bytes instead of 4096
codesizes: strlen32s=132strlen64B=84NWStrLen=118, _strlen=66 bytes

-- test 16k, misaligned 0, 16434 bytes
strlen32s            14573 cycles
strlen64LingoB       2782 cycles
NWStrLen             2783 cycles
_strlen (Agner Fog)  22914 cycles

-- test 4k, misaligned 11, 4096 bytes
strlen32s            3661 cycles
strlen64LingoB       3453 cycles
NWStrLen             3470 cycles
_strlen (Agner Fog)  28603 cycles

-- test 1k, misaligned 15, 1024 bytes
  Masm32 lib szLen   1574 cycles
  crt strlen         931 cycles
strlen32s            944 cycles
strlen64LingoB       225 cycles
NWStrLen             227 cycles
_strlen (Agner Fog)  1487 cycles

-- test 0, misaligned 0, 100 bytes
  Masm32 lib szLen   169 cycles
  crt strlen         112 cycles
strlen32s            125 cycles
strlen64LingoB       40 cycles
NWStrLen             51 cycles
_strlen (Agner Fog)  193 cycles

-- test 1, misaligned 1, 100 bytes
  Masm32 lib szLen   169 cycles
  crt strlen         121 cycles
strlen32s            132 cycles
strlen64LingoB       40 cycles
NWStrLen             50 cycles
_strlen (Agner Fog)  161 cycles

-- test 5, misaligned 5, 15 bytes
  Masm32 lib szLen   26 cycles
  crt strlen         29 cycles
strlen32s            40 cycles
strlen64LingoB       32 cycles
NWStrLen             38 cycles
_strlen (Agner Fog)  50 cycles

-- test 15, misaligned 15, 15 bytes
  Masm32 lib szLen   26 cycles
  crt strlen         26 cycles
strlen32s            37 cycles
strlen64LingoB       33 cycles
NWStrLen             46 cycles
_strlen (Agner Fog)  72 cycles
                                        -- hit any key --
Title: Re: szLen optimize...
Post by: jj2007 on March 17, 2009, 05:04:13 PM
Quote from: Jimg on March 17, 2009, 04:29:48 PM
AMD Athlon(tm) XP 3000+ (SSE1)
ERROR in StrSizeA at ebx=4096: 20535 bytes instead of 4096
ERROR in strlen32c at ebx=4096: 0 bytes instead of 4096
ERROR in strlen64B at ebx=4096: 20535 bytes instead of 4096
ERROR in NWStrLen at ebx=4096: 20535 bytes instead of 4096


Yeah, it's mostly SSE2 only. My strlen32s is not in the error list because it reverts to crt_strlen for SSE<2 (compare the timings :bg)
But I wonder whether it would run on an SSE1 CPU...? The instructions I used (movups, pmovmskb, pcmpeqb) seem to be SSE1 ::)

Could you please make a test by adding CheckSSE2 = 0 before the include line, i.e.

CheckSSE2 =0
include \masm32\include\slenSSE2.inc
include \masm32\macros\timers.asm

in slen_timings.asm?
Title: Re: szLen optimize...
Post by: Jimg on March 17, 2009, 06:07:48 PM
Sure-
AMD Athlon(tm) XP 3000+ (SSE1)
ERROR in StrSizeA at ebx=4096: 20535 bytes instead of 4096
ERROR in strlen32c at ebx=4096: 0 bytes instead of 4096
ERROR in strlen32s at ebx=4096: 2 bytes instead of 4096
ERROR in strlen64B at ebx=4096: 20535 bytes instead of 4096
ERROR in NWStrLen at ebx=4096: 20535 bytes instead of 4096
codesizes: strlen32s=88strlen64B=84NWStrLen=118, _strlen=66 bytes

-- test 16k, misaligned 0, 0 bytes
strlen32s            25 cycles
strlen64LingoB       2782 cycles
NWStrLen             2785 cycles
_strlen (Agner Fog)  22940 cycles

-- test 4k, misaligned 11, 0 bytes
strlen32s            29 cycles
strlen64LingoB       3453 cycles
NWStrLen             3474 cycles
_strlen (Agner Fog)  28719 cycles

-- test 1k, misaligned 15, 0 bytes
  Masm32 lib szLen   1577 cycles
  crt strlen         933 cycles
strlen32s            29 cycles
strlen64LingoB       226 cycles
NWStrLen             227 cycles
_strlen (Agner Fog)  1489 cycles

-- test 0, misaligned 0, 100 bytes
  Masm32 lib szLen   169 cycles
  crt strlen         112 cycles
strlen32s            25 cycles
strlen64LingoB       40 cycles
NWStrLen             51 cycles
_strlen (Agner Fog)  193 cycles

-- test 1, misaligned 1, 0 bytes
  Masm32 lib szLen   170 cycles
  crt strlen         122 cycles
strlen32s            29 cycles
strlen64LingoB       40 cycles
NWStrLen             50 cycles
_strlen (Agner Fog)  161 cycles

-- test 5, misaligned 5, 0 bytes
  Masm32 lib szLen   27 cycles
  crt strlen         29 cycles
strlen32s            29 cycles
strlen64LingoB       32 cycles
NWStrLen             38 cycles
_strlen (Agner Fog)  50 cycles

-- test 15, misaligned 15, 0 bytes
  Masm32 lib szLen   26 cycles
  crt strlen         25 cycles
strlen32s            29 cycles
strlen64LingoB       33 cycles
NWStrLen             46 cycles
_strlen (Agner Fog)  72 cycles
                                        -- hit any key --
Title: Re: szLen optimize...
Post by: jj2007 on March 17, 2009, 06:19:56 PM
Quote from: Jimg on March 17, 2009, 06:07:48 PM
Sure-

Thanks. Remarkably fast, and remarkably incorrect :green
Title: Re: szLen optimize...
Post by: ToutEnMasm on March 17, 2009, 08:00:53 PM

Viewing the post,I see that he can be a possible problem with the location of the proc.
Proc with SSE code must be in a separate module to work.
I experiment with this,And find it as the only soluce.
That is put all the sse code in the slensse2.inc.
I don't know why it is like that ( i have ml 9.0),but i am certain of what is the problem.

a useful macro can be also added to the slensse2.inc.

Quote
numeroversion equ < @Version>
IF numeroversion LT 615
   %ECHO MASM numeroversion impossible de compiler SSE2
   .ERR  <Version Masm must be at least 6.15 to compile SSE2>
ENDIF
Title: Re: szLen optimize...
Post by: jj2007 on March 17, 2009, 08:43:39 PM
Quote from: ToutEnMasm on March 17, 2009, 08:00:53 PM

Viewing the post,I see that he can be a possible problem with the location of the proc.
Proc with SSE code must be in a separate module to work.
I experiment with this,And find it as the only soluce.
That is put all the sse code in the slensse2.inc.
I don't know why it is like that ( i have ml 9.0),but i am certain of what is the problem.
strlenSSE2.asc uses SSE2 code in the main module and the slenSSE2.inc. JimG has an old SSE1 CPU - I tried to dig out my oldest 6 year old puter, but it's SSE2 already. Nonetheless I have a suspicion that the algo could work with SSE1 - but I cannot test it...

Quote
a useful macro can be also added to the slensse2.inc.

Quote
numeroversion equ < @Version>
IF numeroversion LT 615
   %ECHO MASM numeroversion impossible de compiler SSE2
   .ERR  <Version Masm must be at least 6.15 to compile SSE2>
ENDIF

From the package (http://www.masm32.com/board/index.php?topic=1807.msg81400#msg81400)  (downloaded 10 times right now):

; Assembly-time guard: abort the build with a hard error when assembling
; with ml.exe version 6.14 (@Version = 614), which predates SSE2 support.
TestMasmVersion MACRO
  ifidn @Version, <614>         ;; only the 614 assembler is rejected
echo ####################################################
echo
echo You cannot use the SSE2 library with ml.exe version 614, sorry
echo
echo ####################################################
.err                            ;; force the assembly to fail
  endif
ENDM
...
.code
TestMasmVersion


But thanks anyway, ToutEnMasm :U
Title: Re: szLen optimize...
Post by: Jimg on March 17, 2009, 08:44:49 PM
Is this better?
Timings for strlen32s:

25      cycles for len=3
29      cycles for len=3


Timings for Masm32lib szLen:

27      cycles for len=15
24882   cycles for len=16384

                                -- hit any key --
Title: Re: szLen optimize...
Post by: herge on March 17, 2009, 09:50:13 PM
 
Hi All:

A picture of some slow response on my computer while

debugging with winDbg. It would appear it's hanging on

the CPUID instruction, and taking it's sweet time.

See the pretty picture Note the Very Large Cycle times.

Also note strslensse2 and WinDbg Cpu time Useage.

Regards herge

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: jj2007 on March 17, 2009, 10:21:37 PM
Quote from: Jimg on March 17, 2009, 08:44:49 PM
Is this better?


I am afraid the string lengths should be the same as for szLen...
Still trying to find a reliable database giving info which SSE version corresponds to which instruction.   This file (http://stuff.mit.edu/afs/athena/software/nasm_v2.02/info/nasm.info-9) documents NASM, but it's not that clear. ::)
Title: Re: szLen optimize...
Post by: PBrennick on March 17, 2009, 10:35:24 PM
JJ,

Take a look at my Opcode Database Project. SSE2 instructions are listed as same. SSE1 instructions are listed as SSE. It is not a fancy app but it has the info you need.

hth,
Paul


[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: jj2007 on March 17, 2009, 10:44:09 PM
Quote from: PBrennick on March 17, 2009, 10:35:24 PM
JJ,

Take a look at my Opcode Database Project. SSE2 instructions are listed as same. SSE1 instructions are listed as SSE. It is not a fancy app but it has the info you need.

hth,
Paul


Thanks, Paul, much appreciated. The problem is indeed that pcmpeqb and pmovmskb exist as MMX and SSE2 versions. Which means that JimG has no luck - his SSE1 CPU does not throw an exception, but it cannot interpret the 66h prefix... sorry!
Title: Re: szLen optimize...
Post by: ToutEnMasm on March 18, 2009, 07:32:31 AM

Take care with the entry point of your code.
The slenSSE2.inc that repeat .686 .data and so on,is a bad thing.
Title: Re: szLen optimize...
Post by: jj2007 on March 18, 2009, 07:47:04 AM
Quote from: ToutEnMasm on March 18, 2009, 07:32:31 AM

Take care with the entry point of your code.
The slenSSE2.inc that repeat .686 .data and so on,is a bad thing.


Why?
Title: Re: szLen optimize...
Post by: ToutEnMasm on March 18, 2009, 08:06:08 AM

This explain why there is bad results given by the function .
This explain also why the program seems to works slowly.
I have tested that with windbg.Put your include file with code in the section code (The slenSSE2.inc  was in the declare section) ,without the repeat of .686...,and you will have a code that run faster and don't give random results.
That only the fact of an undeterminate entry point,that can be solve randomlly on various machines.
Title: Re: szLen optimize...
Post by: BlackVortex on March 18, 2009, 08:50:10 AM
Quote from: herge on March 17, 2009, 09:50:13 PM

Hi All:

A picture of some slow response on my computer while

debugging with winDbg. It would appear it's hanging on

the CPUID instruction, and taking it's sweet time.

See the pretty picture Note the Very Large Cycle times.

Also note strslensse2 and WinDbg Cpu time Useage.

Regards herge
Maybe use OllyDbg ?    :U
Title: Re: szLen optimize...
Post by: herge on March 18, 2009, 08:55:59 AM
 Hi  BlackVortex:

I tried to download ollydebug three times and all you get

is a corrupt Zip file. If you can't download it in one piece it's

Not going to be used!

Regards herge

Title: Re: szLen optimize...
Post by: BlackVortex on March 18, 2009, 09:33:42 AM
Quote from: herge on March 18, 2009, 08:55:59 AM
Hi  BlackVortex:

I tried to download ollydebug three times and all you get

ia a corrupt Zip file. If you can't download it one piece it's

Not going to be used!

Regards herge


http://www.ollydbg.de/odbg110.zip
This link ?  It works fine.
Title: Re: szLen optimize...
Post by: jj2007 on March 18, 2009, 09:46:56 AM
Quote from: herge on March 18, 2009, 08:55:59 AM
I tried to download ollydebug three times and all you get is a corrupt Zip file.

Try this link (http://www.ollydbg.de/odbg200i.zip) with Firefox and IE. For me, it always works fine (with both browsers).

Just for fun, I also downloaded WinDbg, >17 MB, and tried it. The user interface is disgusting, but it has no problem with the CPUID opcode. Googling for WinDbg CPUID is not very successful, either, so it might be something specific to your CPU ::)

Re entry points etc:

include \masm32\include\masm32rt.inc
include \masm32\include\slenSSE2.inc
txt50 equ <"Just some stüpid text containing exäctly 50 bytes ">
.data
szTest_1 db "My short string", 0
...
.code
start:
tmp$ CATSTR <chr$("Timings for >, StrLenAlgo$, <:")>
print tmp$, 13, 10, 10


I cannot see what could possibly wrong here, and more specifically I would like to see an example where the entry point is being determined by the machine rather than the code. What I see in Olly is that the SSE2 code starts at 00401000 (start of code section), while execution starts at 004010AC, called <ModuleEntryPoint>. That works fine, many coders put procedures before start in order to save the PROTO's.

Reviewing the posts above, it seems that the slenSSE2.inc works fine unless
a) you have a CPU that does not support SSE2 or
b) you use WinDbg and get hung at the CPUID instruction.

Is that correct? Is there any case where the code did not work properly on an SSE2 machine in normal (non-debugged) execution?
Title: Re: szLen optimize...
Post by: herge on March 18, 2009, 10:05:12 AM
 Hi jj2007:

Volume 2A:
Instruction Set Reference, A-M

NOTE: The Intel 64 and IA-32 Architectures Software Developer's Manual
consists of five volumes: Basic Architecture, Order Number 253665;
Instruction Set Reference A-M, Order Number 253666; Instruction Set
Reference N-Z, Order Number 253667; System Programming Guide,
Part 1, Order Number 253668; System Programming Guide, Part 2,
Order Number 253669. Refer to all five volumes when evaluating your
design needs.

Order Number: 253666-029US

CPUID—CPU Identification
Description
The ID flag (bit 21) in the EFLAGS register indicates support for the CPUID instruction.
If a software procedure can set and clear this flag, the processor executing the
procedure supports the CPUID instruction. This instruction operates the same in non-
64-bit modes and 64-bit mode.
CPUID returns processor identification and feature information in the EAX, EBX, ECX,
and EDX registers.1 The instruction's output is dependent on the contents of the EAX
register upon execution (in some cases, ECX as well). For example, the following
pseudocode loads EAX with 00H and causes CPUID to return a Maximum Return
Value and the Vendor Identification String in the appropriate registers:


Go to Intel!

You want CPUID page 228 thru 261.
VOL 2A 3-180 > VOL 2A 3-213



It's about thirty-three pages long and this
manual has 812 pages.

You also need adobe Reader to read it.

Regards herge
Title: Re: szLen optimize...
Post by: jj2007 on March 18, 2009, 10:37:58 AM
Herge, there is no problem with the CPUID instruction. It seems you have a very specific problem with your machine. Can you WinDbg the sample posted here (http://www.masm32.com/board/index.php?board=2;topic=11061.1#msg81562) (SSE2 but totally unrelated to szLen), and maybe insert just for fun the CPUID code to see if it makes any difference?

start:
   pushad
   push 1
   pop eax
   db 0Fh, 0A2h   ; cpuid 1
   xor eax, eax
   xor esi, esi
   bt edx, 25      ; edx bit 25, SSE1
   adc eax, esi
   bt edx, 26      ; edx bit 26, SSE2
   adc eax, esi
   bt ecx, esi      ; ecx bit 0, SSE3 (esi=0)
   adc eax, esi
   bt ecx, 9      ; ecx bit 9, SSE4
   adc eax, esi
   mov Win$, alloc$(1000000)
Title: Re: szLen optimize...
Post by: herge on March 18, 2009, 10:43:32 AM
 Hi jj2007:

It's got something to do with windbg and my computer.

It does not crash, but it is slow if you t or p a CPUID
instruction. It will take most of the day to run.

All cycles times are seven digits long.

The g works great!


00401325 6880000000       push    0x80
0040132a 50               push    eax
0040132b e82e320000       call    strslensse2!SetPriorityClass (0040455e)
strslensse2!start+0x1ab [C:\Program Files\Microsoft Visual Studio 8\VC\bin\strslensse2.asm @ 235]:
00401330 33c0            xor     eax,eax
00401332 0fa2            cpuid <<;; Don't t or P a CPUID
00401334 0f31            rdtsc
00401336 52              push    edx
00401337 50              push    eax
00401338 c705dcb7400020a10700 mov dword ptr [strslensse2!__counter__loop__counter__ (0040b7dc)],7A120h
00401342 33c0            xor     eax,eax
00401344 0fa2            cpuid



Regards herge
Title: Re: szLen optimize...
Post by: herge on March 18, 2009, 11:03:56 AM
 Hi jj2007:

I had no problems with CountLinesSSE2.exe
in WinDbg. I have two versions of Windbg and
both choke on CPUID.

Regards herge
Title: Re: szLen optimize...
Post by: herge on March 18, 2009, 11:46:13 AM
 Hi All:

Well we finally got OllyDEbug with Firefox and my internet radio OFF.
Olly ran Strslensse2.exe with no problem.
So it's either my computer or Windbg, or it's some
software I am running.

Regards herge
Title: Re: szLen optimize...
Post by: ToutEnMasm on March 18, 2009, 12:38:30 PM

To jj2007,
Quote
and more specifically I would like to see an example where the entry point is being determined by the machine rather than the code.

Perhaps did you search a machine wo write the code at your place ?.
The Entry point is always fixed by the code.

i will repeat  my upper post about the soluce.

Quote
You could'nt include code in the declare section
the includelib is just read by the linker and code is added at link time.

No need of special debugger or special machine to run SSE2 instructions.


consoles applications are irrelevent because they are finished before start when lauched with windows.Your one don't make that because there is bad writing in it.

I use windbg and he works perfectly with well written code.
Title: Re: szLen optimize...
Post by: herge on March 18, 2009, 01:57:55 PM

Hi All:

And the loser at 04.75 hrs is strslensse2.exe with WinDbg.

See Attachment.

Regards herge

[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: BlackVortex on March 18, 2009, 02:23:21 PM
@ herge

Why do you insist on using WinDbg ?  It's next to useless (except maybe as a system debugger)
Title: Re: szLen optimize...
Post by: jj2007 on March 18, 2009, 02:48:56 PM
Quote from: ToutEnMasm on March 18, 2009, 12:38:30 PM

To jj2007,
Quote
and more specifically I would like to see an example where the entry point is being determined by the machine rather than the code.

Perhaps did you search a machine wo write the code at your place ?.
The Entry point is always fixed by the code.

YES, that's perfectly correct. I knew that before, but a certain ToutEnMasm insisted that machines might fumble with the entry point:

Quote from: ToutEnMasm on March 18, 2009, 08:06:08 AM
That only the fact of an undeterminate entry point,that can be solve randomlly on various machines.

Quote
No need of special debugger or special machine to run SSE2 instructions.
You need a "special machine" that is capable of SSE2. If you had read the source, you would have discovered the macro that throws an error if you try to assemble it on an SSE1 machine. And there is run-time check in my code that reverts to crt_strlen if SSE1 is detected. That should be fool-proof, right?

Quote
consoles applications are irrelevent because they are finished before start when lauched with windows.Your one don't make that because there is bad writing in it.

Your phrase does not make sense at all, probably a language problem. Please explain, and use a code example.

SlenSSE2.inc works perfectly with console and GUI applications. The only change I had to make to my 9,500 lines RichMasm source was one line:
include \masm32\Gfa2Masm\Gfa2Masm.inc
include \masm32\include\slenSSE2.inc

Quote
I use windbg and he works perfectly with well written code.
Me too. It works perfectly well with all my code.
Title: Re: szLen optimize...
Post by: jj2007 on March 18, 2009, 07:43:34 PM
Quote from: herge on March 18, 2009, 01:57:55 PM
And the loser at 04.75 hrs is strslensse2.exe with WinDbg.

Impressing :bg

And I am very glad that for the 100 byte strings my algo is over 1000 cycles faster than Lingo's :cheekygreen:
Title: Re: szLen optimize...
Post by: herge on March 18, 2009, 07:46:05 PM
 Hi jj2007:

A small sugestion:
To protect against operator stupidity.


   ENDM
@@:    
   inkey chr$(9, 9, 9, 9, 9, "-- Hit X Key --")
   cmp AL,"X"
   jnz @B
   exit


But make sure the inkey MACRO in C:\masm32\macros\macros.asm
is updated.


inkey MACRO user_text:VARARG
     ;; Prompt (user text, default text, or nothing when "NULL"), wait
     ;; for a keypress, then print CRLF. This is herge's patched version:
     ;; the push/pop around the CRLF print keeps the key code returned by
     ;; wait_key alive in eax, where the stock macro let print clobber it
     ;; (eax always ended up 2, the length of the CRLF just printed).
     IFDIF <user_text>,<NULL>                  ;; if user text not "NULL"
       IFNB <user_text>                        ;; if user text not blank
         print user_text                       ;; print user defined text
       ELSE                                    ;; else
         print "Press any key to continue ..." ;; print default text
       ENDIF
     ENDIF
     call wait_key                             ;; key code returned in eax
     push eax;; < Note push
     print chr$(13,10)                         ;; newline after the keypress
     pop eax;; < Note pop
   ENDM



Note the push and pop, it was
always returning 2 the length
of CRLF which explains why
a CMP AL,? was always
failing after a inkey call.

Regards herge
Title: Re: szLen optimize...
Post by: jj2007 on March 18, 2009, 08:00:04 PM
Quote from: herge on March 18, 2009, 07:46:05 PM
Hi jj2007:
A small sugestion:

Access violation when reading [herge's suggestion] - Shift+Run/Step to pass exception to the owner of the Masm32 macros :wink
Title: Re: szLen optimize...
Post by: herge on March 18, 2009, 08:08:05 PM
 
Hi jj2007:

I know the inkey MACRO has nothing to do with you.
But my suggestion will not work with the present
inkey MACRO which I suspect is not working
right at present.

I did mention it in another forum.

Regards herge
Title: Re: szLen optimize...
Post by: PBrennick on March 18, 2009, 09:40:33 PM
herge,

The inkey macro works correctly within the boundaries of what it was designed to do. The inkey function should not be expected to function in a polcat sort of way.

Use the getkey macro which calls ret_key if you expect to receive a value.

JJ,
About CPUID, not all parameters of this instruction are supported on all CPUs. CPUID should be called with EAX = 0 first, as this will return the highest calling parameter that the CPU supports. To obtain extended function information CPUID should be called with bit 31 of EAX set. To determine the highest extended function calling parameter, call CPUID with EAX = 80000000h. If the particular parameter you are trying to use is higher than that number, then report this to the user and do not use the instruction.

You reported in an earlier posting that you had trouble with CPUID, now you know why.

Paul
Title: Re: szLen optimize...
Post by: jj2007 on March 19, 2009, 06:41:36 AM
Quote from: PBrennick on March 18, 2009, 09:40:33 PM
JJ,
About CPUID, not all parameters of this instruction are supported on all CPUs.

You reported in an earlier posting that you had trouble with CPUID, now you know why.

Paul

Paul,

1. Open \masm32\include\slenSSE2.inc in GeneSys.exe
2. Search for CPUID
3. All you will find is:
; -------------------------------------------------------------------
; ChkSSE2 - runtime SSE-level detection via CPUID leaf 1.
; Out:   MbSSE2 (global) = number of SSE feature bits found, i.e.
;        0 = none, up to 4 when the SSE1/SSE2/SSE3/"SSE4" bits are
;        all set (meaningful as a level only because each SSE level
;        implies the previous ones on real CPUs).
; All general-purpose registers are preserved (pushad/popad); cpuid
; itself clobbers eax/ebx/ecx/edx inside the saved window.
; NOTE(review): per the Intel SDM, ECX bit 9 is SSSE3, while SSE4.1
; is ECX bit 19 -- the "SSE4" label below looks inaccurate; confirm.
; -------------------------------------------------------------------
ChkSSE2 proc         ; exactly 40 bytes
   pushad            ; save all GP registers
   push 1
   pop eax           ; eax = 1: feature-information CPUID leaf (2 bytes vs mov)
   db 0Fh, 0A2h   ; cpuid 1
   xor eax, eax      ; eax = running SSE-level counter
   xor esi, esi      ; esi = 0, used as the zero operand so adc adds CF only
   bt edx, 25      ; edx bit 25, SSE1
   adc eax, esi      ; eax += CF (1 if SSE1 present)
   bt edx, 26      ; edx bit 26, SSE2
   adc eax, esi      ; eax += CF
   bt ecx, esi      ; ecx bit 0, SSE3 (esi=0)
   adc eax, esi      ; eax += CF
   bt ecx, 9      ; ecx bit 9, SSE4
   adc eax, esi      ; eax += CF
   mov MbSSE2, eax   ; publish the detected level in the global flag
   popad
   ret
ChkSSE2 endp

As you can see, none of the extended functions are being used.

4. Herge's program works perfectly when launched normally. It's WinDbg that has a problem, not my code.
Title: Re: szLen optimize...
Post by: herge on March 19, 2009, 09:15:48 AM
 Hi jj2007:

Your code is great!

But when I try to trace or Proceed a

CPUID

instruction. Windbg take's its sweet time.

The Go in Windbg works great. But on my

computer, Something weird is going on.

It's either my hardware or software

running on my computer.

Your program has always worked from a Dos

Box. [Command Prompt]

Regards herge
Title: Re: szLen optimize...
Post by: PBrennick on March 19, 2009, 11:40:54 AM
JJ,
Your implementation looks fine to me. Looks like he should ditch that debugger. There have always been issues with it, anyway. A few years back there was a thread about the pros and cons of it and it seems lots of people have had negative experiences with it.

By the way, I was not bashing your code, I was just wondering if his CPU has some limitations. Option number one is handled by all CPUs as far as I know, however.

Paul
Title: Re: szLen optimize...
Post by: herge on March 20, 2009, 02:50:36 AM
 Hi Paul:

What ever the problems with windbg, at least I can see my

code, I have a lot of trouble seeing the code in Ollly and have

not found out how to change it's font size.

Regards herge
Title: Re: szLen optimize...
Post by: Jimg on March 20, 2009, 02:25:20 PM
One of the secrets to tracing source code in Olly, is do NOT use the /Fl options when assembling.  It messes something up.

I usually use - /c /coff /Cp /nologo /Zi /Zd

and link - /SUBSYSTEM:WINDOWS /DEBUG /DEBUGTYPE:CV /INCREMENTAL:NO


To change the font of the source window, right click in the window, select appearance and font
Also you can change the font in the menu  Options/Appearance/Fonts
Title: Re: szLen optimize...
Post by: herge on March 20, 2009, 02:36:05 PM
 Hi Jimg:

1. Edit ollydbg.ini
2. Replace the last three lines with this:
Font name[7]=Font Herge
Font[7]=20,0,600,0,0,0,1,2,5,0,0
Face name[7]=Arial
3. Save and restart Olly
4. Right-click, choose Appearance/Font/Font Herge


jj2007 told me how to fix it.
Thanks jj2007.

Regards herge
Title: Re: szLen optimize...
Post by: BeeOnRope on March 31, 2009, 08:32:37 PM
Quote from: hutch-- on March 11, 2009, 10:21:19 AM
Years of reading posts leave you with a reasonably good idea of the value of an "atom cracking" string length algo. Let me think, "As useful as a hip pocket in a singlet", wjhat about the world's fastest "MessageBoxA" algo ? How about a hobbling horse in the Kentucky Derby ?  :P

Quote from: hutchNo I don't, I have been watching musical chairs on string length algos for at least the last 10 years, in about 99.9999999999999999999999999% of cases the slow byte scanner is more than fast enough and in the .0 --- 0001% of other cases Agner Fog's algo is even more than fast enough. Speed is greate but it must also be useful gains and string length algos are rarely ever a big deal.

I'm quite surprised by this statement.  I have been involved in writing and profiling enterprise software for years, and the str* functions are repeatedly found as some of the highest CPU users in various bits of code.  Sure, it is not going to be an issue for an MPEG encoder, but for applications that handle user input, communication with other components, whatever, I've often seen these functions be the bottleneck.  Using better string functions in cases has resulted in massive improvements in some workflows - even some we didn't know would be affected ahead of time.

Sure, you could argue that strlen itself is kind of useless, since at least someone knew the length (at creation, for example) and this length could be passed around rather than using strlen, but the realities of software engineering, such as interop with other components, use of existing APIs, legacy code, and so on mean that it is useful in practice.  Other functions, such as strcpy are useful both in theory and in practice since they cannot be optimized away (unlike strlen, arguably). 

Saying that the str* functions are useless is like arguing that memcpy and friends aren't important either - since for many programs the former are used more than the latter.
Title: Re: szLen optimize...
Post by: NightWare on March 31, 2009, 09:49:15 PM
Quote from: BeeOnRope on March 31, 2009, 08:32:37 PM
Saying that the str* functions are useless is like arguing that memcpy and friends aren't important either - since for many programs the former are used more than the latter.
?
strlen algos are never used intensively (anway, not like memcopy) in a serious app, so the comparison is totally inappropriate. plus, if you code YOUR functions correctly (and stop using stupid win APIs), YOU DON'T NEED those algos, coz you "should" return the size with your function with a simple sub instruction... just for info, in ALL my sources i've used a strlen algo just ONCE, and only because i'm too lazy to update a counter, and because speed is not essential... i don't know of what your years of writing consist of, but you have things to learn... seriously...
Title: Re: szLen optimize...
Post by: MichaelW on March 31, 2009, 10:04:04 PM
Quote from: BeeOnRope on March 31, 2009, 08:32:37 PM
Sure, it is not going to be an issue for an MPEG encoder, but for applications that handle user input, communication with other components, whatever, I've often seen these functions be the bottleneck.  Using better string functions in cases has resulted in massive improvements in some workflows - even some we didn't know would be affected ahead of time.

A bottleneck for communication with other components, possibly, but the bottleneck for user input is obviously the user.


Title: Re: szLen optimize...
Post by: jj2007 on March 31, 2009, 10:17:57 PM
Quote from: NightWare on March 31, 2009, 09:49:15 PM
just for info, in ALL my sources i've used a strlen algo just ONCE

In all my sources, I use GOTO only once, and for a valid reason, but I just checked len() and found a value of about 6/kLine of code. I wouldn't mind getting rid of some of them, but it is not that easy in a general purpose app. For highly optimised graphics applications, that might be different, though.
Title: Re: szLen optimize...
Post by: hutch-- on April 01, 2009, 01:37:24 AM
Bee,

I agree that string functions generally need to be fast, particularly when you are doing complex parsing but I would hold to my original comment that almost all string length requirements are more than adequately handled by the simplest byte scanner using one register. It is very rare to use long strings (> 1 meg) and where you do have an unusual case that has to repeatedly scan strings for their length, you write a different algo. Agner Fog's 1995 DWORD algo is still a very good performer here but if your task requires it you write a dedicated string length algo that is faster.

This is my favourite type of string length algo.


; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

; -------------------------------------------------------------------
; slen - minimal one-register byte scanner for a zero-terminated
;        string (hutch's small-string favourite).
; In:    [esp+4] = address of the string
; Out:   eax     = length in bytes, terminator excluded
; Uses:  eax, ecx only; no callee-saved registers, no stack frame.
;        PROLOGUE/EPILOGUE are switched off around this proc, so the
;        argument is read straight off the stack and "ret 4" performs
;        the stdcall stack cleanup itself.
; -------------------------------------------------------------------
slen proc pstr:DWORD

    mov ecx, [esp+4]            ; ecx = start address, kept for final subtract
    mov eax, ecx                ; eax = running scan pointer
    sub eax, 1                  ; pre-decrement so the loop can add first

  @@:
    add eax, 1                  ; step to the next byte
    cmp BYTE PTR [eax], 0       ; reached the terminating zero?
    jne @B                      ; no - keep scanning

    sub eax, ecx                ; length = end pointer - start pointer

    ret 4                       ; stdcall: callee pops the single DWORD arg

slen endp

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««


A technique I regularly use when tokenising a large string is to make a copy of the string if preserving the original matters, do a one pass in place tokenise on the data overwriting the line terminator with a zero and writing the start offset of each line to an array. Now this leaves me with an array of unaligned members but the tokenising method is faster than any data copy to array method by some considerable amount.

If I then need to get the length of any or all of the tokenised strings, I use the very small one above because in most instances its takeoff time makes it faster than the bigger clunkier ones that are very fast on single long strings but hopeless on variable length unaligned short strings.
Title: Re: szLen optimize...
Post by: NightWare on April 01, 2009, 02:51:17 AM
hi hutch,

   lea eax,[ecx-1]
instead of
   mov eax,ecx
   sub eax,1
no ?
Title: Re: szLen optimize...
Post by: jj2007 on April 01, 2009, 06:14:11 AM
Quote from: hutch-- on April 01, 2009, 01:37:24 AM

... and writing the start offset of each line to an array....

If I then need to get the length of any or all of the tokenised strings


Can't you just use (offset n+1)-(offset n)-2?
Title: Re: szLen optimize...
Post by: hutch-- on April 01, 2009, 07:33:44 AM
NightWare,

Its a good mod but I tend to avoid LEA on a PIV as it is laggy. I would be interested to see if it has become faster again on a core 2 duo or quad.

JJ,

that suggestion makes sense except that you have to calculate the length reduction of either or both the CR and LF. If the task suited it your mod would be faster as the data is already present but it gets untidy if you pass the address of the tokenised string to another procedure.
Title: Re: szLen optimize...
Post by: BeeOnRope on April 01, 2009, 06:20:35 PM
Quote from: NightWare on March 31, 2009, 09:49:15 PM
Quote from: BeeOnRope on March 31, 2009, 08:32:37 PM
Saying that the str* functions are useless is like arguing that memcpy and friends aren't important either - since for many programs the former are used more than the latter.
?
strlen algos are never used intensively (anway, not like memcopy) in a serious app, so the comparison is totally inappropriate. plus, if you code YOUR functions correctly (and stop using stupid win APIs), YOU DON'T NEED those algos, coz you "should" return the size with your function with a simple sub instruction... just for info, in ALL my sources i've used a strlen algo just ONCE, and only because i'm too lazy to update a counter, and because speed is not essential... i don't know of what your years of writing consist of, but you have things to learn... seriously...

That's an interesting statement.  You just said that string algos are *never* used in a serious app, yet I have been in developing several "serious" apps, and I've seen string functions, including strlen, be the bottleneck for interesting workflows for many of them.  It's very tough to assert that something never happens when I'm saying plainly and without any particular secret motivation that I have seen exactly this in "serious" apps.

I didn't write the functions in question, rather noted the bottleneck in software developed by teams of hundreds of people - I already mentioned that in some cases it is possible to return a length (or to use a class that remembers it), but if you are interoperating with other code you may not have a choice because (a) you don't have the source (b) cannot legally modify the source (c) do not have the time to modify the source, etc.
Title: Re: szLen optimize...
Post by: BeeOnRope on April 01, 2009, 06:24:35 PM
Quote from: MichaelW on March 31, 2009, 10:04:04 PM
Quote from: BeeOnRope on March 31, 2009, 08:32:37 PM
Sure, it is not going to be an issue for an MPEG encoder, but for applications that handle user input, communication with other components, whatever, I've often seen these functions be the bottleneck.  Using better string functions in cases has resulted in massive improvements in some workflows - even some we didn't know would be affected ahead of time.

A bottleneck for communication with other components, possibly, but the bottleneck for user input is obviously the user.


Agreed - I wasn't totally clear there.  I meant dealing with text that originally came as user textual input, but is now being processed, perhaps repeatedly.  For example, string columns in a database often came originally from user input, but that happens (for example) once while the string itself may be queried, returned to clients, sorted, etc. millions of times.  In such applications string functions may be useful and performance sensitive.
Title: Re: szLen optimize...
Post by: BeeOnRope on April 01, 2009, 06:32:25 PM
Quote from: hutch-- on April 01, 2009, 01:37:24 AM
Bee,

I agree that string functions generally need to be fast, particularly when you are doing complex parsing but I would hold to my original comment that almost all string length requirements are more than adequately handled by the simplest byte scanner using one register. It is very rare to use long strings (> 1 meg) and where you do have an unusual case that has to repeatedly scan strings for their length, you write a different algo. Agner Fog's 1995 DWORD algo is still a very good performer here but if your task requires it you write a dedicated string length algo that is faster.

This is my favourite type of string length algo.


; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

; -------------------------------------------------------------------
; slen - one-register byte scanner returning the length of a
;        zero-terminated string (verbatim re-quote of hutch's algo).
; In:    [esp+4] = string address
; Out:   eax     = byte count, not including the terminator
; Uses:  eax and ecx only; PROLOGUE/EPILOGUE are disabled around the
;        proc, so the parameter is fetched manually and "ret 4" does
;        the stdcall cleanup.
; -------------------------------------------------------------------
slen proc pstr:DWORD

    mov ecx, [esp+4]            ; ecx = base address (needed to compute length)
    mov eax, ecx                ; eax = scan pointer
    sub eax, 1                  ; start one byte early; loop increments first

  @@:
    add eax, 1                  ; advance pointer
    cmp BYTE PTR [eax], 0       ; terminator?
    jne @B                      ; not yet - continue

    sub eax, ecx                ; eax = end - start = length

    ret 4                       ; pop the one DWORD argument (stdcall)

slen endp

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««


A techynique I regularly use when tokenising a large string is to make a copy of the string if preserving the original matters, do a one pass in place tokenise on the data overwriting the line terminator with a zero and writing the start offset of each line to an array. Now this leaves me with an array of unaligned members but the tokenising method is faster than any data copy to array method by some considerable amount.

If I then need to get the length of any or all of the tokenised strings, I use the very small one above because in most instances its takeoff time makes it faster than the bigger clunkier ones that are very fast on single long strings but hopeless on variable length unaligned short strings.

I agree more or less - it is rare enough in most applications to take the length of long strings repeatedly, but it definitely does happen.  This code could definitely be re-written to make it faster in many cases, but a string algo that is 10x faster in the first place will be an automatic huge win in these places without the need to thread a length through 10s or 100s of functions.  Arguably I'm preaching to the wrong crowd here - programs written entirely in assembly perhaps aren't likely to reach the scale (in terms of lines of code) where this becomes a consideration, but in the kind of software I'm interested in (HLL + selected routines in assembly) it counts.

One thing you are missing from this thread is that the "larger/clunkier" routines developed by Lingo, NW and jj are blazing fast on short and unaligned strings - much, much faster than Agner's routine, the CRT routine etc.  For example, on some 15 byte misaligned strings, Lingo's routine takes as little as 1 cycle.  That's 19x faster than the existing routine, and 7x faster than Agner's routine.

With such speeds it is arguably faster, for short strings, to not bother passing around the length, but to call strlen when needed (especially since the length may be need to 4 bytes or more, if you can accommodate larger strings, even if they are usually short in practice).
Title: Re: szLen optimize...
Post by: jj2007 on April 01, 2009, 07:07:20 PM
Quote from: BeeOnRope on April 01, 2009, 06:32:25 PM
One thing you are missing from this thread is that the "larger/clunkier" routines developed by Lingo, NW and jj are blazing fast on short and unaligned strings - much, much faster than than Agner's routine, the CRT routine etc.  For example, on some 15 byte misaligned strings, Lingo's routine takes as little as 1 cycle.  That's 19x faster than the existing routine, and 7x faster than Agner's routine.

That seems a bit too optimistic, BeeOnRope. Lingo's algo is indeed blazing fast at 4 cycles, but first, the timings may not be so accurate at that scale, and second, it will hardly matter for such short strings - there will be plenty of slower code before and after.


Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (SSE3)
codesizes: strlen32s=132strlen64B=84NWStrLen=118, _strlen=66 bytes

-- test 16k, misaligned 0, 16434 bytes
  Masm32 lib szLen   20648 cycles
  crt strlen         15255 cycles
strlen32s            2894 cycles
strlen64LingoB       2919 cycles
NWStrLen             2935 cycles
_strlen (Agner Fog)  4264 cycles

-- test 15, misaligned 15, 15 bytes
  Masm32 lib szLen   25 cycles
  crt strlen         25 cycles
strlen32s            6 cycles
strlen64LingoB       4 cycles
NWStrLen             15 cycles
_strlen (Agner Fog)  14 cycles


Quote
With such speeds it is arguably faster, for short strings, to not bother passing around the length, but to call strlen when needed (especially since the length may be need to 4 bytes or more, if you can accommodate larger strings, even if they are usually short in practice).

Laziness is indeed a valid argument - codesize maybe not. A harmless mov eax, len(offset MyString) costs 12 bytes...
Title: Re: szLen optimize...
Post by: lingo on April 01, 2009, 10:01:46 PM
Hutch,
don't use ecx in strlen because jj will improve your code
and  preserve his lovely "count" register ecx. :wink
After that he'll post a message that "his" code is faster... :lol

Title: Re: szLen optimize...
Post by: jj2007 on April 01, 2009, 10:12:26 PM
Quote from: lingo on April 01, 2009, 10:01:46 PM
Hutch,
don't use ecx in strlen because jj will improve your code
and  preserve his lovely "count" register ecx. :wink
After that he'll post a message that "his" code is faster... :lol


... and you will reply with a 1% faster SSE8 version, hehe :green2

Lingo, I have changed philosophy completely. No more preserving of count registers! No, just the opposite: How can one set all registers to zero (http://www.masm32.com/board/index.php?topic=11138.msg82449#msg82449) with less than 12 bytes in less than 5 cycles. I swear it's more exciting than solving crosswords. All the gurus here have already contributed a solution. We are waiting for you :thumbu
Title: Re: szLen optimize...
Post by: tetsu-jp on April 01, 2009, 10:28:39 PM

memory_search proc address:dword,data:dword,limit:dword
;returns position of char in eax.
;returns limit if char was not found.
;the limit can be set to avoid buffer overrun
; NOTE(review): repne scasb decrements ecx for the matching byte as
; well, so on a hit eax = limit-ecx is the 1-based position (index+1),
; not the 0-based offset the header comment suggests. A match on the
; very last byte also yields eax = limit, making it indistinguishable
; from "not found". Callers should confirm which convention they expect.
push ecx                        ; preserve caller's ecx
push edi                        ; preserve caller's edi (scasb advances it)
mov ecx,limit                   ; ecx = maximum number of bytes to scan
mov eax,data                    ; al = byte value to search for
mov edi,address                 ; edi = start of the buffer
repne scasb                     ; scan while [edi] != al and ecx > 0

mov eax,limit
sub eax,ecx                     ; eax = bytes consumed by the scan
pop edi
pop ecx
ret
memory_search endp


the point was to prevent the program from locking up on errand strings.

i know the REP prefix is long deprecated...how slow is it actually compared to the other implementations?

would it improve to have a sufficient number of extra zero's, and to use SCASD?

real-world code needs to prevent lock-up/overrun by all means. so to be fair, you'd also have to add test code to the other string functions.

http://www.azillionmonkeys.com/qed/asmexample.html
http://www.visionx.com/markl/optimization_tips.htm
http://forums.appleinsider.com/archive/index.php/t-27572.html
Title: Re: szLen optimize...
Post by: BeeOnRope on April 01, 2009, 10:32:29 PM
Quote from: jj2007 on April 01, 2009, 07:07:20 PM
Quote from: BeeOnRope on April 01, 2009, 06:32:25 PM
One thing you are missing from this thread is that the "larger/clunkier" routines developed by Lingo, NW and jj are blazing fast on short and unaligned strings - much, much faster than than Agner's routine, the CRT routine etc.  For example, on some 15 byte misaligned strings, Lingo's routine takes as little as 1 cycle.  That's 19x faster than the existing routine, and 7x faster than Agner's routine.

That seems a bit too optimistic, BeeOnRope. Lingo's algo is indeed blazing fast at 4 cycles, but first, the timings may not be so accurate at that scale, and second, it will hardly matter for such short strings - there will be plenty of slower code before and after.


Agreed - 1 cycle is not realistic and likely reflects overlapping of loops by the CPU which won't like occur in practice in real code.  Still - the point remains, the "clunky" routines test as significantly faster for short or misaligned strings than the simpler routines, contradicting Hutch's assertion that this quest is misguided because these routines will only work well for giant, aligned strings.


Quote
Laziness is indeed a valid argument - codesize maybe not. A harmless mov eax, len(offset MyString) costs 12 bytes...

Actually I was referring to data size, not code size in this case.  Imagine 1 million length 0 strings - you might use 4 MB with length-prefixed strings, compared to 1 MB with null terminated strings.  Don't get me wrong, I'm nearly always in favor of explicit length for stings, but the termination technique can be appealing for many very short strings.
Title: Re: szLen optimize...
Post by: NightWare on April 01, 2009, 10:35:18 PM
Quote from: hutch-- on April 01, 2009, 07:33:44 AM
I would be interested to see if it has become faster again on a core 2 duo or quad.
concerning lea, i've never seen difference between my p3-500/celeron-700/P4-2Ghz/Core2-2Ghz
(my P4 was a northwood i think..., maybe there is difference with a prescot)

Quote from: BeeOnRope on April 01, 2009, 06:20:35 PM
That's an interesting statement.  You just said that string algos are *never* used in a serious app, yet I have been in developing several "serious" apps, and I've seen string functions, including strlen, be the bottleneck for interesting workflows for many of them.  It's very though to assert that something never happens when I'm saying plainly and without any particular secret motivation that I have seen exactly this in "serious" apps.

I didn't write the functions in question, rather noted the bottleneck in software developed by teams of hundreds of people - I already mentioned that in some cases it is possible to return a length (or to use a class that remembers it), but if you are interoperating with other code you may not have a choice because (a) you don't have the source (b) cannot legally modify the source (c) do not have the time to modify the source, etc.
here the problem come from the approach, most coders "think" in term of tasks, and in the contrary they should have a global approach. i don't know a case where updating a counter from time to time is slower than process an entire area, especially a large one.
(hundred/thousand/million people doing the same thing does not mean they're right, especially when they've been paid to quickly produce a "serious" app. it's a well known fact, you only obtain the product you have paid for. and asking coders to respect time limits certainly not encourage them to "think their code").

Quote from: jj2007 on April 01, 2009, 07:07:20 PM
A harmless mov eax, len(offset MyString) costs 12 bytes...
correct, the code size isn't for free, it's ok for fast algo, but only if you can be sure it's maintained in the cache (and the larger the code is, the more often the cache will be updated).
Title: Re: szLen optimize...
Post by: BeeOnRope on April 02, 2009, 12:31:54 AM
Quote from: NightWare on April 01, 2009, 10:35:18 PM

here the problem come from the approach, most coders "think" in term of tasks, and in the contrary they should have a global approach. i don't know a case where updating a counter from time to time is slower than process an entire area, especially a large one.
(hundred/thousand/million people doing the same thing does not mean they're right, especially when they've been paid to quickly produce a "serious" app. it's a well known fact, you only obtain the product you have paid for. and asking coders to respect time limits certainly not encourage them to "think their code").


Sure, that approach is fine for a monolithic application written by one or a few people (most assembly-only programs will fall into this category).  In practice, with components being written by teams around the globe with differing delivery schedules, coding styles, etc, it is important to think in terms of self-contained components, tasks, APIs, whatever.  It simply isn't possible for any one person to have the whole end-to-end workflow or code-flow in his mind at once.  If you don't believe this, you have never worked on a large, distributed software project.

Even if you don't believe it, it doesn't answer the point about interaction with legacy or proprietary APIs that you cannot change.
Title: Re: szLen optimize...
Post by: hutch-- on April 02, 2009, 06:02:00 AM
The problem as I see it is the "one size fits all" approach. Having written libraries for many years commercially back in the 90s I am as a matter of fact familiar with distributed projects but I am also familiar with their defects, the risk of being a headless monster screwed together with a multitude of compromises to fit the deviations of opinion, technique and disposition of the sum total of its contributors, a situation that is something like the corporate decision making process but with a random factor added.

I don't see any problem at all with keeping a dozen different routines to do similar tasks and simiply dial up the one that best fits your need if in fact any of them will fit the need, the alternative is to write another that in fact does do exactly what you need.
Title: Re: szLen optimize...
Post by: ecube on April 02, 2009, 06:14:01 AM
I put together a library of all the fastest procedures i've found on here awhile ago, under stress testing and even general use a lot of them messed up on me a lot(great deal of the procedures being loco's code, since he's won most of the speed contests). I've come to love the slower yet reliable code :) old trusty, heh.

and on to more pressing business, Hutch I don't know if you've noticed but you have the devil sign in your post count and your names in red in the user list... :'( 
Title: Re: szLen optimize...
Post by: jj2007 on April 02, 2009, 08:37:22 AM
Quote from: E^cube on April 02, 2009, 06:14:01 AM
I put together a library of all the fastest procedures i've found on here awhile ago, under stress testing and even general use a lot of them messed up on me a lot

Interesting. Post some example code, please.
Title: Re: szLen optimize...
Post by: hutch-- on April 02, 2009, 10:36:40 AM
Cube,

In 3 posts time I will have four (4) sixes as my post count, I wonder if that encapsulates both old Nick AND his sidekick ?  :bg
Title: Re: szLen optimize...
Post by: donkey on April 02, 2009, 11:08:20 AM
Quote from: hutch-- on April 02, 2009, 10:36:40 AM
AND his sidekick ?  :bg

Not sure what George W Bush has to do with posting here  :bg
Title: Re: szLen optimize...
Post by: tetsu-jp on April 02, 2009, 02:43:04 PM
add Bill Gates ASCII and you get 666, as well MS DOS 6.22 and Windows 3.11!

so the name, DOS and Windows version have been adjusted to sum up to 666. it is known for many years.

for the topic, I've made such a test program in 1997 (won't you guess, it is long lost).

different methods for memory copy! it was a 80386 SLC, some obscure variant.

now, alignment was absolutely irrelevant to performance.

and also 16 bytes, then 16K bytes, is no good testing.

you must test 256 bytes, 1K, 4K (common cache size/page size).

and then you must differentiate:

-linear access within a page
-linear access within L1/L2 cache
-always accessing the same location (cheating the cache)
-random access within a page
-random access within L1/L2 chaches
-long range random access

if you don't implement all this, your test algorithm is more than questionable.

and I've read the new AMD manuals, the REP SCAS is explicitely recommended for small strings!

so, would you include it, and show the result for REP SCASB as well?

if you have extra time, implement all data sizes: 8 bit to 64 bit, and alignment as well.
Title: Re: szLen optimize...
Post by: jj2007 on April 02, 2009, 05:23:48 PM
Quote from: tetsu-jp on April 02, 2009, 02:43:04 PM

and I've read the new AMD manuals, the REP SCAS is explicitely recommended for small strings!

so, would you include it, and show the result for REP SCASB as well?


Mr tetsu,

I am afraid my knowledge of assembler is not sufficient to implement a strlen algo based on rep scasb. But we have all read your posts with great interest, and are eager to see how you would do it. Could you post a snippet, please? We know that all your sources got lost, but maybe out of your head, or with the help of the AMD manual?

Thank you so much.
Title: Re: szLen optimize...
Post by: tetsu-jp on April 02, 2009, 05:34:52 PM
Thanks, on occasion, i allow people simply to call me "Alex".

I'm really honestly interested in such a banchmark, because i wrote such a program in 1997 myself.

Yes unfortunately due to my life circumstances, i have lost many source codes.

this is what i wrote 5 years ago to get string length: http://www.masm32.com/board/index.php?topic=1807.msg82540#msg82540

And i am thinking to write a benchmark (again), for strlen, memcopy and the like,

including 64bit!

I'm not assembly professional, let say, intermediate, the largest source i've ever produced was about 300K.

the purpose to visit the forum is to improve my skills, among having some fun!

so I could really write a benchmark using MASM, if people ask me to do it.

simply cheating the cache, always accessing the same string, is not serious testing.

there was IBM service program, it has done testing upwards, downwards, in certain steps, backwards, random, and twenty other options!
I don't think they just accessed one fixed location.

so all the feature i've listed above must be implemented!
I can do this...but I am not the pro, so it is uncertain, when this is going to happen.
for instance, i do not use the "pro" string length algorithms introduced here in this thread (some of them would make sense for certain applications).

It would be a research project to documentate the REP SCASB (SCAS) performance for all CPUs, over the years, I've read it degraded a little on Pentium, but recently, there might have been new implementations (on AMD CPUs).

I can't do it, I do not have many different computers. someone here might be able to create such a software,
with 100s of options, and donate it to the community!

what i think is that alignment is not so much relevant anymore (though it can cause some trade-off).
Title: Re: szLen optimize...
Post by: NightWare on April 03, 2009, 01:04:55 AM
Quote from: BeeOnRope on April 02, 2009, 12:31:54 AM
Even if you don't believe it, it doesn't answer the point about interaction with legacy or proprietary APIs that you cannot change.
what i'm supposed to answer ? laws are what they are (i haven't defined them). all i can say is : life is made of choices, nothing else. and you must assume the results of those choices..., so IF a work doesn't follow YOUR SPECIFICATIONS, the work is supposed to be refused, IF NOT the work has been made correctly !
IF, later, you want modifications, then ask the developers, and pay for... it's the normal PRICE to pay when you don't code your apps yourself...
Title: Re: szLen optimize...
Post by: FORTRANS on April 03, 2009, 03:53:23 PM
Hi,

Quotebut maybe out of your head

; - - - String length routine.  - - -
; Use SCASB to find a C style string's length,
; 3 April 2009, SRN
; Contract: EAX returns the number of bytes scanned, i.e. the string
; length PLUS ONE (the terminating zero is counted).  If no zero byte
; occurs within Limit bytes, EAX = Limit.  Clobbers EAX, ECX, EDI and
; the flags; leaves DF cleared (forward direction).
StrLenS:
        CLD                     ; Search forward.
        MOV     EDI,OFFSET Test_Str ; Point destination index to string buffer.
        MOV     ECX,Limit       ; Maximum string length.
        MOV     AL,0            ; Character to search for.
  REPNE SCASB                   ; Scan for character.
        MOV     EAX,Limit
        SUB     EAX,ECX         ; Return length in EAX (includes the zero).

        RET


   Or some such.

Cheers,

Steve N.
Title: Re: szLen optimize...
Post by: jj2007 on April 03, 2009, 06:02:27 PM
Quote from: FORTRANS on April 03, 2009, 03:53:23 PM
Hi,

Quotebut maybe out of your head

; - - - String length routine.  - - -
; Use SCASB to find a C style string's length,
; 3 April 2009, SRN
StrLenS:
        CLD                     ; Search forward.
        MOV     EDI,OFFSET Test_Str ; Point destination index to string buffer.
        MOV     ECX,Limit       ; Maximum string length.
        MOV     AL,0            ; Character to search for.
  REPNE SCASB                   ; Scan for character.
        MOV     EAX,Limit
        SUB     EAX,ECX         ; Return length in EAX (includes the zero).

        RET


   Or some such.

Cheers,

Steve N.

Thanksalot, Steve :bg

Quote from: tetsu-jp on April 02, 2009, 02:43:04 PM

and I've read the new AMD manuals, the REP SCAS is explicitly recommended for small strings!

so, would you include it, and show the result for REP SCASB as well?

tetsu-san,

following your request, I have added Steve's code to the testbed, see attachment and timings below. Now we are of course curious how your code will perform on your AMD, and how you will optimise it.


Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (SSE3)
codesizes: strlen32s=132strlen64B=84NWStrLen=118, _strlen=66 bytes

-- test 16k, misaligned 0, 16434 bytes
StrLenS (FORTRANS)   68312 cycles
strlen32s            3019 cycles
strlen64LingoB       3037 cycles
NWStrLen             3061 cycles
_strlen (Agner Fog)  4444 cycles

-- test 4k, misaligned 11, 4096 bytes
StrLenS (FORTRANS)   17029 cycles
strlen32s            768 cycles
strlen64LingoB       770 cycles
NWStrLen             789 cycles
_strlen (Agner Fog)  1142 cycles

-- test 1k, misaligned 15, 1024 bytes
  Masm32 lib szLen   1362 cycles
  crt strlen         1012 cycles
StrLenS (FORTRANS)   4302 cycles
strlen32s            206 cycles
strlen64LingoB       199 cycles
NWStrLen             215 cycles
_strlen (Agner Fog)  284 cycles

-- test 0, misaligned 0, 100 bytes
  Masm32 lib szLen   136 cycles
  crt strlen         114 cycles
StrLenS (FORTRANS)   471 cycles
strlen32s            30 cycles
strlen64LingoB       25 cycles
NWStrLen             34 cycles
_strlen (Agner Fog)  37 cycles

-- test 1, misaligned 1, 100 bytes
  Masm32 lib szLen   138 cycles
  crt strlen         127 cycles
StrLenS (FORTRANS)   473 cycles
strlen32s            28 cycles
strlen64LingoB       27 cycles
NWStrLen             34 cycles
_strlen (Agner Fog)  35 cycles

-- test 5, misaligned 5, 15 bytes
  Masm32 lib szLen   26 cycles
  crt strlen         29 cycles
StrLenS (FORTRANS)   125 cycles
strlen32s            6 cycles
strlen64LingoB       5 cycles
NWStrLen             17 cycles
_strlen (Agner Fog)  14 cycles

-- test 15, misaligned 15, 15 bytes
  Masm32 lib szLen   27 cycles
  crt strlen         26 cycles
StrLenS (FORTRANS)   124 cycles
strlen32s            7 cycles
strlen64LingoB       3 cycles
NWStrLen             15 cycles
_strlen (Agner Fog)  14 cycles



[attachment deleted by admin]
Title: Re: szLen optimize...
Post by: herge on April 03, 2009, 07:36:08 PM
 Hi jj2007:

The results from here:

Friday, April 03, 2009 3:32 PM
Intel(R) Core(TM)2 Duo CPU     E4600  @ 2.40GHz (SSE4)
codesizes: strlen32s=132strlen64B=84NWStrLen=118, _strlen=66 bytes

-- test 16k, misaligned 0, 16434 bytes
strlen32s            1522 cycles
strlen64LingoB       1231 cycles
NWStrLen             1334 cycles
_strlen (Agner Fog)  2844 cycles

-- test 4k, misaligned 11, 4096 bytes
strlen32s            395 cycles
strlen64LingoB       322 cycles
NWStrLen             348 cycles
_strlen (Agner Fog)  735 cycles

-- test 1k, misaligned 15, 1024 bytes
  Masm32 lib szLen   1071 cycles
  crt strlen         629 cycles
strlen32s            111 cycles
strlen64LingoB       85 cycles
NWStrLen             111 cycles
_strlen (Agner Fog)  182 cycles

-- test 0, misaligned 0, 100 bytes
  Masm32 lib szLen   107 cycles
  crt strlen         69 cycles
strlen32s            17 cycles
strlen64LingoB       11 cycles
NWStrLen             18 cycles
_strlen (Agner Fog)  21 cycles

-- test 1, misaligned 1, 100 bytes
  Masm32 lib szLen   105 cycles
  crt strlen         100 cycles
strlen32s            17 cycles
strlen64LingoB       11 cycles
NWStrLen             18 cycles
_strlen (Agner Fog)  21 cycles

-- test 5, misaligned 5, 15 bytes
  Masm32 lib szLen   19 cycles
  crt strlen         17 cycles
strlen32s            5 cycles
strlen64LingoB       1 cycles
NWStrLen             8 cycles
_strlen (Agner Fog)  7 cycles

-- test 15, misaligned 15, 15 bytes
  Masm32 lib szLen   19 cycles
  crt strlen         16 cycles
strlen32s            4 cycles
strlen64LingoB       2 cycles
NWStrLen             9 cycles
_strlen (Agner Fog)  7 cycles
-- Hit X Key --


Regards herge
Title: Re: szLen optimize...
Post by: tetsu-jp on April 03, 2009, 07:52:07 PM
(http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/strlen.png)

how can the exe file be produced? i tried with the include file from previous attachment,
all i get is a blank line, and then command prompt (and the exe file locked).

i have copied ML.EXE from the VC directory, it is 9.0, it is assembling,
and also linking works.

but the program can not work correctly!

what i am doing wrong???
I've just started with MASM32!
any idea why it can not act?

and yes, REP SCAS is slower...

if i can get the source working, I'll try SCASW, SCASD, and SCASQ (should be faster).

I have tried both linkers, the original MASM32, and from the VC directory: 43520 bytes exe file
Title: Re: szLen optimize...
Post by: jj2007 on April 03, 2009, 09:44:54 PM
> how can the exe file be produced?

Did you choose CONSOLE assembly? I use RichMasm, which autodetects console/windows, but in other IDE's you might need to specify that explicitly.
Title: Re: szLen optimize...
Post by: tetsu-jp on April 03, 2009, 09:51:15 PM
I can assemble the supplied MASM32 examples, both via IDE, and via CLI:

-using the supplied .BAT file
-typing the command directly ~(ARGHH ..... this can work via copying binaries into the work directory.

so all this works, but the .EXE can not perform anything. something is not set up right.
I've removed the .EXE, and it is freshly generated, so assembler and linker work.

EDIT: I get along now! as i've guessed, the options have not been set up correctly, MASM32 just performs a plain call.

well, i had some fun with AZTEC C in a similar manner (and it requires a small file from a commercial SDK, one disk is defective a little, so people who don't know, well they can try forever).

2 hours or 3 hours (I did other things as well).

by the way, the thumbnail is 70Kbyte, and the fullsize PNG just 16K

(http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/richmasm.png)

so please, include information about how to build this project. not everyone can read your mind.
Title: Re: szLen optimize...
Post by: tetsu-jp on April 03, 2009, 10:20:45 PM
therre are many other threads.

so i think your code (strlen) will be gently skipped (by me).
the problem was there is no makefile.

i wanted the timing for the SCAS, and that's the point.
someone already added it.

by the way i can understand most of the strlen sources, thanks.

it's really a waste of time to write you a reply but here you go.
Title: Re: szLen optimize...
Post by: tetsu-jp on April 03, 2009, 10:22:49 PM
1. You are new in assembly ->"I've just started with MASM32!"

(http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/th_DailyComic_Page.jpg) (http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/DailyComic_Page.jpg)

you don't read carefully. i used MASM32 before, and wrote other assembly programs as well.

i just...had a break of 5 years.

i hope..you will not experience "the beans" in your life.
some people...just experience it, you know.

PS: it works now, see screenshot.

so the correct spelling is: I've just started with MASM32 (again) on a new machine...after a break of 5 years (not using assembly language).
Title: Re: szLen optimize...
Post by: ecube on April 03, 2009, 10:34:36 PM
tetsu-jp,
while lingo's personality is strong and he can be very direct, he is one of if not the most gifted assembly programmers on this forum/anywhere.  Rarely can anyone write faster code than him, which signifies that he has deep underlying system understanding, so keeping that in mind, and what he said to you, i'd listen to him. The Genesis project is aimed at helping people quick start with MASM and i'm sure they help with assembly questions in general.
Title: Re: szLen optimize...
Post by: NightWare on April 03, 2009, 10:36:04 PM
hmm, the laboratory is certainly not the appropriate place, yes.
however, his comment concerning SCASB isn't totally wrong, it is slower, yes, but it uses a hack to avoid branch misprediction (similar to movcc), so it WAS faster for small strings... unfortunately, later, simd instructions have been introduced, and of course the speed difference has changed... yep, things must always be placed back in their context...

Title: Re: szLen optimize...
Post by: tetsu-jp on April 03, 2009, 11:22:04 PM
so he's an assembler wiz.

I've run a few tests, and notice differences each time the program runs.
large differences, upto 30 percent (no modification).

also i have modified the code for REP SCASD (within cache), and now the difference is only 4x.

it is OK you are the pro's, for years, if not decades? but who can deal with you?

i am willing to do it, and i can understand all of the source code, no worry.

don't understand your trouble.

i do not have general assembly questions, and just to work with the examples supplied,
there would be no need to deal with the forum.
it is just for fun, i do not use assembly for commercial projects.

so i have added SCASW and SCASD, and figured out, 30% difference each time the program is started.
so the numbers are not very reliable- performance can depend on many factors (usually there are more programs running at the same time, occupying the cache and all that).

but yes, you are the small group of pro's, and know, SCAS is ten times slower, 15 times slower.

i guess, SCASQ is just two times slower, in some contexts.

but saves people from artistic code (which also can be good).

(http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/repnescasd.png)

this is the result for REP SCASD, cache=0
Title: Re: szLen optimize...
Post by: tetsu-jp on April 03, 2009, 11:26:44 PM
Quote from: E^cube on April 03, 2009, 10:34:36 PM
tetsu-jp,
while lingo's personality is strong and he can be very direct, he is one of if not the most gifted assembly programmers on this forum/anywhere.  Rarely can anyone write faster code than him, which signifies that he has deep underlying system understanding, so keeping that in mind, and what he said to you, i'd listen to him. The Genesis project is aimed at helping people quick start with MASM and i'm sure they help with assembly questions in general.

I'm not gifted...you can clearly see that, i need two hours to get the source working!
but makefile is no shame, you could call a makefile:
providing "deep underlying system understanding" for people who don't have it for some reason
(for instance, they can not read the brains of a small insider group).

there are people who h&te assembler, or refuse to deal with it completely. well i like it, but I understand why.

because the assembler wiz, who simply does not know that his world is just a special case, and not real world.

someone wrote "the strlen algorithms are not used in commercial applications"?
i think i've read this 3 pages ago.
so the wiz status is just to show off, in reality, the numbers are different.

yes, i like this SSE2 stuff, and will read all your code.
Title: Re: szLen optimize...
Post by: NightWare on April 04, 2009, 01:31:53 AM
Quote from: tetsu-jp on April 03, 2009, 11:26:44 PM
someone wrote "the strlen algorithms are not used in commercial applications"?
i think i've read this 3 pages ago.
so the wiz status is just to show off, in reality, the numbers are different.
no, i've said it's never used in SERIOUS apps, and it's not exactly what commercial applications are... by this you "should" have understood : nationnal app/database systems for administrations, army, etc... you've just avoided another occasion to keep some credibility...  :(
Title: Re: szLen optimize...
Post by: jj2007 on April 04, 2009, 01:44:56 AM
Quote from: tetsu-jp on April 03, 2009, 11:22:04 PM
this is the result for REP SCASD, cache=0

You might have a look at the second line of your screenshot.
Title: Re: szLen optimize...
Post by: tetsu-jp on April 04, 2009, 12:43:48 PM
yes i know, error message. i have not modified the macro, which is generating the string sequence.
so i think it is just a buggy message.


align 16
db 11 dup (0) ; misaligned by 11
szTest_4k db txt50

REPEAT 80-1
db txt50
ENDM
db txt50, "4096 bytes************************************", 0,0,0,0,0

align 16
szTest_16k db txt50


still should be 4K?!?

**************

I was thinking to extend the software:

-use random strings, random length, random location, get some more parameters for that.
CPU will behave differently than just one&the same string, same length, again and again.

-link with a VB program, and put the results in a database!
then it can be compared using EXCEL.

-provide a web service, to upload results!
then after a while, numerous CPUs can be compared.

-add more algorithms: memory copy, search for specific pattern

-64 bit support

I can do all this, but...as you write, there are members with superior knowledge.
so, why hijack the project, and steal the show?

I mean, i just made requests, and suggestions.
the question i had was "HOW MUCH FASTER compared to SCAS".

and, yes, why not waste a few bytes, and use longword instructions?
NP if you have Gbytes of memory.

so what do you think about the extensions?

for instance, you could generate a list in C++/C#/VB, and pass it to the assembly program.
this would be "real world data", not just a static string.

I never wrote my solution is superior, or i am the better wizard.

just, there are features missing in this software, i just wrote a few of them.
Title: Re: szLen optimize...
Post by: jj2007 on April 04, 2009, 12:57:59 PM
Quote from: tetsu-jp on April 04, 2009, 12:43:48 PM
yes i know, error message. i have not modified the macro, which is generating the string sequence.
so i think it is just a buggy message.


Njet. The message is correct. It's your code that is buggy.
Title: Re: szLen optimize...
Post by: tetsu-jp on April 04, 2009, 01:17:04 PM
unlikely if you compare the relation to 16K (which is about 1:4)

anyway, i will investigate later on today. the source is not that difficult, it's about the level i can follow without major problems.

and the CPU detect- I think I'll borrow that for my own projects- and give a copyright reference.
no need to re-invent such a code...

so i have some fun...

Uhm...i can copy strings (in C)


;-----------------------------------------------------------------------
; StrLenS - string length via MMX (needs pmovmskb, i.e. SSE-class CPU)
; C-equivalent: size_t StrLenS(const char *src)   (stdcall, ret 4)
; In:    [esp+4] = src, pointer to zero-terminated string
; Out:   eax = exact length in bytes (terminator NOT counted)
; Clobb: eax, ecx, edx, mm0-mm2, flags; esi preserved
; Note:  scans 16 bytes per iteration, so it may read up to 15 bytes
;        past the terminator (same over-read as the original draft).
; Fixes vs. original draft:
;   - pcmpeqb instead of pcmpeqd: detect any zero BYTE, not only an
;     aligned all-zero dword (this is why the old code missed the end)
;   - exact byte position via per-half pmovmskb + bsf, instead of
;     rounding the length up to a multiple of 16
;   - callee-saved ebx no longer touched (it was zeroed, never used)
;   - emms before return, so later FPU code is not corrupted
;   - useless clc and dead commented-out SCASB patch-up removed
;-----------------------------------------------------------------------
StrLenS proc src:DWORD

	push	esi
	mov	esi, [esp+8]		; esi = current read position
	xor	ecx, ecx		; ecx = bytes scanned so far
	pxor	mm0, mm0		; mm0 = all zero, compare reference

_reloop:
	movq	mm1, [esi]		; bytes 0..7 of this chunk
	movq	mm2, [esi+8]		; bytes 8..15
	pcmpeqb	mm1, mm0		; 0FFh in every byte that is zero
	pcmpeqb	mm2, mm0
	add	esi, 16
	add	ecx, 16
	pmovmskb eax, mm1		; one mask bit per byte, low half
	pmovmskb edx, mm2		; one mask bit per byte, high half
	shl	edx, 8
	or	eax, edx		; 16-bit zero-map of the chunk
	test	eax, eax
	jz	_reloop			; no terminator yet -> next chunk

	bsf	eax, eax		; index of first zero byte in chunk
	lea	eax, [ecx+eax-16]	; length = full chunks + index
	emms				; restore FPU state for callers
	pop	esi

	ret	4
StrLenS endp


i wrote this (using 64bit MMX).
it's a little faster than Agner Fog's stuff.
but i can not fix the string length stuff correctly!
at least, not today.

so you see, i have examined your codes a little.
I've just downloaded the manuals with 128bit instructions a few days ago.
they must be aligned, or exception will happen.

my idea is to use 64bit, do not care about alignment at all (maybe enfore it in software anyway),
and fix the length via SCAS.

short strings can be copied to aligned space.
long strings- unaligned, and determine their length? i can not think of such a case.

i understand your efforts are to align the data, and also to test byte by byte.

is such code really required? i try to think of a real-world software, which has large unaligned strings.

(http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/th_tetsu-1.gif) (http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/tetsu-1.gif)

now, i have made modifications...
can't get the correct string length. what's wrong with the code?

it works using 64bit MMX, not 128bits.
so it's hard to be the 128bit MMX!

also i think the Genesys is not active at all- and there won't be an explanation what's wrong with the string length.

the code at MyTest is strange (patching bytes). can someone explain? i tried an hour to determine the extra bytes.

[attachment deleted by admin]