StrLen1 <--- from agner
StrLen2 <--- modification from original
szLen1 <--- original
szLen2 <--- modification from original
lstrlen <--- original from system
Quote
1472 -- 1378 clocks StrLen1
1472 -- 1196 clocks StrLen2
1472 -- 3077 clocks szLen1
1472 -- 2233 clocks szLen2
1472 -- 5870 clocks lstrlen
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
include masm32rt.inc
;include timers.asm
StrLen1 proto :DWORD
StrLen2 proto :DWORD
szLen1 proto :DWORD
szLen2 proto :DWORD
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
str0 db 64 dup ("my other brother darryl"),0
.code
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
;repeat 3 ; Used to check sensitivity to alignment
; nop
;endm
invoke StrLen1, addr str0
print ustr$(eax)
print chr$(" -- ")
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke StrLen1, addr str0
counter_end
print ustr$(eax)
print chr$(" clocks StrLen1",13,10)
invoke StrLen2, addr str0
print ustr$(eax)
print chr$(" -- ")
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke StrLen2, addr str0
counter_end
print ustr$(eax)
print chr$(" clocks StrLen2",13,10)
invoke szLen1, addr str0
print ustr$(eax)
print chr$(" -- ")
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke szLen1, addr str0
counter_end
print ustr$(eax)
print chr$(" clocks szLen1",13,10)
invoke szLen2, addr str0
print ustr$(eax)
print chr$(" -- ")
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke szLen2, addr str0
counter_end
print ustr$(eax)
print chr$(" clocks szLen2",13,10)
invoke lstrlen, addr str0
print ustr$(eax)
print chr$(" -- ")
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke lstrlen, addr str0
counter_end
print ustr$(eax)
print chr$(" clocks lstrlen",13,10)
exit
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
align 4
StrLen1 proc item:DWORD
push ebx
mov eax, [esp+2*4] ; get pointer to string
lea edx, [eax+3] ; pointer+3 used in the end
align 4
@@:
mov ebx, [eax] ; read first 4 bytes
add eax, 4 ; increment pointer
lea ecx, [ebx-01010101h] ; subtract 1 from each byte
not ebx ; invert all bytes
and ecx, ebx ; and these two
and ecx, 80808080h
jz @B ; no zero bytes, continue loop
test ecx, 00008080h ; test first two bytes
jnz @F
shr ecx, 16 ; not in the first 2 bytes
add eax, 2
@@:
shl cl, 1 ; use carry flag to avoid branch
sbb eax, edx ; compute length
pop ebx
ret 1*4
StrLen1 endp
align 4
StrLen2 proc src:DWORD
mov ecx, [esp+1*4]
test ecx, 3
jz @max8
@bucle:
mov al, [ecx]
add ecx, 1
test al, al
jz @lb1
test ecx, 3
jnz @bucle
align 4
@max8:
mov eax, [ecx]
mov edx, 7EFEFEFFh
add edx, eax
xor eax, 0FFFFFFFFh
xor eax, edx
add ecx, 4
test eax, 81010100h
jz @max8
mov eax, [ecx-4]
test al, al
jz @lb4
test ah, ah
jz @lb3
test eax, 0FF0000h
jz @lb2
test eax, 0FF000000h
jnz @max8
@lb1:
lea eax, [ecx-1]
mov ecx, [esp+1*4]
sub eax, ecx
ret 1*4
@lb2:
lea eax, [ecx-2]
mov ecx, [esp+1*4]
sub eax, ecx
ret 1*4
@lb3:
lea eax, [ecx-3]
mov ecx, [esp+1*4]
sub eax, ecx
ret 1*4
@lb4:
lea eax, [ecx-4]
mov ecx, [esp+1*4]
sub eax, ecx
ret 1*4
StrLen2 endp
align 4
szLen1 proc src:DWORD
mov eax, [esp+1*4] ; src
sub eax, 4
align 4
@@:
add eax, 4
cmp BYTE PTR [eax], 0
je @lb1
cmp BYTE PTR [eax+1], 0
je @lb2
cmp BYTE PTR [eax+2], 0
je @lb3
cmp BYTE PTR [eax+3], 0
jne @B
sub eax, [esp+1*4] ; src
add eax, 3
ret 1*4
@lb3:
sub eax, [esp+1*4] ; src
add eax, 2
ret 1*4
@lb2:
sub eax, [esp+1*4] ; src
add eax, 1
ret 1*4
@lb1:
sub eax, [esp+1*4] ; src
ret 1*4
szLen1 endp
align 4
szLen2 proc src:DWORD
mov eax, [esp+1*4] ; src
sub eax, 4
xor ecx, ecx
align 4
@@:
add eax, 4
cmp BYTE PTR [eax], cl
je @lb1
cmp BYTE PTR [eax+1], cl
je @lb2
cmp BYTE PTR [eax+2], cl
je @lb3
cmp BYTE PTR [eax+3], cl
jne @B
sub eax, [esp+1*4] ; src
add eax, 3
ret 1*4
@lb3:
sub eax, [esp+1*4] ; src
add eax, 2
ret 1*4
@lb2:
sub eax, [esp+1*4] ; src
add eax, 1
ret 1*4
@lb1:
sub eax, [esp+1*4] ; src
ret 1*4
szLen2 endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
end start
Have a look at Donkeys string utils for a truely fast length api.
(you will have to search this board)
hi striker
I have looked for in the forum and only I have found a few references to donkey but they do not speak to anything referring, if you could say something to me, you help me, since I am beginning with the assembler and I like much.
right now I do not have long time, and I think that was faster to write here, but will look for the network.
Quote
1472 -- 1378 clocks StrLen1
1472 -- 1196 clocks StrLen2
1472 -- 3077 clocks szLen1
1472 -- 2233 clocks szLen2
1472 -- 2241 clocks szLen3
1472 -- 2104 clocks szLen4
1472 -- 5870 clocks lstrlen
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
align 4
szLen3 proc src:DWORD
mov eax, [esp+1*4] ; src
sub eax, 4
xor ecx, ecx
align 4
@@:
add eax, 4
mov ecx, [eax]
;cmp BYTE PTR [eax], 0
test cl, cl
je @lb1
;cmp BYTE PTR [eax+1], 0
test ch, ch
je @lb2
mov ecx, [eax+2]
;cmp BYTE PTR [eax+2], 0
test cl, cl
je @lb3
;cmp BYTE PTR [eax+3], 0
test ch, ch
jne @B
sub eax, [esp+1*4] ; src
add eax, 3
ret 1*4
@lb3:
sub eax, [esp+1*4] ; src
add eax, 2
ret 1*4
@lb2:
sub eax, [esp+1*4] ; src
add eax, 1
ret 1*4
@lb1:
sub eax, [esp+1*4] ; src
ret 1*4
szLen3 endp
align 4
szLen4 proc src:DWORD
mov eax, [esp+1*4] ; src
sub eax, 4
;xor ecx, ecx
align 4
@@:
add eax, 4
movzx ecx, word ptr [eax]
;cmp BYTE PTR [eax], 0
test cl, cl
je @lb1
;cmp BYTE PTR [eax+1], 0
test ch, ch
je @lb2
movzx ecx, word ptr [eax+2]
;cmp BYTE PTR [eax+2], 0
test cl, cl
je @lb3
;cmp BYTE PTR [eax+3], 0
test ch, ch
jne @B
sub eax, [esp+1*4] ; src
add eax, 3
ret 1*4
@lb3:
sub eax, [esp+1*4] ; src
add eax, 2
ret 1*4
@lb2:
sub eax, [esp+1*4] ; src
add eax, 1
ret 1*4
@lb1:
sub eax, [esp+1*4] ; src
ret 1*4
szLen4 endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
(sorry by the language, I am learning english and assembler :dazzled:)
I couldn't find any reference to Donkey's libraries on this forum, but they are available from his web site:
http://donkey.visualassembler.com/
Donkey uses GoAsm, so the library code will need to be converted to MASM syntax. I seem to recall that Donkey or someone else posted instructions for performing the conversion, but I couldn't find the post. In any case the conversion is not difficult.
Another readily available string length procedure is the strlen function from MSVCRT.DLL. In the MASM32 include file it is named crt_strlen. I don't recall how it compares speed-wise to the optimized procedures.
Here is a small app that performs a function test of the procedures. I moved your procedures into an include file that contains the procedure code with a leading ".code" directive (and no END directive).
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.486 ; create 32 bit code
.model flat, stdcall ; 32 bit memory model
option casemap :none ; case sensitive
include \masm32\include\windows.inc
include \masm32\include\masm32.inc
include \masm32\include\kernel32.inc
include \masm32\include\msvcrt.inc
includelib \masm32\lib\masm32.lib
includelib \masm32\lib\kernel32.lib
includelib \masm32\lib\msvcrt.lib
include \masm32\macros\macros.asm
include procs.asm
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
str0 db 0
str1 db 'X',0
str2 db 'XX',0
str3 db 'XXX',0
str4 db 'XXXX',0
str5 db 'XXXXX',0
str15 db 15 dup('X'),0
str16 db 16 dup('X'),0
str17 db 17 dup('X'),0
str255 db 255 dup('X'),0
str1000 db 250 dup('X')
db 250 dup('X')
db 250 dup('X')
db 250 dup('X'),0
.code
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
FOR teststr,<str0,str1,str2,str3,str4,str5,\
str15,str16,str17,str255,str1000>
FOR testproc,<StrLen,StrLen1,StrLen2,szLen,szLen1,\
szLen2,szLen3,szLen4,crt_strlen,lstrlen>
invoke testproc,ADDR teststr
print ustr$(eax),32
ENDM
print chr$(13,10)
ENDM
mov eax,input(13,10,"Press enter to exit...")
exit
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
I have seen a lot of string length algos in my time but they still break down to 2 types, variations of Agner Fog's strlen DWORD version and various forms of byte scanners. The DWORD version is faster but it has problem with alignment and it can under some circumstances fail if the length is at the end of a memory page. The "szLen" algo is a classic byte scanner unrolled by 4 that is both insensitive to alignment and has no page ending problems so it is properly general purpose.
From memory Donkey had a variation of the Agner Fog design where he aligned the beginning of the algo then read in DWORD size chunks which solves the alignment problem but not the page end problem.
Most zero terminated strings are relatively short (< 128 bytes) and with a byte scanner running at something like 100 meg/sec, I wonder if there is a gain in chasing faster but less general purpose algos.
hello MichaelW
thx for the link to "donkey´s stable" and thx very much for the small app for test the procedures, it´s help me :U
the StrLen1 that i post is from msvcr70.dll
I am finishing modifying the masm32.lib, and your small application is to me helpful for test the different versions.
the main changes are changes of the comparisons memory-immediate in the loops, elimination of stack´s frame in algo without locals, and some small tricks.
the average speed has raised in a 37% and I am very happy.
I like to learn from the bests as some gurus. like (hutch-, you, etc)
and I continue learning
thx MichaelW
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
include masm32rt.inc
include procs.asm
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
str0 db 0
str1 db 128 dup ('Xx'),0
str2 db 512 dup ('Xx'),0
str3 db 1024 dup ('Xx'),0
align 4
txt0 db "0 bytes",0
txt1 db "256 bytes",0
txt2 db "1024 bytes",0
txt3 db "2048 bytes",0
align 4
tb_txt dd txt0,txt1,txt2,txt3
alg0 db "StrLen",0
alg1 db "StrLen1",0
alg2 db "szLen",0
alg3 db "lstrlen",0
align 4
tb_alg dd alg0,alg1,alg2,alg3
.code
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
push edi
push esi
lea edi, [tb_alg]
lea esi, [tb_txt]
FOR teststr,<str0,str1,str2,str3>
FOR testproc,<StrLen,StrLen1,szLen,lstrlen>
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke testproc,ADDR teststr
counter_end
print ustr$(eax),9
print " clocks for proc - "
print [edi],13,10
add edi, 4
ENDM
lea edi, [tb_alg]
print "-------------------------------- "
print [esi],13,10
add esi, 4
print chr$(13,10)
ENDM
mov eax, input("Press enter to exit...")
pop esi
pop edi
exit
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
Here's my hacked and botched attempt. Be nice to the new guy. :) Results on an AMD XP 1800+. Two results shown, with and without the Sleep API.
Zero Sleep between tests:
StrLen StrLen1 StrLen2 szLen szLen1 szLen2 szLen3 szLen4 lstrlen
0 bytes: 12 10 10 8 5 6 8 5 33
1 bytes: 12 10 11 9 9 8 9 8 39
2 bytes: 14 12 11 13 9 8 10 10 42
3 bytes: 14 12 11 13 9 10 11 10 56
4 bytes: 18 16 13 18 11 10 11 12 45
5 bytes: 18 16 14 18 14 13 13 12 48
15 bytes: 25 22 22 30 24 23 29 29 105
16 bytes: 31 25 25 32 26 25 30 33 96
17 bytes: 31 25 25 34 27 29 34 35 110
255 bytes: 283 249 248 359 342 355 435 451 822
1023 bytes: 1086 933 945 1346 1332 1345 1728 1734 3208
250ms Sleep between tests:
StrLen StrLen1 StrLen2 szLen szLen1 szLen2 szLen3 szLen4 lstrlen
0 bytes: 12 10 9 8 8 8 6 7 33
1 bytes: 12 10 11 9 8 7 7 9 39
2 bytes: 14 13 12 13 8 9 10 10 42
3 bytes: 14 13 12 13 9 10 11 10 44
4 bytes: 18 16 15 18 10 11 12 14 45
5 bytes: 18 16 16 18 12 14 13 14 48
15 bytes: 25 23 23 30 23 24 26 29 93
16 bytes: 31 28 28 32 25 27 30 31 96
17 bytes: 31 28 29 33 26 29 32 32 99
255 bytes: 283 282 296 347 344 345 425 409 823
1023 bytes: 1064 1076 1137 1324 1333 1332 1659 1589 3203
Interesting. Unsure what's going on there. It might be interesting to calculate the mean deviation of some datasets in an effort to determine if Sleep is increasing or decreasing accuracy.
Michael, could I request a CPUID function in timers.asm, so we can have our CPU details right in the console? :toothy
[attachment deleted by admin]
Quote from: Mark Jones on June 02, 2005, 04:40:46 AM
Michael, could I request a CPUID function in timers.asm, so we can have our CPU details right in the console? :toothy
Good idea. I already have one, but it's a hack (the original meaning of the term :eek). I'll see what I can do.
result on PIV 2800@3300
Zero Sleep tests:
Quote
StrLen StrLen1 StrLen2 szLen szLen1 szLen2 szLen3 szLen4 lstrlen
0 bytes: 7 2 1 2 10 -1 -3 8 41
1 bytes: 1 9 2 -2 10 2 1 -1 49
2 bytes: 2 14 10 0 2 1 5 0 40
3 bytes: 2 1 14 15 5 4 11 11 45
4 bytes: 4 5 10 11 12 4 3 10 48
5 bytes: 8 13 6 4 8 4 14 4 56
15 bytes: 13 5 26 19 33 36 22 18 62
16 bytes: 16 16 19 18 32 22 28 29 90
17 bytes: 17 16 27 38 58 24 33 27 78
255 bytes: 238 236 229 353 548 419 422 352 620
1023 bytes: 844 854 849 1341 2077 1573 1586 1333 2325
250ms Sleep tests:
Quote
StrLen StrLen1 StrLen2 szLen szLen1 szLen2 szLen3 szLen4 lstrlen
0 bytes: 3 2 0 -3 -3 -3 6 -1 34
1 bytes: 1 1 3 -1 -1 -1 1 -3 38
2 bytes: 14 3 3 3 2 0 0 0 42
3 bytes: 2 5 5 2 14 2 3 0 46
4 bytes: 4 5 6 3 6 3 8 3 47
5 bytes: 4 4 6 4 9 5 5 4 47
15 bytes: 24 13 16 19 32 26 18 19 81
16 bytes: 18 17 19 17 33 22 23 20 75
17 bytes: 16 17 17 26 57 23 39 30 85
255 bytes: 228 222 221 353 535 407 414 349 647
1023 bytes: 838 819 809 1350 2079 1562 1551 1331 2379
note: in my system, StrLen = Strlen1 and szLen = szLen4 (masm32.lib modified), szLen1 is the original without stack´s frame
Dont know if you have seen this one or not...
I saved this from quite awile back when the same thing was tried on another board.
I think the tail end can be modified for faster performance (I remember playing with it) but the inner loop is where this one shines...
If I remember correctly the inner loop, because of the U/V pipes, ran 1 DWORD per 2 cycles? Maybe my memories faulty :green2
; FStrLen - buliaNaza, Lingo12
;
; Returns the length of a null terminated
; string not including the null.
FStrLen proto :dword
.code
option prologue:none
option epilogue:none
FStrLen proc string:dword
push esi
push ebx
mov esi, [esp+8]
mov ebx, 80808080h
mov eax, [esi]
xor edx, edx
FStrLen_Loop:
lea ecx, [eax-10101010h]
inc edx
and ecx, ebx
mov eax, [esi+edx*4]
jz FStrLen_Loop
bsf ebx, ecx
dec edx
shr ebx, 3
lea eax, [ebx+edx*4]
pop ebx
pop esi
ret
FStrLen endp
option prologue:PrologueDef
option epilogue:EpilogueDef
Did some searching on the old boards and found the original posting with comments...
Would be interesting to do a compare with this one :wink
FStrLen proc string:dword
mov esi, [esp+8]
mov eax, [esi] ; get a dword (buffer is aligned)
mov ebx, 80808080h ; we'll use register ebx rather then immediate 80808080h
xor edx, edx ; edx=0
C2_loop:
lea ecx, [eax-1010101h] ; sub 1 from each byte in eax
inc edx ; ready for next dword
test ecx, ebx ; test sign ; ebx= 80808080h
mov eax, [esi+edx*4] ; get next dword
jz C2_loop ; if not loop again
test eax, 000000FFh ; is al zero?
jz C2_minus4 ;
test eax, 0000FF00h ; is ah zero?
jz C2_minus3 ;
test eax, 00FF0000h ; is zero?
jz C2_minus2 ;
test eax, 0FF000000h ; is zero?
jnz C2_loop ; if not zeroes loop again
lea eax, [edx-1] ; eax= length of string
ret
C2_minus2:
lea eax, [edx-2] ; eax= length of string
ret
C2_minus3:
lea eax, [edx-3] ; eax= length of string
ret
C2_minus4:
lea eax, [edx-4] ; eax= length of string
ret
FStrLen endp
First thing to do with that version is to swap EAX and EDX, then use ADD or SUB for the corections instead of LEA which is slow on post PIII hardware.
Maelstrom
option prologue:none
option epilogue:none
FStrLen proc string:dword
push esi
push ebx
mov esi, [esp+8] ; <<< It´s wrong, It has to be 12, because 2 push
hmmmm,
In its current form, it always returns zero.
Yeah your right, what a mess. So much for copying and pasting from older threads...
I went through the thread from the beginning and debugged it though. This one is corrected.
FStrLen proc string:dword
mov esi, [esp+8]
mov ebx, 80808080h ; we'll use register ebx rather than immediate 80808080h
mov eax, [esi]
xor edx, edx ; edx = 0
@@:
lea ecx, [eax-1010101h] ; sub 1 from each byte in eax
inc edx ; ready for next dword
and ecx, ebx ; test sign ; ebx= 80808080h
mov eax, [esi+edx*4] ; get next dword
jz @B ; if not loop again
test ecx, 000000FFh ; is al zero?
jnz C_minus4 ;
test ecx, 0000FF00h ; is ah zero?
jnz C_minus3 ;
test ecx, 00FF0000h ; is zero?
jnz C_minus2 ;
C_minus1:
lea eax, [edx*4-1]
ret
C_minus2:
lea eax, [edx*4-2]
ret
C_minus3:
lea eax, [edx*4-3]
ret
C_minus4:
lea eax, [edx*4-4]
ret
FStrLen endp
Finally got FStrLen working. Looks like it's faster on my P3 for everything larger than 3 byte strings with the modifications that I made. Originally, It was about the same as StrLen1 when it had to preserve ebx and esi to be a 'fair' comparison with the other procedures. I re-wrote it but maintained the spirit of doing 4 bytes at a time. Here's the modified procedure:
FStrLen proc string:dword
mov ecx, [esp+4]
xor eax, eax ; clear initial size
@@:
mov edx, [ecx+4*eax]
lea edx, [edx-1010101h] ; sub 1 from each byte in eax
add eax,1 ; ready for next dword
and edx, 80808080h ; check sign bit in all byte
jz @B ; more to do until we have a sign
test edx, 000000FFh ; is byte-1 zero?
jnz C_minus4 ;
test edx, 0000FF00h ; is byte-2 zero?
jnz C_minus3 ;
test edx, 00FF0000h ; is byte-3 zero?
jnz C_minus2 ;
C_minus1:
lea eax,[4*eax-1]
ret 1*4
C_minus2:
lea eax,[4*eax-2]
ret 1*4
C_minus3:
lea eax,[4*eax-3]
ret 1*4
C_minus4:
lea eax,[4*eax-4]
ret 1*4
FStrLen endp
I should point out that this procedure isn't always safe. If the final null byte of the string is at the end of an allocated memory segment then the last dword mov could cause a general protection fault if the string wasn't aligned on a 4-byte boundry. To make it safe, more code would be needed up front to do things a byte at a time until 4-byte alignment was assured. It would probably still be faster on strings larger than 6 or 7 bytes but it would slow it down a bit. Finally, the timings on a 996 MHz P3:
Bytes 0 1 2 3 4 5 15 16 17 255 1023
===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
FStrLen 6 9 9 9 9 15 16 18 21 209 785
StrLen 15 15 16 15 18 18 25 28 28 278 1046
StrLen1 12 12 12 12 16 16 23 25 25 276 1042
StrLen2 11 10 13 13 15 14 25 27 26 286 1058
szLen 8 10 12 14 17 19 32 35 36 465 1808
szLen1 5 7 9 11 13 15 28 30 31 374 1449
szLen2 5 8 11 11 13 15 29 30 32 403 1560
szLen3 8 7 9 10 14 15 28 34 31 412 1595
szLen4 5 7 10 10 13 14 29 27 29 353 1349
lstrlen 44 44 46 47 49 62 92 95 98 812 3120
The crossover point where it becomes consistently faster might be higher than 3 bytes on some machines but I think it's a general win for average and large string sizes. Thanks for posting the algorithm and also the szlen program that I also modified and renamed fszlen to avoid confusion.
[attachment deleted by admin]
I changed the lea adjustments to sub's because they should be faster on many machines.
FStrLn2 proc string:dword
mov ecx, [esp+4]
xor eax, eax ; clear initial size
@@:
mov edx, [ecx+eax]
lea edx, [edx-1010101h] ; sub 1 from each byte in eax
add eax, 4 ; ready for next dword
and edx, 80808080h ; check sign bit in all byte
jz @B ; more to do until we have a sign
test edx, 000000FFh ; is byte-1 zero?
jnz C_minus4
test edx, 0000FF00h ; is byte-2 zero?
jnz C_minus3
test edx, 00FF0000h ; is byte-3 zero?
jnz C_minus2
C_minus1:
sub eax,1
ret 1*4
C_minus2:
sub eax,2
ret 1*4
C_minus3:
sub eax,3
ret 1*4
C_minus4:
sub eax,4
ret 1*4
FStrLn2 endp
And the timing, again on a 996 MHz P3
Bytes 0 1 2 3 4 5 15 16 17 255 1023
===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
FStrLen 6 9 10 9 9 15 16 18 21 209 785
FStrLn2 7 8 9 9 18 11 15 19 20 209 785
StrLen 15 15 16 15 18 18 25 28 28 278 1047
StrLen1 11 11 12 12 15 15 21 24 24 231 856
StrLen2 12 12 12 13 14 14 24 26 26 269 986
szLen 8 10 12 14 17 19 32 35 37 464 1808
szLen1 5 7 9 11 13 15 28 28 31 371 1443
szLen2 6 7 9 12 13 15 30 30 31 408 1580
szLen3 5 8 9 10 13 15 28 33 30 413 1589
szLen4 6 7 9 10 13 14 27 30 31 403 1553
Notice that StrLen1 and StrLen2 changed dramatically from the previous version. I carefully removed the new function FStrLn2 from the test and commented the procedure definition so it wouldn't be loaded into memory and the StrLen functions went back to their previous slower timings. It's unusual and I repeated the test by carefully editing the code to put my updated function in and they when back to the more likely speeds that we see here. I don't understand how the position in memory can affect the timing this dramatically.
[attachment deleted by admin]
Phil,
In the short term, try aligning each procedure entry at align 16 and if that does not help, try another dirty trick,
REPEAT 1024
nop
ENDM
After each ALIGN 16 but before the procedure. The idea is to isolate each procedure.
QuoteI should point out that this procedure isn't always safe. If the final null byte of the string is at the end of an allocated memory segment then the last dword mov could cause a general protection fault if the string wasn't aligned on a 4-byte boundry.
It should also be noted that these routines do not work on ascii character > 128
For example, change test string 5 to
str5 db 'X',130,'XXX',0 ; was 'XXXXX',0
run the test and look at the length printed in the headers.
JimG: Thanks for pointing out that many of these functions only work with 7-bit ASCII. I hadn't noticed that until you mentioned it.
Hutch: Thanks for the suggestions on issolating the functions. I'll try the align 16 and modifiy this post later with the results.
I wrote this some time ago, it is for exceeding long strings (>128bytes) using MMX. Note that it is assumed the the entry of the proc is on a paragraph boundary, something that is gauranteed if it is in a static lib using GoAsm. Also the string must start on a 16 byte boundary.
lszLenMMX FRAME pString
mov eax,[pString]
nop
nop ; fill in stack frame+mov to 8 bytes
pxor mm0,mm0
nop ; fill pxor to 4 bytes
pxor mm1,mm1
nop ; fill pxor to 4 bytes
: ; this is aligned to 16 bytes
movq mm0,[eax]
pcmpeqb mm0,mm1
add eax,8
pmovmskb ecx,mm0
or ecx,ecx
jz <
sub eax,[pString]
bsf ecx,ecx
sub eax,8
add eax,ecx
emms
RET
ENDF
To the Ineffable All,
Perhaps all those interested should peruse this link. Ratch
http://board.win32asmcommunity.net/index.php?topic=16299.msg128369;topicseen#msg128369
Very nice Ratch, the fastest one that works on all ascii characters so far.
I couldn't leave good enough alone, of course, so I played a little with your code. First my appologies for messing with your code, I don't get along well with loops and repeats and such, so I coverted it to brute force code to play with.
I'm getting some strange results. My first try only cut a few cycles off ( proc RatchX), but then I inserted a nop in preparation for some other tests, and on my screwy athlon, it made the code run much faster on the large string (proc RatchX2). The nop misaligned the main loop, so it should have made it run much slower. Here are my results:
Bytes 0 1 2 3 4 5 15 16 31 255 1023
===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
StrLen2 11 11 11 11 13 14 22 25 36 283 927
Ratch 8 11 12 15 10 14 26 21 39 243 875
RatchX 8 10 13 13 13 14 22 22 35 234 864
RatchX2 8 11 14 14 14 14 20 21 35 217 800
Press enter to exit...
What's going on here.
If someone with an intel chip would try this out to see if there is a similar effect, I'd appreciate it.
[attachment deleted by admin]
Here are the results on a 996 MHz P3
Bytes 0 1 2 3 4 5 15 16 31 255 1023
===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
StrLen2 9 11 12 13 13 15 24 42 56 301 1047
Ratch 18 25 32 39 22 29 49 31 72 258 893
RatchX 18 24 31 33 21 29 41 31 63 253 884
RatchX2 19 23 30 33 20 27 41 33 68 247 881
And a complete pass for all procedures included in this thread.
Proc/Bytes 0 1 2 3 4 5 15 16 17 31 1023
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
FStrLen 7 7 10 8 18 10 17 19 19 43 786
FStrLn2 7 8 9 8 13 11 15 19 20 43 786
Ratch 18 25 32 39 22 29 49 31 38 72 893
RatchX 18 24 31 33 21 29 41 31 38 63 883
RatchX2 19 23 30 33 20 27 41 33 37 68 882
StrLen 15 15 16 16 18 18 25 28 28 53 1047
StrLen1 12 12 12 12 16 16 23 25 25 53 1042
StrLen2 9 11 11 13 13 15 24 39 27 56 1047
szLen 8 10 12 14 17 19 32 49 48 73 1808
szLen1 5 7 9 11 13 15 28 30 32 65 1566
szLen2 6 7 9 11 13 15 29 43 45 67 1580
szLen3 6 6 10 10 12 14 27 51 72 65 1571
szLen4 5 7 8 10 14 14 27 30 46 63 1552
lstrlenA 48 41 44 45 48 62 91 94 97 139 3117
Also, I defined a fld$ macro that produces a fixed length field and allows right alignment. The code is list driven so it is easier to modify. The complete source is included in the zip but here are the defining elements that show the idea:
include \masm32\include\masm32rt.inc ; defaults to .386
.586
include timers.asm ; requires a pentium
LOOPCOUNT equ 1000000
PROCS TEXTEQU <FStrLen,FStrLn2, \
Ratch,RatchX,RatchX2, \
StrLen,StrLen1,StrLen2, \
szLen,szLen1,szLen2,szLen3,szLen4, \
lstrlen >
SIZES TEXTEQU <0,1,2,3,4,5,15,16,17,31,1023>
%FOR proc,<PROCS>
proc proto :DWORD
ENDM
MAXWIDTH equ 20
HDRWIDTH equ 10
COLWIDTH equ 5
.data
%FOR len,<SIZES>
align 16
str&len& db len dup ('X'),0
ENDM
fld$ MACRO DDpointer,DDwidth,DDalign:=<0>
LOCAL rvstring
.data
rvstring db MAXWIDTH+4 dup (0)
align 16
.code
invoke sxFld,reparg(DDpointer),ADDR rvstring,DDwidth,DDalign
EXITM <ADDR rvstring>
ENDM
.code
start:
print fld$(chr$("Proc/Bytes"),HDRWIDTH)
%FOR len,<SIZES>
invoke StrLen,ADDR str&len&
print fld$(ustr$(eax),COLWIDTH,1)
ENDM
print chr$(13,10)
print fld$(chr$("=========="),HDRWIDTH)
%FOR len,<SIZES>
invoke StrLen,ADDR str&len&
print fld$(" ====",COLWIDTH)
ENDM
print chr$(13,10)
%FOR proc,<PROCS>
print fld$(chr$("&proc&"),HDRWIDTH)
%FOR len,<SIZES>
counter_begin LOOPCOUNT, HIGH_PRIORITY_CLASS
invoke proc,ADDR str&len&
counter_end
mov ebx,eax
print fld$(ustr$(ebx),COLWIDTH,1)
ENDM
print chr$(13,10)
ENDM
mov eax,input(13,10,"Press enter to exit...")
exit
[attachment deleted by admin]
Here are the timings on my Prescott PIV.
Bytes 0 1 2 3 4 5 15 16 31 255 1023
===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
StrLen2 6 9 8 6 10 9 27 25 38 302 862
Ratch 1 1 13 4 4 6 19 13 43 260 910
RatchX -1 0 1 1 7 3 16 13 30 256 910
RatchX2 -1 2 1 1 3 4 13 13 30 248 882
Hutch-
Strange. The misaligned code is faster on yours also. Not as dramatic, but suprising anyway. Thanks.
Ratch, RatchX, RatchX2, StrLen1, and StrLen2 all seem to be quite sensitive to alignment. For some reason FStrLen and FStrLen1 were not. The following table was produced by loading 13 different copies of each procedure into memory at various alignments. Before each procedure is:
align 16
repeat PAD
nop
endm
Ratch&PAD& proc arg:DWORD ; procedure definition and entry begins here.
.... full procedure definition
Ratch&PAD& endp
The &PAD& appends the pad value to the procedure name to avoid duplicate names as the same procedure is loaded with different padding. This is the last of PAD values from 4 to 16. The values are easily alterable in the source if you care to play with them. I was quite surprised because the 16-byte alignment doesn't always seem to be best. Also, the timings are for a 1023 dup('X') string that was used in many of the preceeding tests and the different timings at various alignments help explain wide differences as we would add or remove the 250 ms sleep which changed the alignment.
Pad nops 4 5 6 7 8 9 10 11 12 13 14 15 16
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
Ratch 1067 1066 1332 1077 1335 1333 1333 1334 1078 889 1067 883 893
RatchX 1061 1061 1313 1059 1313 1313 1313 1313 1060 882 1059 872 883
RatchX2 1061 1313 1058 1312 1312 1312 1314 1059 885 1059 879 880 882
StrLen1 1298 1046 1048 1046 1043 870 873 874 859 1046 1046 1043 1043
StrLen2 987 978 977 992 1059 1059 1058 1059 1302 1302 1302 1302 1048
RatchX2 will be fastest when it is preceeded by 'align 16' followed by 14 or 15 'nops'.
StrLen2 is fastest when preceeded by 'align 16' followed by 2 nops, etc.
I don't understand but it is interesting!
[attachment deleted by admin]
Nice technique Phil, I kept thinking I was gonna get around to something similar :U
Here are my results, except I changed 16 to zero to test against original configuration:
Pad nops 4 5 6 7 8 9 10 11 12 13 14 15 0
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
Ratch 1058 904 1063 1060 1058 1059 1189 1059 933 892 893 809 873
RatchX 1056 894 1053 1052 1055 1054 1180 1054 929 888 884 801 864
RatchX2 887 1052 1052 1053 1057 1180 1056 926 881 884 798 800 800
StrLen1 1058 931 928 930 927 870 865 866 865 1053 1056 1054 1056
StrLen2 909 910 913 910 1007 1007 1008 1007 1122 1122 1122 1122 930
I would have thought 12 pads on the Ratch procs would have been the best since it would align the loop at 16 byte, but didn't work out that way. Still a mystery here :dazzled:
Also, I think looping a million times is probably not the best indication of how fast a routine can be used in a normal program. Perhaps calling each routine once in sequence, loop back, call each one again, etc., keeping a running average?
JimG: Doing what you suggest would well produce a better indication of how the code might perform in a typical program. However, I think doing it like this is probably better when comparing the raw speed of an algorithm. It produces values which can be reproduced and that is very important when we are trying to tune, or slightly de-tune, our code for greater performance! Like you said, we still have mysteries here. I still don't know how mis-aligning a loop by adding a nop can make it run faster ... but it certainly seems to be the case occasionally!
Donkey: Finally got your MMX strlen for long strings into the test procedure. Sorry it took me so long to get around to it. As you said earlier, the strings must begin at a 16-byte boundry and it gains it's speed using MMX to load and examine 8-bytes at a time. Great job! It certainly seems to fly! I've added some commentary at the end of this block ... please let me know if I've missed anything or mis-understood anything. Here's your routine in MASM format:
align 16
lszLenMMX proc pString:DWORD ; Donkey's MMX strlen for long strings
.mmx
.xmm
mov eax,[esp+4]
nop
nop ; fill in stack frame+mov to 8 bytes
pxor mm0,mm0
nop ; fill pxor to 4 bytes
pxor mm1,mm1
nop ; fill pxor to 4 bytes
@@: ; this is aligned to 16 bytes
movq mm0,[eax]
pcmpeqb mm0,mm1
add eax,8
pmovmskb ecx,mm0 ; edit: by Phil - is this Polish instruction?
or ecx,ecx ; ... ah, here's one that i understand!
jz @B
sub eax,[esp+4]
bsf ecx,ecx
sub eax,8
add eax,ecx
emms
ret 1*4
lszLenMMX endp
First, the pxor's clear both mm0 and mm1. Then 8-bytes are loaded into mm0 with movq. Next the index is updated and pcmpeqb compares all 8-bytes to the zeros stored in mm1. If any of the 8-bytes are equal to nul then the corresponding byte in mm0 is set to all ones. Otherwise, the corresponding byte in mm0 is set to all zeros. Finally, the pmovmskb (pmov mask byte) copies the most significant bit of each byte in mm0 into the destination register which is ecx. If any of the bits are set then the end of the zero terminated string has been found. The bsf instruction scans forward (right-to-left) thru the bits in ecx and returns the index of the first bit that was set. The index returned would be 0 if bit 0 was set, indicating that the terminator was in the first (least significant) byte of the 8-bytes being checked. Subtracting 8 and adding the index to the difference between the original and final pointers produces the string length that is returned in eax!
Here are the timings for all routines now included in timesz.asm using the longer strings. Again, many of them only support 7-bit ASCII. Donkey's lszLenMMX procedure nicely handles 8-bit extended ASCII and there is no danger of page boundry over-run at the end because the source string must be aligned on a 16-byte boundry. Thanks for sharing your MMX code with us Donkey!
Proc/Bytes 0 1 2 3 4 5 15 16 17 127 255 1023 2047
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenMMX 16 16 16 16 16 16 20 22 22 76 122 411 795
FStrLen 7 7 10 8 18 10 17 19 19 113 210 785 1554
FStrLn2 7 8 9 9 18 11 15 19 20 113 210 785 1555
Ratch 18 25 32 39 22 29 49 31 38 148 259 895 1746
RatchX 18 24 31 33 21 29 41 31 38 142 253 884 1721
RatchX2 19 23 31 33 20 27 41 33 37 146 248 881 1722
StrLen 15 15 16 16 18 18 25 28 28 150 278 1047 2071
StrLen1 11 11 12 12 15 15 21 24 24 127 231 857 1701
StrLen2 13 13 12 14 15 15 24 27 27 142 264 978 1929
szLen 8 10 12 14 17 19 32 49 47 241 464 1809 3604
szLen1 5 7 9 11 13 15 28 30 43 210 406 1566 3113
szLen2 6 7 9 12 13 15 29 30 45 212 408 1580 3149
szLen3 6 6 10 10 12 15 27 55 61 213 409 1571 3148
szLen4 5 7 8 10 14 14 26 30 30 208 399 1551 3090
lstrlenA 48 41 44 45 48 62 91 94 97 427 812 3117 6192
scasx 45 48 53 57 59 63 102 105 109 548 1055 4109 8179
The slowest algorithm by far is the 'repnz scasb' cutie that I also added to this version. The files in the zip have the same names as the previous timesz.asm. Be sure to rename your older files if you'd like to save them before you extract from this archive.
[attachment deleted by admin]
I think it is wrong to call donkey's routine MMX though it uses MMX register as pmovmskb is an SSE opcode and not part of the original MMX instruction set. Therefore older processors without SSE would be unable to use the routine.
roticv: Thanks for pointing that out that it does require SSE. I hadn't realized that until I started reading about the instructions to figure out how it worked and then I failed to mention it in the commentary. It would also have be nice if I had posted the info link (http://www.tommesani.com/SSEPrimer.html#PMOVMSKB) since it clearly identified it as an SSE instruction.
Now that the contest for the main loop seems to be decided in favor of
the aligned dword search implementation, how about a little different
spin on the code around it--namely aligning the search pointer, and
locating the zero in the last dword.
The idea (as always) is to do things in the biggest chunks possible
and eliminate 8-bit operations completely from the code. The cpu does
like it when you switch between 8/32 bit modes in her registers, and
will take it out on your clock counts without telling you.
So, to align the search, we back up :eek to the dword we are starting in,
load the dword, stuff 1's into the leading bytes to skip over, then drop
into the middle of the normal dword search and continue. It is faster than
the byte search method on three bytes, it is likely slower than a 1-byte
search.
To locate the zero in the last dword, we handle the upper and lower halves
of the dword separately, computing the final lengths with straight 32-bit
operations directly. This new method seems to have lower clocks than any
other method I have seen.
Incidentally, I should point out that strlen( ) is a special case of memchr( ),
that searches memory for any character, not just zero. This work on strlen( )
can easily be extended for a fast memchr( ) implementation.
Please forgive the C++ decoration, I use this routine as a direct
replacement for strlen( ) in my C++ work--enjoy...
// search to find length of a null-terminated string
// fast performance: strings of 1/10/100/1000 bytes require 7/13/103/776 cycles (Athlon64)
int __declspec( naked ) szLength (const void* src) {
_asm {
mov edx, [esp + 4] ; point edx to start of string
test edx, 3
jz lenscan ; branch ahead if already aligned
mov ecx, edx
and edx, ~3 ; edx points to aligned addr
and ecx, 3 ; ecx = bytes to skip
shl ecx, 3 ; ecx = bits to skip
mov eax, 1
shl eax, cl ; put cl 1's into eax from the bottom up
sub eax, 1
or eax, [edx] ; combine with first four bytes
jmp resume ; catch up with the aligned search...
align 4
lenscan: mov eax, [edx] ; load next four bytes into eax
resume: add edx, 4 ; advance ptr
lea ecx, [eax - 1010101h]
not eax ; for each byte in ecx: (charval-1) & ~charval & 80h
and eax, ecx
and eax, 80808080h
jz lenscan ; repeat while no zeros found
sub edx, [esp + 4] ; subtract the base address
test eax, 8080h ; test first two bytes
jz upper2 ; jmp if not found in the lower 2 bytes
shr eax, 8 ; set carry from bit7 of 1st byte
sbb edx, 3 ; edx = (edx-4) + (1-carry)
mov eax, edx ; return as the result
ret
upper2: shl eax, 9 ; set carry from bit7 of 3rd byte
sbb edx, 1 ; edx = (edx-2) + (1-carry)
mov eax, edx ; return as the result
ret
}
}
]
Codewarp: I haven't taken the time yet to have a close look at your algorithm but I stripped the headings and added it to the test set. I also removed the other tests that are not listed here. The FStrLen figures may not be a fair comparison because it only does 7-bit ASCII ... Ratch, and lszLenSSE both do 8-bit extended ASCII. Here are the results for various string sizes.
Proc/Bytes 0 1 2 3 5 8 13 21 34 55 89 144 233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE 16 16 16 16 17 19 20 22 28 48 63 86 116
FStrLen 7 7 10 9 10 13 16 37 47 59 86 128 194
Ratch 18 25 32 39 29 25 35 51 66 92 105 144 227
szLength 19 20 19 19 23 25 30 51 60 81 121 176 264
szLen 8 10 12 14 19 22 30 56 77 115 175 270 429
Tune it up a bit if you like and see if you can top Ratch! I'll have a closer look at your algorithm later. I used it to print the lengths that are displayed in the header so I know it works, at least with 7-bit ASCII.
[attachment deleted by admin]
I think Codewarp's idea was to make a faster routine for unaligned strings. I inserted 1-3 bytes before each test string using for example %FOR len,<SIZES>
align 16
db 0
str&len& db len dup ('X'),0
ENDM
to see the effect and got:
Aligned
Proc/Bytes 0 1 2 3 5 8 13 21 34 55 89 144 233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE 25 25 25 25 25 28 28 32 38 47 82 117 169
FStrLen 6 8 10 9 11 12 16 21 32 61 89 132 199
Ratch 8 11 12 15 14 14 20 28 39 78 101 142 221
szLength 10 11 10 11 13 17 20 28 38 77 112 168 257
szLen 8 9 13 12 18 22 28 38 54 92 139 207 321
Misaligned by 1 byte
Proc/Bytes 0 1 2 3 5 8 13 21 34 55 89 144 233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE 28 28 28 28 28 30 30 37 41 55 90 126 178
FStrLen 6 8 10 9 11 12 17 24 35 72 102 150 230
Ratch 8 11 12 15 18 15 23 30 40 85 109 154 241
szLength 14 13 13 16 17 20 22 32 44 85 116 174 262
szLen 8 9 13 13 18 22 28 39 57 93 139 207 323
Misaligned by 2 bytes
Proc/Bytes 0 1 2 3 5 8 13 21 34 55 89 144 233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE 28 28 28 28 28 30 30 37 41 55 89 125 177
FStrLen 6 8 10 9 11 12 17 24 35 71 102 149 229
Ratch 8 11 12 15 18 16 23 30 40 85 109 155 240
szLength 14 13 16 16 17 20 22 32 64 84 116 172 262
szLen 8 8 13 13 18 22 28 39 54 93 139 208 328
Misaligned by 3 bytes
Proc/Bytes 0 1 2 3 5 8 13 21 34 55 89 144 233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE 28 28 28 28 28 30 30 35 45 54 91 123 174
FStrLen 6 7 10 9 11 12 17 25 35 71 102 149 229
Ratch 8 11 12 15 18 15 23 30 41 85 109 154 240
szLength 13 16 16 17 21 21 29 38 64 83 121 172 266
szLen 8 9 13 13 18 22 28 38 54 93 139 207 321
So his code is less affected by misalignment than the others.
Jimg: Nice work! I am unclear as to nature of this Ratch algorithm. But it doesn't
matter anyway, since further work on szLength( ) makes it the fastest :dance:. You were
right--my algorithm is intended to enhance performance of both misaligned strings
and of short strings. The new version below ratchets up :wink the performance for long
strings with a little tuck-and-unroll. I have also exchanged the roles of
eax and edx, to avoid one extra mov instruction at the end of every call.
; search to find length of a null-terminated string
; fast performance: strings of 1/10/100/1000 bytes require 7/13/101/653 cycles (Athlon64)
__declspec( naked ) int szLength (const void* src) {
_asm {
mov eax, [esp + 4] ; point eax to start of string
test eax, 3
jnz fixalign ; jmp to fix misalignment
align 4
lenscan: mov edx, [eax] ; load next four bytes
resume: lea ecx, [edx - 1010101h]
not edx ; on each byte in ecx: (byte-1) & ~byte & 80h
and ecx, edx
and ecx, 80808080h
jnz found ; branch if found
mov edx, [eax + 4] ; load next four bytes
add eax, 8 ; advance ptr (twice)
lea ecx, [edx - 1010101h]
not edx ; on each byte in ecx: (byte-1) & ~byte & 80h
and ecx, edx
and ecx, 80808080h
jz lenscan ; repeat while no zeros found
sub eax, 4 ; back off to last dword
found: sub eax, [esp + 4] ; subtract the base address
test ecx, 8080h ; test first two bytes
jz upper2 ; jmp if not found in the lower 2 bytes
shr ecx, 8 ; set carry from bit7 of 1st byte
sbb eax, -1 ; return eax = eax + (1-carry)
ret
upper2: shl ecx, 9 ; set carry from bit7 of 3rd byte
sbb eax, -3 ; return eax = (eax+2) + (1-carry)
ret
fixalign: mov ecx, eax
and eax, ~3 ; eax points to aligned addr
and ecx, 3 ; ecx = bytes to skip
shl ecx, 3 ; ecx = bits to skip
mov edx, 1
shl edx, cl ; put cl 1's into edx from the bottom up
sub edx, 1
or edx, [eax] ; combine with first four bytes
jmp resume ; start up in the aligned search
}
}
Codewarp-
On first test, it seems much faster. Unfortunately, it's crashing on misaligned strings. I haven't had a chance to figure out why yet.
Later...
ok, it the last fixalign instruction-
or edx, [eax] ; combine with first four bytes
Jimg: I had expected that there would have been trouble in your earlier alignment tests with the lszLenSSE routines. I had thought that it required 16-byte alignment for the strings. Anyway, I saw that it ran okay for you and I haven't looked into it further. I just wanted to say something in case the crashes you are encountering are caused by it and not szLength.
Quote from: Jimg on June 21, 2005, 12:59:37 AM
Codewarp-
On first test, it seems much faster. Unfortunately, it's crashing on misaligned strings. I haven't had a chance to figure out why yet.
Later...
ok, it the last fixalign instruction-
or edx, [eax] ; combine with first four bytes
Jimg -
I have been unable to reproduce any crash, nor can I find any erroneous result. That is not to say there isn't one,
but I can assure you that the
or edx, eax is the essence of the fixalign, and not an oversight. What it does, is
to fill the beginning ragged edge with 1's just in case those bytes contained zeros. Remember, on misalignments, I
back-up the pointer to the starting dword, then I force 1-2-3 bytes to 1's, depending on the misalignment.
I have single-stepped through this process and found that it does exactly what it should (at the register level).
I am calling it from within larger software that passes a diversity of string lengths and alignments to it, and it all executes
without any apparent difficulty.
I am eager to repair this algorithm if it is indeed in error, but I need some evidence to take your claim seriously. :naughty:
Please be sure that you have included all my lines. Note that the VC++ compiler places all routines on 16-byte
boundaries--your tests should do the same. I see that same unexpectedly large swings in clock counts with only
minor changes in code and entry point alignments, reported in many postings.
Codewarp: Here's an updated zip including your recent modifications. The results weren't all that different from your previous version but in these tests all strings and procedures are aligned on 16-byte boundries.
Proc/Bytes 0 1 2 3 5 8 13 21 34 55 89 144 233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE 16 16 16 16 16 19 20 22 28 48 63 86 116
FStrLen 7 7 10 9 10 13 16 35 47 59 86 128 194
Ratch 18 25 32 39 29 25 35 52 66 92 105 144 227
szLength 19 20 19 19 23 25 30 51 60 81 121 176 265
szLength2 19 19 19 19 23 24 30 37 47 81 116 173 261
szLen 8 10 12 14 19 22 30 56 77 115 175 270 428
Jimg: Could you zip up and post your recent test source and results so we could use the same methods to test various alignments? I know you said that you just added 1-3 nops before each string in your runs and that sounds easy enough but the actual source would be helpful in case something unusual is happening.
[attachment deleted by admin]
Phil--
I am getting different clock counts than you are, which is likely due to different CPUs (mine=athon64, yours=?) and
different benchmarking technique. What I am seeing with the different strlen( ) methods is this:
(1) The DWORD-101010h & ~DWORD method is the basis for all the highest performing strlen( ) methods. I
use it in all of my implementations, as does Rachet (i.e. identical code).
(2) This method takes 15-25% longer on long misaligned strings, without realignment. This loss occurs in all
implementations of it, but does not seem to show up well in your tests.
(3) By unrolling the loop for 8 bytes per iteration, the method gains noticable speed, using 15-25% less time. This
does not show up at all on your tests, but I was testing 1000 byte searches, yours were 233.
(4) Handling misalignment costs 2-3 cycles up front, on every call, misaligned or not. On misaligned calls it
costs an additional 4-5 clocks. This 6-8 cycle overhead is paid by the first 3 or 4 dwords. The overhead
of byte-at-a-time realignment is excessive after the first byte because of the jmps involved. Memory
misalignment costs vary between processors, but typical "abc" quoted strings are not aligned.
(5) Cost of locating the byte in the last dword shows up in short strings of a few bytes. Short strings are far more
commonly passed to strlen( ) than long strings. The fewest number of jmps to do this runs the fastest.
My benchmark practice is as follows:
(1) t1 = clocks for 1000 empty, baseline iterations
(2) t2 = clocks for 1000 iterations of a target routine
(3) t3 = overhead reading the clock
(4) report (t2 - t1 - t3)/1000 as the time for the target routine.
I run this whole thing at least three times, and note the result stability, rerunning as needed to get a stable
lowest clock count benchmark. This all assures that I am not simply measuring the efficiency of my memory
system (or of interrupt processing), but getting as close as possible to the algorithmic cost itself. I have
validated this approach by benchmarking routines of known cost, and observing just that.
I believe it is wise to be suspicious of all benchmarks, questioning their basis and understanding their limitations.
A vigorous discussion of these topics will benefit everyone.
I certainly agree that discussion is key to understanding what's happening here. I plugged your unrolled code into the test as szLength2 and didn't see much of a difference as indicated by the result. I'm testing on a 996 MHz P3 and the trials I see are very consistent on this machine. I'm using MichaelW's timer macros that just repeat an invoke szLength the number of times specified by LOOP_COUNT which then returns the average number of CPU clocks for each iteration as the result. I've added complexity by defining list driven macros that, hopefully, make it easier to modify the tests and add procedures. As far as I can tell there is no compensation in MichaelW's timer macros to discount the result based on the overhead costs of reading the clock. The math to compute the averages is obviously done outside the loop after the final clock is read.
Are you able to assemble and run these tests directly on your machine? The only part of the source that I did not include are Michael's timer macros that I'll zip up and include here for your convenience. They are also posted in several other places and have been used and tested extensively. We do, however, see unusual results from time to time but I don't think it's associated with his macros.
I'd be glad to run your benchmark on my machine if you care to post the source and executable. I can see that unrolling the loop should indeed make a big difference in the timing as you have said.
[attachment deleted by admin]
Phil,
My macros compensate for the loop and timing overhead by timing an empty loop that is otherwise identical to the test loop, and subtracting the cycle count for the empty loop from the cycle count for the test loop.
hmmmm,
Quote
My benchmark practice is as follows:
(1) t1 = clocks for 1000 empty, baseline iterations
(2) t2 = clocks for 1000 iterations of a target routine
(3) t3 = overhead reading the clock
(4) report (t2 - t1 - t3)/1000 as the time for the target routine.
I run this whole thing at least three times, and note the result stability, rerunning as needed to get a stable
lowest clock count benchmark. This all assures that I am not simply measuring the efficiency of my memory
system (or of interrupt processing), but getting as close as possible to the algorithmic cost itself. I have
validated this approach by benchmarking routines of known cost, and observing just that.
This is in fact an interesting notion but I am wary of what is left as it will still depend on the opcode implimentation from processor to processor which differ substantially over time and between different manufacturers. Usually the reference to a known code is more useful but this also has its limitations in that an algo that is fast one one machine can be slow on another if its written to use a specific characteristic of one form of hardware.
Memory speed is of course a factor but on the same box testing two different routines, one known and the other developmental there is no advantage or disadvantage to either. What I am inclined to trust is algo comparison on a range of different boxes with different processors to see which works better on what box which is the basics of writing mixed model code that is general purpose.
Codewarp-
Sorry, I misinterpreted one of your instructions-
and eax, ~3
I haven't seen the tilde uesd before, and thought it was just a spurious character and deleted it :red. Phil fixed it properly-
and eax, 0FFFFFFFCh ; ~3 ; eax points to aligned addr
Works perfectly so far and is the fastest (at least on my machine).
Phil-
The code I used is identical to yours, I just ran it four separate times, inserting one more byte in the string definition macro each time as I said, I didn't automate it at all-
%FOR len,<SIZES>
align 16
db 0 ;,0,0
str&len& db len dup ('X'),0
ENDM
Phil-
QuoteI certainly agree that discussion is key to understanding what's happening here. I plugged your unrolled code into the test as szLength2 and didn't see much of a difference as indicated by the result. I'm testing on a 996 MHz P3 and the trials I see are very consistent on this machine.
The routine is definately the fastest on my athlon (other than the sse code). I've seen these timing differences between the P3 and Athlons before....
Here are the two block of code that that deserve serious notice. I hope this may be included into your (today's) advancements on this subject ... I founded it though serious searching. Could this be included in your test. We all really need to see the results here.
Thank you
Jens_Duttke_StrLen proc PROC Source:DWORD
mov ecx, Source
@@:
mov eax, dword ptr [ecx]
add ecx, 4
lea edx, [eax - 01010101h]
xor eax, edx
and eax, 80808080h
jz @B
and eax, edx
jz @B
bsf edx, eax
sub edx, 4
shr edx, 3
lea eax, [ecx + edx - 4]
sub eax, Source
RET
Jens_Duttke_StrLen endp
Jens_fast_strlen PROC item:DWORD
mov ecx, item
@@:
mov eax, dword ptr [ecx]
add ecx, 4
lea edx, [eax - 01010101h]
xor eax, edx
and eax, 80808080h
jz @B
and eax, edx
jz @B
bsf edx, eax
sub edx, 4
shr edx, 3
lea eax, [ecx + edx - 4]
sub eax, item
RET
Jens_fast_strlen ENDP
Results on Athlon XP 3000+
Test routines for correctness:
lszLenSSE 0 1 2 3 5 8 13 21 34 55 89 144 233
FStrLen 0 1 2 3 5 8 13 21 34 55 89 144 233
Ratch 0 1 2 3 5 8 13 21 34 55 89 144 233
szLength 0 1 2 3 5 8 13 21 34 55 89 144 233
szLen 0 1 2 3 5 8 13 21 34 55 89 144 233
Jens_fast_ 0 1 2 3 5 8 13 21 34 55 89 144 233
Strings aligned:
Proc/Bytes 0 1 2 3 5 8 13 21 34 55 89 144 233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE 25 26 22 25 25 28 29 32 38 48 80 118 169
FStrLen 6 7 12 12 11 13 16 21 32 63 88 135 201
Ratch 7 12 12 15 14 14 20 29 39 77 101 142 220
szLength 9 10 9 10 11 15 16 26 34 49 90 132 198
szLen 7 8 13 13 18 23 28 38 54 91 140 207 323
Jens_fast_ 20 20 20 20 21 27 29 36 46 69 99 145 217
Strings misaligned by 1 byte:
Proc/Bytes 0 1 2 3 5 8 13 21 34 55 89 144 233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE 30 28 28 28 29 30 30 37 42 55 90 124 179
FStrLen 6 7 9 8 11 12 17 26 35 71 102 150 230
Ratch 8 10 11 15 18 16 23 31 40 86 108 156 241
szLength 13 14 14 15 15 18 21 28 37 56 96 137 207
szLen 9 9 12 13 18 22 29 39 54 93 140 207 322
Jens_fast_ 21 21 20 20 28 29 33 42 50 77 107 156 234
Strings misalinged by 2 bytes:
Proc/Bytes 0 1 2 3 5 8 13 21 34 55 89 144 233
========== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
lszLenSSE 28 28 28 28 28 30 30 37 41 55 90 126 178
FStrLen 6 7 9 10 10 12 19 25 35 72 102 150 229
Ratch 7 11 12 15 18 15 23 30 40 85 109 155 240
szLength 14 14 15 15 15 19 21 27 52 54 96 139 206
szLen 8 9 13 13 18 22 28 38 54 94 139 207 323
Jens_fast_ 20 21 20 20 29 29 33 42 51 77 107 157 235
Jimg, I made a mistake and posted identical Jens Duttke code. Below is the one that was supposed to be slower. Funny it gave slightly difference results for the same code. Could it be back to back run in. I guest it really don’t matter seeing that FstrLen is the fastest anyway. This is really great.
Also i see you caught the flaw.
Thanks a lot for displaying the results quickly
Jens_Duttke_StrLen proc PROC item:DWORD
mov ecx, item
@@:
mov eax, dword ptr [ecx]
add ecx, 4
lea edx, [eax - 01010101h]
xor eax, edx
and eax, 80808080h
and eax, edx
jz@B
bsf edx, eax
sub edx, 4
shr edx, 3
lea eax, [ecx + edx - 4]
sub eax, item
ret
Jens_Duttke_StrLen endp
Phil- Ok, here is a version that tests the string misalignment automatically. I also added a print to verify that the routines were working correctly, and I added a string with all the possible ascii characters (the 999 string). As you can see, the FStrLen routine stops at the first ascii character over 128 an so it's cycle counts for that string are not correct.
Test routines for correctness:
lszLenSSE 0 1 2 3 5 8 13 21 34 55 89 144 233 999
FStrLen 0 1 2 3 5 8 13 21 34 55 89 144 233 128
Ratch 0 1 2 3 5 8 13 21 34 55 89 144 233 999
szLength 0 1 2 3 5 8 13 21 34 55 89 144 233 999
szLen 0 1 2 3 5 8 13 21 34 55 89 144 233 999
Jens_fast 0 1 2 3 5 8 13 21 34 55 89 144 233 999
Proc/Byte 0 1 2 3 5 8 13 21 34 55 89 144 233 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
Misaligned by 0 bytes:
lszLenSSE 25 25 27 25 25 28 29 32 38 47 82 118 168 596
FStrLen 6 8 9 9 11 12 16 21 33 60 89 131 199 119
Ratch 9 10 12 15 14 14 20 29 39 77 101 142 220 857
szLength 10 8 9 10 11 15 18 26 33 64 90 133 199 784
szLen 6 11 12 15 19 23 30 61 80 93 140 208 323 1293
Jens_fast 20 20 20 21 21 26 29 36 44 69 100 146 217 923
Misaligned by 1 bytes:
lszLenSSE 28 28 28 28 28 30 31 33 43 55 92 123 179 623
FStrLen 6 8 9 9 11 12 17 25 35 71 101 149 229 135
Ratch 8 11 11 15 18 15 23 30 40 85 108 154 240 955
szLength 14 13 14 15 15 18 21 28 38 55 96 135 205 785
szLen 7 11 12 15 20 23 33 39 54 92 185 208 351 1293
Jens_fast 20 20 20 20 25 28 32 40 48 75 105 154 233 1001
Misaligned by 2 bytes:
lszLenSSE 27 29 27 28 28 31 30 36 41 56 89 127 179 621
FStrLen 6 7 9 9 12 12 17 25 35 73 102 149 228 136
Ratch 8 11 12 14 18 16 23 30 40 87 110 157 241 954
szLength 14 14 15 15 15 21 21 28 40 55 97 140 207 787
szLen 8 11 12 12 18 22 27 40 54 93 139 213 322 1291
Jens_fast 20 20 20 20 24 28 32 40 47 75 105 153 232 994
Misaligned by 3 bytes:
lszLenSSE 28 28 28 28 28 30 31 33 41 56 91 124 177 629
FStrLen 7 8 9 9 10 12 17 27 35 71 103 151 230 136
Ratch 8 11 13 15 18 15 23 31 40 85 110 156 242 953
szLength 14 15 16 15 17 20 24 31 40 55 98 140 207 792
szLen 8 11 12 15 20 23 28 61 69 92 148 208 321 1291
Jens_fast 20 20 21 20 24 28 32 41 47 76 104 154 234 998
Press enter to exit...
[attachment deleted by admin]
Jimg: Thanks for automating ... Especially for the verification routine and string 999 that shows FStrLen 7-bit short-comings!
Here are the results for a 996 MHz P3:
Test routines for correctness:
lszLenSSE 0 1 2 3 5 8 13 21 34 55 89 144 233 999
FStrLen 0 1 2 3 5 8 13 21 34 55 89 144 233 128
Ratch 0 1 2 3 5 8 13 21 34 55 89 144 233 999
szLength 0 1 2 3 5 8 13 21 34 55 89 144 233 999
szLen 0 1 2 3 5 8 13 21 34 55 89 144 233 999
Jens_fast 0 1 2 3 5 8 13 21 34 55 89 144 233 999
Proc/Byte 0 1 2 3 5 8 13 21 34 55 89 144 233 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
Misaligned by 0 bytes:
lszLenSSE 16 16 16 16 17 19 19 22 29 48 63 86 117 404
FStrLen 7 7 10 8 10 13 16 37 47 59 86 128 194 116
Ratch 18 25 32 39 29 25 35 51 66 95 105 144 227 872
szLength 19 19 19 19 23 25 30 37 47 81 116 173 261 1025
szLen 8 10 12 14 19 22 30 56 77 115 175 270 428 1767
Jens_fast 12 12 12 12 15 18 19 38 47 63 88 130 196 870
Misaligned by 1 bytes:
lszLenSSE 16 16 16 16 16 26 19 40 32 54 93 113 171 649
FStrLen 7 7 10 8 10 13 18 53 49 59 87 133 210 125
Ratch 18 25 32 39 29 25 35 74 77 96 124 182 286 1149
szLength 24 25 25 30 29 30 33 41 54 88 120 176 265 1032
szLen 8 10 12 14 19 22 30 56 77 115 175 270 428 1768
Jens_fast 12 12 12 12 15 18 19 63 51 66 99 145 224 981
Misaligned by 2 bytes:
lszLenSSE 16 17 16 16 16 26 19 40 32 54 93 113 251 648
FStrLen 7 7 10 8 10 13 16 53 49 59 87 133 288 124
Ratch 18 25 32 39 29 25 35 73 77 96 124 182 371 1149
szLength 25 25 30 30 29 31 33 41 58 88 120 177 265 1032
szLen 8 10 12 14 19 22 30 56 77 115 175 270 428 1770
Jens_fast 12 12 12 12 15 18 19 63 51 64 99 145 310 978
Misaligned by 3 bytes:
lszLenSSE 16 16 16 16 16 26 19 22 45 60 81 117 184 627
FStrLen 7 7 10 8 10 13 16 37 48 66 96 144 209 123
Ratch 18 25 32 39 29 25 35 51 73 110 135 191 287 1140
szLength 25 30 30 29 30 31 37 45 58 88 125 178 269 1033
szLen 8 10 12 14 19 22 30 59 76 115 176 270 428 1767
Jens_fast 12 12 12 12 15 18 19 38 53 77 106 155 227 1005
Press enter to exit...
Quote from: ic2 on June 21, 2005, 02:05:01 PM
Here are the two block of code that that deserve serious notice. I hope this may be included into your (today's) advancements on this subject ... I founded it though serious searching. Could this be included in your test. We all really need to see the results here.
Thank you
ic2: Interesting algorithm, though is has some shortcomings:
(1) Is seems to work only on 7-bit ascii, not 8-bit.
(2) Its loop uses two jmps instead of one. I believe the first one is unnecessary.
(3) the BSR implementation has been tried and examined thoroughly. It looks so elegant...
Too bad the BSR is such a dog, see the szLength( ) for a better impl. of this tail-end part
of the routine.
(4) No misalignment handling makes this method slow for long misaligned strings.
Quote from: hutch-- on June 21, 2005, 08:35:03 AM
Quote
This is in fact an interesting notion but I am wary of what is left as it will still depend on the opcode implementation from processor to processor which differ substantially over time and between different manufacturers. Usually the reference to a known code is more useful but this also has its limitations in that an algo that is fast one one machine can be slow on another if its written to use a specific characteristic of one form of hardware.
Memory speed is of course a factor but on the same box testing two different routines, one known and the other developmental there is no advantage or disadvantage to either. What I am inclined to trust is algo comparison on a range of different boxes with different processors to see which works better on what box which is the basics of writing mixed model code that is general purpose.
Hutch --
This benchmarking thing really gets down to the heart of the matter, doesn't it? I agree with everything you have said, and it gets right down to what your code is written for. Code tends to stick around, but processors tend to fade away. There simply isn't any way to code something so that is runs the fastest on all CPUs. You have to pick and choose, and to know what your strategy is. Several strategies come to mind:
(1) Separate libraries for each processor
(2) An Intel library, and an AMD library
(3) Single library optimized for the present day hardware, but compatible back to the PII.
(4) Single library like (3), with dynamic inclusion of advanced cpu features (like sse, etc)
(5) Single library optimized with every trick from tomorrows hardware.
Actually, all of these are desirable, each with serious benefits and baggage. However, clients on 5 year old hardware don't tend to complain about software performance too much. It's the one's driving the shiny new XP-zazz that want all that speed. Do you really want to avoid MUL instructions, simply because somebody might run it on a P4? I think not, and as for my own effort, most of it goes in the direction of approach (3)--as in my szLength( ) routine, and in (4) when needed.
I been so pleased with the szLength( ) results, that I turned it into a killer memchr( ) implementation (faster than anything I had before). Memchr( ) is a much more useful function than strlen( ) that can have a bigger impact on overall sofware speed than strlen( ). Should I post this as a new topic, or as further evolution in szLen( ) ??
Quote from: Codewarp on June 21, 2005, 10:16:04 PM
I been so pleased with the szLength( ) results, that I turned it into a killer memchr( ) implementation (faster than anything I had before). Memchr( ) is a much more useful function than strlen( ) that can have a bigger impact on overall sofware speed than strlen( ). Should I post this as a new topic, or as further evolution in szLen( ) ??
My vote would be a new topic. That would allow others to pick up the new discussion from the beginning. We already have a great deal of discussion going on here and a lot to be considered.
Quote from: Codewarp on June 21, 2005, 10:10:02 PM
Quote from: ic2 on June 21, 2005, 02:05:01 PM
Here are the two block of code that that deserve serious notice. I hope this may be included into your (today's) advancements on this subject ... I founded it though serious searching. Could this be included in your test. We all really need to see the results here.
Thank you
ic2: Interesting algorithm, though is has some shortcomings:
(1) Is seems to work only on 7-bit ascii, not 8-bit.
(2) Its loop uses two jmps instead of one. I believe the first one is unnecessary.
(3) the BSR implementation has been tried and examined thoroughly. It looks so elegant...
Too bad the BSR is such a dog, see the szLength( ) for a better impl. of this tail-end part
of the routine.
(4) No misalignment handling makes this method slow for long misaligned strings.
Thanks to JimG's validation it's clear that FStrLen is the only procedure with the 7-bit ASCII limitation. Also, on the P3 I am using Jens_fast is quicker than szLength with all alignments. JimG's results show that szLength is quicker on an Atholon. I'm not sure if that is related to the BSR usage or not. Anyway, that's my two-cents worth for the moment.
Phil,
First of all, thank you for your response to all of this, along with everyone else too, of course.
I wanted to point out some things regarding (what I call) the DWORD search method, which is used by all of the faster strlen( ) implementations. Let's look at logic of it:
[<fix alignment>] optional misalignment fixup
<locate dword> find the dword containing a zero
<locate byte> find the first zero in the dword
<return len> return the byte address - string base
You will notice that the <fix align> is optional, but all other steps are mandatory--you cannot omit any to speed it up without breaking it.
Now, the point of all this that <locate byte> has a variety of implementations, soom good, some not so good, but every call passes through it, so clocks saved here speed up every call :thumbu.
There are a number of methods for <locate byte>:
(1) inc, test and jz each byte (3 times)
(2) bsr div 8
(3) inc, shr 8 and jc each byte (3 times)
(4) separate upper/lower, add 1-bit7 to address
Ratch uses (3), szLength uses (4). I use (4) because substituting the other methods in anybody's implementation will increase clock counts (by 2-5), and because it requires fewer jmps. BSR would be perfect, if it were not so poorly ScotchTaped to the CPU as an afterthought :tdown--its performance is an extreme disapointment. BSR seems marginally useful when you have no idea where the bit of interest resides within the dword. If you know more than that, shifts and masks will be faster. Method (1) looks promising, because no shifts are involved, but both (1) and (3) suffer from having so many instructions.
So, for example, you could take Ratch, substitute its <locate byte> method (3) with (4), and voila, you shave 2 or 3 cycles off every call (for faster short strings). This is where my comments to ic2 came from--no method using BSR will ever beat method (4), unless a future CPU changes things.
Quote from: Codewarp on June 22, 2005, 01:51:36 AM
Ratch uses (3), szLength uses (4). I use (4) because substituting the other methods in anybody's implementation will increase clock counts (by 2-5), and because it requires fewer jmps. BSR would be perfect, if it were not so poorly ScotchTaped to the CPU as an afterthought :tdown--its performance is an extreme disapointment. BSR seems marginally useful when you have no idea where the bit of interest resides within the dword. If you know more than that, shifts and masks will be faster. Method (1) looks promising, because no shifts are involved, but both (1) and (3) suffer from having so many instructions.
So, for example, you could take Ratch, substitute its <locate byte> method (3) with (4), and voila, you shave 2 or 3 cycles off every call (for faster short strings). This is where my comments to ic2 came from--no method using BSR will ever beat method (4), unless a future CPU changes things.
Thank you for your analysis. What you've said makes sense but it doesn't seem to flow with the results I'm seeing on this machine.
Please download the attached zip, browse thru the source to make sure I have incorporated your routine correctly, assemble if you like or run the included exe file and share the results on your machine with us. I bumped LOOP_COUNT back up to 1000000 and ran the test 3 times to make sure my results were consistent. They varied in some cases by 4 or 5 clocks but the trends are quite consistent. Again, for *some reason* Jens_fast is topping szLength in all cases on a 996 MHz P3. I removed the unnecessary jz as you and P1 suggested and it slowed it down considerably for mis-aligned strings. szLength is certainly least affected by the alignments as you can see from these results but all of the other procedures use BSF and Jens_fast is always slightly faster than szLength. The SBB instruction that you use is slower on this machine ... maybe that's the difference?
Proc/Byte 0 1 2 3 5 8 13 21 34 55 89 144 233 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
Misaligned by 0 bytes:
szLength 19 19 19 19 23 24 30 37 47 81 116 173 261 1026
Ratch 18 25 32 39 29 25 35 51 66 92 105 144 227 871
Jens_fast 12 12 12 12 15 18 19 41 47 63 88 130 196 870
Jens_slow 10 10 10 10 15 17 20 41 52 68 99 146 220 849
Misaligned by 1 bytes:
szLength 24 25 25 30 29 30 33 41 54 88 120 176 265 1033
Ratch 18 25 32 39 29 25 35 72 77 96 124 182 286 1150
Jens_fast 12 12 12 12 15 17 20 63 51 65 99 145 224 978
Jens_slow 10 10 10 10 15 17 20 56 62 75 113 181 283 1146
Misaligned by 2 bytes:
szLength 25 25 30 30 29 31 33 41 58 88 120 177 266 1033
Ratch 18 25 32 39 29 25 35 72 77 96 124 182 371 1150
Jens_fast 12 12 12 12 15 18 19 63 51 65 99 145 310 979
Jens_slow 10 10 10 10 15 17 20 53 62 74 113 181 362 1147
Misaligned by 3 bytes:
szLength 25 30 30 29 30 31 37 45 58 88 125 177 269 1032
Ratch 18 25 32 39 29 25 35 51 73 110 135 191 287 1141
Jens_fast 12 12 12 12 15 18 19 38 53 77 106 155 227 1005
Jens_slow 10 10 10 10 15 17 20 41 57 85 128 192 282 1140
To me, it's not about who's got the fastest procedure or algo here ... it's about understanding what some of the differences in our architectures or CPU's are that cause us the see things that don't fully make any sense until we understand why and what's happenin' :dance:
[attachment deleted by admin]
Phil --
I think we are talking about different things. At the moment, let me address the p3 issue... I love the p3, it has everything that is necessary, its fast, and it doesn't heat up the room. But to go ever faster, the silicon guys had to start slanting things. Certain instructions, the basic ones like add, adc, and, or, not, mov, cmp, etc get the serious silicon, while others get the micro-coded put-on. By sticking to the basic set, your code fits into the groove that the CPU has been finely tuned to perform. Add to this, some careful instruction ordering to keep multiple execution units humming, and you have code that executes considerably faster on a contemporary CPU.
The p3 doesn't know how to take advantage of all that. If you want the fastest code on a p3, then--hands down--use a p3-only library and optimize the @#$%@% out of it :bdg! However, my interest is in code that runs the fastest on today's machines, but compatible all the way back to the PII. If that code ran really terrible on a p3 :red, a compromise might be order--but that doesn't appear to be an issue in this case.
======================
By the way, there is actually another idea for an even faster szLength( ):
- start off with the 7-bit search
- when the "zero" if found, return if it really is a zero
- otherwise continue from there with an 8-bit search to completion
For the vast majority of arguments to strlen( ) which are 7-bit sz, the faster search will suffice. But as soon as bit7=1, it would switch over to 8-bit. The 7-bit search would be unrolled like the 8-bit search, so it would be faster than any of the 7-bit impl we have seen so far.
Codewarp: Thanks for the 7-bit to 8-bit suggestion. I've been considering ways that to fit FStrLen so it can handle 8-bit ASCII.
I've also found this All About Strings (http://win32asmcommunity.net/phpwiki/index.php?pagename=AllAboutStrings) link that was written by tenkey, roticv, and others. It also contains many algorithms that aren't in our tests yet.
To make sure we are talking about the same thing, can you download the test suite and post the results on your machine? You said earlier that it's not good to use BSR because it's slow but the routines that are using it in this test suite on the P3 I am using appear to be faster than the one that doesn't. I understand what you are saying about many non-crucial instructions being relagated to microcode and that can, in some instances, slow them down considerably. However, the bit instructions are crucial to many operating systems and the trace cache might just help make it fast enough in short loops like this that it might be okay to use. I'm just looking for results that confirm much of what you are saying. It seems that you are quite happy with szLength as it is and it is faster on the Athlon XP 3000+. I don't recall seeing any results for these recent tests from a PIV yet and I'm curious to know what the results would be. In trying to determine where the differences are I'm guessing that the SBB might be slowing your routine down on my machine ... but then, I think it is also slow on the PIV.
I'm going to play with a new test that incorporates some of the procedures described in the previous link and see if I can fix the FStrLen procedure so that handles 8-bit ASCII. For me, this is all about learning more about the various architectures, limitations, and advantages and certainly what you have said has been quite helpful. Thanks again.
It's okay if you are using Linux and can't run the tests. It's, obviously, okay too if you just don't have the time or if you just don't want to. I had offered earlier to produce the results of your benchmark on this machine if you could zip it up and post it but I obviously can't do that if its not Windows or Dos. It just helps to know some of the story behind the story sometimes. I am reading what you are saying, understanding, and learning as much as I can ... but without an apples to apples comparison of the same procedures in different orchards (various machine architectures) our words are just that. Food for thought.
I would also like to see a new thread for your memchr algorithm as well. I'm sure others would also be interested.
Phil --
What's happening is this: I thought your tests are not valid because Jen-fast/slow are both 7-bit routines. You are pitting 8-bit strlen( ) calls (i.e. szLength( ) and ratch( )) against 7-bit routines, then declaring the 7-bit routines the fastest--that's utter nonsense, I thought. But I had actually misinterpreted Jens as 7-bit, but it was actually 8-bit, creating confusion in my mind--my apologies Phil. The only difference between szLength loop and Jens (now) is szLength uses NOT EDX, and Jens uses XOR EDX, ECX, for the same effect. The NOT is necessary in later processors to avoid a register dependency and subsequent slowdown.
Further, don't get hung up on one SBB instruction at the very end--the loop is where all the action is. BSR remains a poor choice, and you could speed up Jens a tiny amount using the byte locator from my code.
Codewarp: I certainly hope that are not raving mad! Both Jens_fast and Jens_slow handle 8-bit extended ASCII. JimG put in the validation routine before the timing tests and added the 999 byte string with 8-bit ASCII. I removed the 7-bit FStrLen test.
It's okay, Bud. You can can be right and have your cake too. I understand.
QuoteI not taking credit for anything other than finding the link. Here is where I founded Jens Duttke code. It's the biggest discussion ever when it come to stlen in asm. Make sure you have a big pot of coffee ready. You got to read before you start the new thread.
http://board.win32asmcommunity.net/index.php?PHPSESSID=abcf67ef9a161ce95dd0c8f181663739&topic=4058.0
Hmmm. So all the big guns are just sitting back and grinning at us because they went through all this two years ago???
:bg
I would not lose sleep over treading the same ground, if no-one did it you would never get improvements.
:toothy I remember the thread very clearly, but it was nice to see another discussion on it. :green2
You can't expect hutch-- to direct you to threads. I know for sure he wanted to, i could feel it in his first few posts.
Rule one, search the world first own your own. My teacher did not give me the test before i studied. Scientist work from ground up. If it an new discussion going on do you think it should stop just because of an old dead one. Life goes on with new and old members from around the world.
I learned more from this thread than the link i founded, read and posted.
It's the little things that count... it's all about improvements so i hope you will continue or i will never post any thread ever again to help people who seek improvement. I know where everything is but this don't mean i know the meaning of it all. I love searching but i love The Laboratory more.
The laboratory is basically a place for bashing algos to death to get them faster or smaller or smarter or whatever else can be done with them. While not everyone has the time to track the discussions in real detail, battering around the edges of algos is the way they have been made faster over time so for those who have the time and the interest, its a worthwhile passtime as it is basically research that is being shared.
I am much easier to please with string length algos, I prefer a classic byte scanner for general purpose work and when I have the luxury of working with aligned data with a buffer that is safely larger then the source in it, I use the Agner Fog versions as it is a good average performer on most hardware.
One thing that is worth stressing with algos pointed at general purpose work is to try them out across different hardware and you learn all of the joys of writing mixed model code that has to perform reasonably on most hardware.
Dont wanna burst anyones bubble because, of course, Jens code rocked but there was another thread after that...
Jens came in with the original thread. This routine was improved upon by buliaNaza. After that Lingo12 rocked the boat with the last one posted to any thread ive seen.
Should be noted that FStrLen is, basically, the exact same routine as Lingo12's...
The continuance to Jens thread was here for reference:
http://board.win32asmcommunity.net/index.php?topic=8330.msg60805#msg60805
ic2: Wow .. what a link! The thread that just won't die! Looks like FStrLen in this thread came from buliaNaza in 2002 and then modified by Lingo! Small world we live in!
buliaNaza is the same person as lingo if I am not wrong. Oh well.
What got me intersted was timelen.
if so im glad he stuck around. To me buliaNaza is one of THE greatest.
Hope we can see the new results on Intel and AMD and improvements if possible. Old news was igood news in this case.
Good luck
Phil,
Let me take another crack at how it is that Jens is faster on p3 while szLength is faster on Athlon 32/64. The code fragments below are from the heart of the algo for both methods (with identical register for this discussion). The only difference is the NOT ECX vs XOR ECX,EDX in szLength and Jens, respectively.
; Jens method
lea edx, [ecx-01010101h]
xor ecx, edx
and ecx, 80808080h
and ecx, edx
; szLength method
lea edx, [ecx-01010101h]
not ecx
and ecx, 80808080h
and ecx, edx
The problem for the Athlon is the LEA/XOR pair has a register dependency, so LEA has to finish before the XOR can start. The NOT ECX can start at the same time as the LEA, causing its 1 cycle to disappear on every iteration (in Athlons).
The p3 may not be able to take advantage of this opportunity, so its clock cycle mix is determined by other things, like instruction times, pipelining, etc. This is why it is next to impossible to develop single-routines that execute "the fastest" on a broad range of CPUs.
If you are a p3 fan, then Jens is for you. If you are fan of multiple execution units and parallel computation, then methods like szLength are for you. You can't make a judgement on this one, without choosing sides and without being biased. I freely admit to being biased toward using as much of recent advances in CPU architecture as reasonable compatibility will permit, and if there is a better place to draw the line, I would like to hear about it.
Ok then... would someone run the dang thing on a P5??
Jimg -
Here is the timelen for my Athlon64, with the shocking details. Also, I have attached a new timelen3.zip with a new version of szLength that may do better on the p3. I am not currently able to rebuild timelen, so if someone could add the new .exe back to this file, I would be grateful.
Running on Athlon64, family.model.step = 15.4.10
Test routines for correctness:
lszLenSSE 0 1 2 3 5 8 13 21 34 55 89 144 233 999
FStrLen 0 1 2 3 5 8 13 21 34 55 89 144 233 128
Ratch 0 1 2 3 5 8 13 21 34 55 89 144 233 999
szLength 0 1 2 3 5 8 13 21 34 55 89 144 233 999
szLen 0 1 2 3 5 8 13 21 34 55 89 144 233 999
Jens_fast 0 1 2 3 5 8 13 21 34 55 89 144 233 999
Proc/Byte 0 1 2 3 5 8 13 21 34 55 89 144 233 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
Misaligned by 0 bytes:
lszLenSSE 19 19 19 21 19 22 22 25 30 35 69 89 123 408
FStrLen 4 6 8 7 9 10 15 21 48 63 89 129 199 117
Ratch 4 7 14 16 10 10 16 22 34 70 93 132 201 779
szLength 6 6 6 6 9 12 13 18 27 59 83 126 182 708
szLen 3 7 8 9 16 20 49 63 88 129 197 309 494 2019
Jens_fast 17 17 17 17 18 19 21 26 33 65 92 134 200 858
Misaligned by 1 bytes:
lszLenSSE 19 20 19 19 19 25 25 28 33 38 71 91 125 411
FStrLen 4 6 8 7 9 10 15 21 52 68 92 131 199 120
Ratch 4 7 14 16 10 11 16 22 34 79 93 133 201 784
szLength 12 12 12 12 12 17 17 22 34 64 84 119 174 655
szLen 3 7 8 9 16 20 48 63 88 129 197 309 485 2020
Jens_fast 17 17 17 17 21 22 23 32 37 71 95 140 203 884
Misaligned by 2 bytes:
lszLenSSE 19 19 19 20 19 25 25 28 33 38 71 91 125 409
FStrLen 4 6 8 7 9 10 15 21 52 68 92 131 199 120
Ratch 4 7 14 16 10 11 16 22 34 76 93 133 201 784
szLength 12 12 12 12 12 17 17 22 52 64 84 119 174 655
szLen 3 7 8 9 16 20 47 63 88 129 197 309 485 2019
Jens_fast 17 17 17 17 21 22 23 32 37 71 95 140 203 883
Misaligned by 3 bytes:
lszLenSSE 19 20 19 19 19 25 25 29 33 38 71 92 125 409
FStrLen 4 6 8 7 9 10 17 21 52 68 92 131 199 120
Ratch 4 7 14 16 10 11 16 22 34 76 93 133 201 784
szLength 12 12 12 12 17 17 22 27 52 64 87 119 177 655
szLen 3 7 8 9 16 20 49 63 88 129 197 309 486 2019
Jens_fast 17 17 17 17 21 22 23 32 37 71 95 140 203 885
[attachment deleted by admin]
Thanks. What I meant was you and I both have athlons, and Phil has the P3. We need someone with a later pentium to run the timings.
You new routine runs about 20 cycles faster for the 999 string on my Athlon. Good job!
2.8 gig Prescott.
Test routines for correctness:
lszLenSSE 0 1 2 3 5 8 13 21 34 55 89 144 233 999
FStrLen 0 1 2 3 5 8 13 21 34 55 89 144 233 128
Ratch 0 1 2 3 5 8 13 21 34 55 89 144 233 999
szLength 4294942949 2 3 5 8 13 21 34 55 89 144 233 998
szLen 0 1 2 3 5 8 13 21 34 55 89 144 233 999
Jens_fast 0 1 2 3 5 8 13 21 34 55 89 144 233 999
Proc/Byte 0 1 2 3 5 8 13 21 34 55 89 144 233 996
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
Misaligned by 0 bytes:
lszLenSSE 1642949 11 11 25 15 16 31 28 47 53 142 204 663
FStrLen 2 3 5 15 6 5 9 30 33 65 109 153 234 150
Ratch 1 1 18 8 8 5 1 17 37 74 123 156 218 944
szLength 1 1 142949 12 9 25 24 29 63 107 144 221 841
szLen 42949 3 5 27 11 17 39 69 110 180 216 353 527 2186
Jens_fast42949 142949 11 4 33 26 42 59 73 79 154 210 967
Misaligned by 1 bytes:
lszLenSSE 13 11 0 11 11 43 15 31 38 56 80 178 235 794
FStrLen 3 12 7 3 5 18 18 23 43 64 127 194 264 173
Ratch 42949 1 5 742949 46 25 18 34 73 143 206 309 1156
szLength 942949 10 15 22 16 20 64 31 62 106 149 227 870
szLen 42949 5 842949 11 18 18 81 95 151 217 326 538 2196
Jens_fast 3 04294942949 4 5 37 42 92 140 184 179 300 1233
Misaligned by 2 bytes:
lszLenSSE 14 11 24 11 11 28 17 33 41 37 85 174 261 787
FStrLen 3 3 16 2 5 5 21 112 28 66 138 170 291 208
Ratch 1 1 16 7 7 5 21 23 35 49 149 183 339 1163
szLength 9 10 15 1 10 13 27 52 59 50 106 148 204 891
szLen 11 3 7 6 29 18 27 83 107 171 205 348 529 2280
Jens_fast 3 114294942949 3 5 24 71 50 71 127 208 281 1170
Misaligned by 3 bytes:
lszLenSSE 53 24 13 11 23 49 18 30 27 54 119 192 236 782
FStrLen 3 4 5 242949 4 11 32 35 66 176 192 313 167
Ratch 13 2 5 7 19 7 12 17 16 78 155 194 320 1170
szLength 9 12 22 10 15 25 22 25 57 52 109 153 221 861
szLen 42949 5 7 20 11 18 39 68 106 182 216 348 563 2324
Jens_fast 342949 442949 3 6 41 40 60 109 139 172 264 1176
I optimized my version of STRLEN on a 32-bit Athlon. I refuse to chase a peculiar hardware speed with software . I try to code using rules that apply to most every processor, and ignore the anomalies. You can't optimize everything all the time. Ratch
Codewarp--
I've found a small problem with your latest version. Please check the lengths reported rather then the cycle times. It has something to do with high ascii and/or a string following the test string, not sure which.
[attachment deleted by admin]
Jimg--
No, it had to do will my a-little-to-quick transcription of the latest changes--sorry :red. This one should be fixed now--but there's no telling what else I've broke if I can't type it straight... :wink.
[attachment deleted by admin]
Codewarp-
Still a small problem on my machine:
Test routines for correctness:
lszLenSSE 0 1 2 3 5 8 13 21 34 55 89 144 233 999
Ratch 0 1 2 3 5 8 13 21 34 55 89 144 233 999
szLength 0 0 0 0 4 8 12 20 32 52 88 144 232 996
Jens_fast 0 1 2 3 5 8 13 21 34 55 89 144 233 999
lszLenSSE 0 1 2 3 5 8 13 21 34 55 89 144 233 999
Ratch 0 1 2 3 5 8 13 21 34 55 89 144 233 999
szLength -1 -1 -1 3 3 7 11 19 31 55 87 143 231 999
Jens_fast 0 1 2 3 5 8 13 21 34 55 89 144 233 999
lszLenSSE 0 1 2 3 5 8 13 21 34 55 89 144 233 999
Ratch 0 1 2 3 5 8 13 21 34 55 89 144 233 999
szLength -2 -2 2 2 2 6 10 18 34 54 86 142 230 998
Jens_fast 0 1 2 3 5 8 13 21 34 55 89 144 233 999
lszLenSSE 0 1 2 3 5 8 13 21 34 55 89 144 233 999
Ratch 0 1 2 3 5 8 13 21 34 55 89 144 233 999
szLength -3 1 1 1 5 5 13 21 33 53 89 141 233 997
Jens_fast 0 1 2 3 5 8 13 21 34 55 89 144 233 999
The previous version is working and still seems to be the fastest non-sse, even on Hutch's pentium :wink
Hutch-
Are the number glitchs (the ones printing 4294942949) repeatable on your machine? It doesn't seem to be related to any routine?
Jim,
The repeat number is 42949 and it is not consistent across different runs of the test piece. I downloaded Michaels timing code so I could run the test.
The machine is a 2.8 gig Prescott on an 800 meg FSB Intel board with 2 gig of DDR400 and it runs faultlessly, particularly when making timings. I may be worth getting someone else with a reasonably late pentium to test it as well.
Jimg --
I guess this is what I get for keeping separate versions of szLength( ) code for c++ and masm... This one is supposed to work ::). It has another cycle knocked out of every JNZ FOUND (with yet another align 4), and another cycle evaporated by replacing:
OR EDX, [EAX]
with: MOV ECX, [EAX]
OR EDX, ECX
then hiding the MOV in the shadow of a non-dependent instruction. Once again, if you wouldn't mind inserting the .exe for this new code and try again... :red :red
[attachment deleted by admin]
Perfect now :bg
Here's my results, and a copy of the code with an exe for those wanting to try it without building the exe.
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 233 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
0 byte misalignment
lszLenSSE 25 25 25 25 25 28 28 32 38 47 82 119 166 587
Ratch 8 11 12 15 14 14 20 30 64 77 103 141 219 852
szLength 8 8 9 10 11 15 17 23 34 47 89 134 199 777
Jens_fast 20 20 20 20 21 26 29 36 57 69 99 145 217 925
1 byte misalignment
lszLenSSE 28 28 28 28 29 30 31 33 41 54 92 125 177 617
Ratch 7 10 12 15 18 17 23 32 69 85 108 154 240 952
szLength 13 14 14 16 15 20 19 26 56 67 92 135 201 782
Jens_fast 20 20 21 20 24 28 32 40 62 76 105 153 233 999
2 byte misalignment
lszLenSSE 28 28 28 28 28 30 30 31 42 55 92 124 176 621
Ratch 8 10 11 15 18 16 23 32 69 88 109 155 243 953
szLength 15 13 15 15 15 19 21 29 39 52 92 135 200 783
Jens_fast 19 19 19 21 24 28 32 41 61 77 105 155 235 1002
3 byte misalignment
lszLenSSE 27 27 28 29 28 31 29 35 43 56 91 124 175 626
Ratch 7 11 12 15 18 16 24 32 69 86 110 155 243 953
szLength 13 16 16 15 19 18 24 29 41 52 94 134 202 790
Jens_fast 19 19 19 20 24 28 32 40 61 75 104 153 230 995
[attachment deleted by admin]
This looks a lot better, I just ran the EXE and there are no "funny" numbers.
PIV Prescott 2.8 gig, 80 meg FSB board with 2 gig of DDR400.
Test routines for correctness:
0 byte misalignment
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 233 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 233 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 233 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 233 999
1 byte misalignment
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 233 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 233 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 233 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 233 999
2 byte misalignment
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 233 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 233 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 233 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 233 999
3 byte misalignment
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 233 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 233 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 233 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 233 999
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 233 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
0 byte misalignment
lszLenSSE 31 11 11 23 12 15 27 20 26 49 54 154 193 665
Ratch 2 1 5 -4 10 5 22 23 41 71 118 146 207 900
szLength -10 1 1 2 17 9 15 31 42 66 102 145 237 895
Jens_fast 9 -1 -1 6 4 5 32 40 69 74 79 152 214 994
1 byte misalignment
lszLenSSE 13 25 16 13 23 16 15 29 24 59 80 174 237 779
Ratch 1 16 6 7 7 -6 13 63 48 79 135 196 310 1180
szLength 10 22 9 17 23 16 19 34 40 61 107 160 227 860
Jens_fast 4 0 -1 -1 -7 6 12 53 69 99 144 172 269 1173
2 byte misalignment
lszLenSSE 25 12 13 12 11 15 53 21 24 57 85 168 247 788
Ratch 12 2 6 4 22 6 35 64 42 67 136 204 310 1171
szLength 21 11 19 10 -1 19 18 16 37 71 107 149 226 885
Jens_fast 3 -1 -12 -1 3 7 62 40 69 85 126 193 282 1184
3 byte misalignment
lszLenSSE 27 11 0 12 12 27 19 20 48 43 126 200 281 813
Ratch 2 1 6 -5 11 5 26 22 86 59 136 219 291 1140
szLength -2 17 10 21 15 14 31 31 77 86 132 149 231 837
Jens_fast -10 -1 0 -1 14 5 34 51 94 92 116 182 252 1169
Looking great guys! Hutch, the funny numbers that you saw in the previous run, 42949's, were an attempt to display unsigned numbers in a 5 character field when print ustr$(ebx) is working correctly. The timelen.asm source files should be corrected so they use 'print sstr$(ebx)' instead of the unsigned version that I used incorrectly. The 42949 is actually the first 5 digits of -1 when it is displayed as unsigned. My mistake.
JimG and Codewarp: Glad to see you are moving right along here. I'm sorry if I slowed the flow here trying to understand things that are probably still just a bit beyond my abilities at the moment. Thank you all for your patience and help. I'll just keep re-reading the posts and scratching my head occasionally until it makes a little more sense to me. I'm still working on understanding how to cure the register stalls that Hutch had pointed out in some other code that I'm working on ... And, I just understand things a lot better when I see results like these posted that generally agree with the words and symbols that I'm trying to fit into my mind. Thanks again.
Would it be easy/legal/appropriate to incorporate the IdCPU code into the developing standard benchmarking code being used here on the strlen( ) code? That way, every report says what it is--it would also be cool...
Thanks Phil, for a moment I thought my PIV had developed a maths bug. :bg
I am quite surprised that Jens_mmx version is not found in the test bed (Too bad the graphs that used to be found there were gone).
strlen proc lpString:DWORD
mov eax, 1 ; request CPU feature flags
cpuid ; CPUID instruction
;- Pre-Scan to align the string-start ----
mov ecx, lpString
mov eax, ecx
cmp byte ptr [eax], 0
je done
and ecx, 0FFFFFFF8h
add ecx, 8
sub ecx, eax
cmp ecx, 8
je aligned
@@:
inc eax
cmp byte ptr [eax], 0
je done
dec ecx
jnz @B
aligned:
mov ecx, eax
;-----------------------------------------
test edx, 800000h ; test bit 23 to see if MMX available
jz no_mmx ; jump if no MMX is available
pxor mm0, mm0
@@:
movq mm1, qword ptr [ecx]
movq mm2, qword ptr [ecx + 8]
movq mm3, qword ptr [ecx + 16]
movq mm4, qword ptr [ecx + 24]
movq mm5, qword ptr [ecx + 32]
movq mm6, qword ptr [ecx + 40]
pcmpeqb mm1, mm0
pcmpeqb mm2, mm0
pcmpeqb mm3, mm0
pcmpeqb mm4, mm0
pcmpeqb mm5, mm0
pcmpeqb mm6, mm0
por mm1, mm2
por mm3, mm4
por mm5, mm6
por mm1, mm3
por mm1, mm5
add ecx, 48
packsswb mm1, mm1
movd eax, mm1
test eax, eax
jz @B
sub ecx, 48
emms ; Empty MMX state
no_mmx:
@@:
mov eax, dword ptr [ecx]
add ecx, 4
lea edx, [eax - 01010101h]
xor eax, edx
and eax, 80808080h
and eax, edx
jz @B
bsf edx, eax
sub edx, 4
shr edx, 3
lea eax, [ecx + edx - 4]
done:
sub eax, lpString
ret
strlen endp
Bitrake's strlen for AMD Athlon and small strings could not be found too.
StrLen MACRO lpString:REQ
LOCAL _0,_1
mov ecx,lpString
pxor MM0,MM0
pxor MM1,MM1
mov ebx,16
ALIGN 16
_0: pcmpeqb MM1,[ecx+8]
pcmpeqb MM0,[ecx]
nop
add ecx,ebx
packsswb MM1,MM1
packsswb MM0,MM0
movd edx,MM1
movd eax,MM0
or edx,eax
je _0
bsf eax,eax
jne _1
add ecx,8
bsf eax,edx
_1: sub ecx,lpString
shr eax,2
lea eax,[ecx+eax-16]
ENDM
His footnotes says
"- Instructions packaged/aligned to 8 bytes offer highest decode bandwidth.
- Branch targets aligned to 16 bytes boundaries
- Use when average string is >32 bytes"
Codewarp, I think MichaelW is working on that. :)
Quote from: Codewarp on June 24, 2005, 03:03:21 AM
Would it be easy/legal/appropriate to incorporate the IdCPU code into the developing standard benchmarking code being used here on the strlen( ) code? That way, every report says what it is--it would also be cool...
I am working on it, but I have a problem with obtaining a brand identification string for recent Intel processors that return a brand index of zero. Unlike the AMD processors, where CPUID functions 80000002h-80000004h return a 48-byte processor name string (starting with the K5 Model 1), the Intel processors return a brand string that encodes the rated FSB frequency and the multiplier. The name string is not absolutely necessary, but I would like to provide a nice "friendly" name for all of the recent processors, and I would like to use a method that would not require constant updating. If Intel would just follow in AMD's footsteps for a change :bg
roticv-
Thanks for the new routines. Jens_mmx is clearly faster on large strings (>250 bytes or so). The CPUID instruction is too big a penalty to pay on small strings. I've included it in the test routines. I also tried it without the cpuid (Jens_mmx2, not currently selected for test), and that made it better for strings about 150 or so. Bitrake's routine is just too specialized for the general purpose test being run here as we are testing alignment errors as well as raw speed.
New code, updated per Phil's correction and added Jens mmx-
[attachment deleted by admin]
The penalty for CPUID is ~500 and I don't think it should be included in the strlen routine as donkey's routine does not include it anyway.
When I have time, I would tweak my own routine and see how it compares. Below are the timings for different routines on my computer.
0 byte misalignment
lszLenSSE 35 27 20 36 35 30 37 44 56 72 160 228 305 990
Jens_mmx 546 605 616 606 612 612 614 630 621 625 643 653 694 1031
Ratch 20 14 22 21 25 19 31 41 64 83 151 195 293 1003
szLength 15 22 15 15 10 25 27 28 58 68 136 171 270 908
Jens_mmx2 11 53 53 54 57 61 62 69 78 73 96 108 153 497
1 byte misalignment
lszLenSSE 19 28 26 33 34 30 40 44 57 84 125 254 364 1172
Jens_mmx 552 553 558 557 569 657 646 657 654 666 674 703 745 1077
Ratch 19 14 24 19 27 20 31 37 64 85 180 272 426 1575
szLength 26 17 26 22 25 36 26 40 56 67 139 190 274 923
Jens_mmx2 16 15 28 25 56 79 95 104 121 114 138 169 197 533
2 byte misalignment
lszLenSSE 35 32 27 28 26 30 43 36 55 65 122 259 361 1154
Jens_mmx 552 541 544 544 569 638 635 654 659 656 669 693 719 1066
Ratch 3 14 24 19 11 26 33 37 61 90 185 275 428 1566
szLength 26 18 27 19 12 31 25 41 55 66 148 184 277 914
Jens_mmx2 11 14 29 32 34 95 90 94 118 107 132 155 192 534
3 byte misalignment
lszLenSSE 36 32 27 34 34 29 38 37 51 82 123 272 372 1178
Jens_mmx 544 548 558 546 575 629 642 649 654 646 675 686 723 1050
Ratch 11 22 17 27 19 34 34 40 75 106 181 302 429 1538
szLength 11 27 18 28 24 31 37 41 97 115 155 189 275 920
Jens_mmx2 10 23 21 31 32 76 93 91 114 102 127 149 182 523
Jens' mmx version sure thrash the rest.
I tried wiht my own routine, and it gives weird results.
0 byte misalignment
lszLenSSE 19 35 27 35 28 36 38 46 56 73 147 220 301 1001
roticv2 9 13 15 17 17 43 106 131 187 239 65 545 866 3454
Ratch 19 14 24 18 26 19 31 36 62 80 148 195 288 994
szLength 15 23 16 22 17 27 24 36 61 85 140 186 264 913
Jens_mmx2 18 54 51 53 59 54 63 70 93 71 114 104 181 489
1 byte misalignment
lszLenSSE 27 35 34 27 34 40 38 37 44 78 128 259 347 1171
roticv2 17 13 15 10 27 35 99 136 184 244 168 538 863 3456
Ratch 11 20 17 12 20 27 31 33 71 98 184 270 420 1558
szLength 18 26 25 25 20 30 34 33 58 67 137 190 267 924
Jens_mmx2 11 15 13 33 57 101 86 103 121 109 153 160 202 534
2 byte misalignment
lszLenSSE 22 34 28 27 35 38 50 49 57 70 122 258 344 1155
roticv2 9 19 15 28 22 38 101 132 129 151 355 187 235 635
Ratch 11 14 12 20 12 30 33 33 63 72 184 265 439 1570
szLength 14 25 23 24 19 29 25 29 56 75 138 198 268 904
Jens_mmx2 19 15 29 44 33 79 91 95 102 89 131 154 189 541
3 byte misalignment
lszLenSSE 35 27 32 35 35 32 23 45 63 86 119 265 372 1173
roticv2 10 13 20 17 30 42 86 113 189 231 137 549 882 3459
Ratch 11 6 18 13 24 19 31 32 69 107 187 301 433 1531
szLength 27 23 12 28 22 31 38 34 102 115 149 190 276 922
Jens_mmx2 20 14 30 24 39 91 82 99 111 95 135 149 179 531
My code is
roticv2 proc lpstring:dword
;int 3
mov eax, [esp+4]
cmp byte ptr[eax], 0
mov ecx, eax
jz done
and ecx, 0FFFFFFE0h
add ecx, 16
sub ecx, eax
cmp ecx, 16
jz aligned
@@:
add eax, 1 ;Simple byte scanner for alignment
cmp byte ptr[eax],0
jz done
sub ecx, 1
jnz @B
aligned:
pxor xmm1, xmm1
align 16
@@:
movdqa xmm0, [eax]
pcmpeqb xmm0, xmm1
add eax, 16
pmovmskb ecx, xmm0
test ecx, ecx
jz @B
bsf ecx, ecx
lea eax, [eax+ecx-16]
done:
sub eax, [esp+4]
retn 4
roticv2 endp
Nice, Roticv, I was wondering about an mmx implementation that didn't require sse... The cpuid may be a moot point, because you can save it once in memory, and read it many. It seems to me that some standard startup code should be grabbing/saving this value for subsequent access anywhere in the code--in 1 cycle! The last time I checked, the cpuid doesn't exactly change during program execution ::). On the other hand, what cpus don't have mmx--P1, PPro--does anyone out there use these any more?
But maybe we still have to use cpuid, to prevent mmx use in future machines that don't have it. If so, standard startup code really must be doing this. Such code can also abort execution if run on machines outside the supported set. Low level routines such as this must not be spending any more time on cpuid issues than actually required.
Roticv, I could not find any mention of the cpu type you ran on--what is it? :naughty:
Here is the run for my Sempron 2800+, cpuid is not nearly so bad, but still intolerable:
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
0 byte misalignment
lszLenSSE 26 25 25 25 25 28 28 32 38 47 82 119 167 591
Jens_mmx 82 112 111 111 113 118 121 129 157 122 169 139 201 361
Ratch 7 11 12 15 14 14 20 30 62 78 101 142 230 856
szLength 8 8 8 10 12 15 17 24 34 47 91 131 204 782
Jens_fast 20 20 20 20 21 26 29 36 57 70 99 146 220 929
1 byte misalignment
lszLenSSE 28 28 28 28 28 30 31 34 41 54 93 126 178 620
Jens_mmx 82 83 87 90 98 146 148 156 173 156 183 212 230 374
Ratch 7 11 12 15 18 15 23 32 67 86 109 155 255 954
szLength 13 14 14 16 15 19 20 26 40 51 92 135 208 787
Jens_fast 20 20 20 20 24 28 32 40 62 76 105 158 238 1000
2 byte misalignment
lszLenSSE 28 28 28 28 28 30 30 32 41 55 92 123 176 623
Jens_mmx 82 83 87 90 98 142 144 156 169 148 163 195 212 362
Ratch 9 11 12 15 18 15 23 32 69 86 109 155 255 953
szLength 14 14 16 16 15 20 20 29 40 51 92 136 208 785
Jens_fast 20 20 20 20 24 28 32 40 61 76 105 154 238 1000
3 byte misalignment
lszLenSSE 28 28 28 28 28 31 30 35 43 56 90 123 174 626
Jens_mmx 82 83 87 90 98 128 136 143 153 131 177 188 205 349
Ratch 7 11 12 15 18 15 23 33 70 87 109 155 254 953
szLength 14 16 16 15 19 20 24 29 41 51 94 136 209 787
Jens_fast 20 20 20 20 24 28 32 40 61 76 105 154 239 1001
Ok, time to show my ignorance.
When trying roticv's routine, masm chokes on the movdqa line-
Assembling: F:\WinAsm\Progs\FastStringLength\timelen4\timelen.asm
F:\WinAsm\Progs\FastStringLength\timelen4\timelen.asm(337) : error A2008: syntax error : xmm
What's the solution for this?
The Jens-mmx code is not yet practical, because it can read up to 40 bytes beyond the end of memory. The best way to resolve this is to align to 32 bytes instead of 8 bytes, then process 32 bytes at a time, instead of 48. It makes it run faster too, probably because it lays down for the caches. Because memory blocks are multiples of 4k, they always end on a 32-byte boundary. Since 32 byte alignment can take up to 31 iterations, first align to 4 bytes, then align to 32 bytes, 4 at a time. This improves short and long string speed.
I am a using a Celeron(R) 2.40GHz that comes with SSE3.
Anyway, you have to use a newer verison of ml in order to compile movdqa. I'm using Microsoft (R) Macro Assembler Version 7.00.9466.
[attachment deleted by admin]
roticv: Here are the results of running your latest strlen.exe on a 996 MHz P3. There is a problem with 'correctness' with 2-byte misalignment. I also cannot assemble the source because I only have access to the earlier ML version.
Test routines for correctness:
0 byte misalignment
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticv2 0 1 2 3 5 8 13 22 39 55 98 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
1 byte misalignment
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticv2 0 1 2 3 5 8 13 22 39 55 98 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
2 byte misalignment
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticv2 0 1 2 3 5 8 13 22 48 64 89 144 239 1008
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
3 byte misalignment
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticv2 0 1 2 3 5 8 13 31 39 55 98 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
0 byte misalignment
lszLenSSE 16 16 16 16 17 19 19 22 29 48 63 86 116 403
roticv2 4 10 12 16 12 44 58 87 136 185 49 453 736 3019
Ratch 18 25 32 40 29 25 35 58 78 92 106 144 245 871
szLength 19 19 19 19 23 25 30 37 52 79 116 174 263 1025
Jens_mmx2 7 34 34 34 39 43 45 67 85 49 98 66 135 320
1 byte misalignment
lszLenSSE 16 16 16 16 16 19 19 22 32 60 92 117 184 648
roticv2 4 10 13 16 22 44 58 67 136 185 100 453 738 3017
Ratch 18 25 32 39 29 25 35 58 86 110 124 191 326 1150
szLength 24 25 25 28 28 30 34 41 72 87 119 177 274 1033
Jens_mmx2 7 12 17 18 23 64 73 78 114 77 126 144 169 349
2 byte misalignment
lszLenSSE 16 16 16 16 16 26 26 40 37 54 81 113 173 627
roticv2 4 10 13 16 22 44 58 87 73 77 287 111 135 328
Ratch 18 25 32 39 29 25 57 79 86 96 135 182 303 1140
szLength 25 25 28 28 28 31 34 45 72 87 119 177 274 1033
Jens_mmx2 7 12 17 18 23 66 70 81 112 75 124 142 167 347
3 byte misalignment
lszLenSSE 16 16 16 16 16 19 19 22 32 60 92 117 184 648
roticv2 4 10 13 16 22 44 58 66 138 185 96 453 736 3017
Ratch 18 25 32 39 29 25 35 58 86 110 124 191 326 1149
szLength 25 28 28 28 30 31 39 45 72 89 127 177 270 1032
Jens_mmx2 7 12 17 18 23 51 57 65 96 59 110 127 145 330
Press enter to exit...
It is odd that roticv2 only fails the correctness test with 2-byte misalignment when the string 999 contains extended ASCII characters.
That's extremely weird.
I get
Test routines for correctness:
0 byte misalignment
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticv2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
1 byte misalignment
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticv2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
2 byte misalignment
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticv2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
3 byte misalignment
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticv2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
When I run it on my machine. Will take a look into it. It probably has to do with the alignment and stuff like that.
To anyone having the same problem I did with movdqa-
movdqa requires masm 6.15 or better. Version 6.14 distributed with Hutch's masm32 distribution won't do. 6.15 is available from his site.
There, now the information will show up on a search :wink
Victor-
I'm getting the same results as Phil for the 2 byte misalign on the 999 string, and the same large cycle count for the other three. Also notice there is an error at the 3-byte misalignment for string 22. Also try a string length of 39.
After more testing, it gets even weirder.
Using these test strings:
SIZES TEXTEQU <7,8,9,10,11,12,13,14,15,16,17,LastString>
I get:
Test routines for correctness:
0 byte misalignment
roticv2 7 17 18 19 20 21 22 23 48 16 17 999
1 byte misalignment
roticv2 7 8 9 10 11 12 13 14 15 16 17 1008
2 byte misalignment
roticv2 7 8 9 10 11 12 13 14 15 16 17 1008
3 byte misalignment
roticv2 7 8 9 10 11 12 13 14 15 16 17 1008
but using these strings:
SIZES TEXTEQU <5,8,9,10,11,12,13,14,15,16,17,LastString>
I get:
Test routines for correctness:
0 byte misalignment
roticv2 5 8 9 10 11 12 13 14 15 16 17 999
1 byte misalignment
roticv2 5 8 9 10 11 12 13 14 15 16 17 1008
2 byte misalignment
roticv2 5 8 9 10 11 12 13 14 15 16 17 1008
3 byte misalignment
roticv2 5 8 9 10 11 12 13 14 15 16 17 1008
I just can't see how this can be happening???
temporarily deleted
As far as I know, mmx is supported in all Pentiums except for P1 and PPro. Does anyone out there know of any examples to the contrary? If not, I am inclined to drop the cpuid test, but if there are newer 32-bit cpus that don't have mmx, then it must be included with strlen( ) versions intended for use in applications.
Ah it is not weird now. I figured out what is wrong.
The instruction i was using was a SSE2 instruction, but your processor interpreted it as a SSE instruction because it does not recongnise it (The only difference is the 66h prefix to the instruction).
Therefore it did not work as per expected on both your processors.
Well, I'm glad you solved that mystery! I had expected an illegal instruction exception when I ran the program but it just ran without complaint! Does this mean that it would be impractical to write an SSE2/SSE3 emulator? I suppose so, since the processor didn't seem to notice that anything was wrong in its instruction sequence. I'm thinking back to ancient times when you could run programs that used FPU instructions even if you didn't happen to have one. Now it looks like it's all up to the programmer to check capabilities if they are using SSE2/SSE3 to use a less advanced routine to ensure compatibility with older machines. I would have prefered seeing an illegal instruction exception!
Btw even Ollydbg 1.10 did not recongnise the SSE2 instruction and decoded it wrongly. It is sad that such things do happen. Do remind me to only use up to SSE instructions next time :P
Well, it might be good or bad depending on how you at it. It is not nice for someone to run a program and get unknown opcode error just because his/her processor does not support it. Most probably he/she will not know what happened.
I think the programmer have to be proactive in ensuring that his target users have the instruction set before running it. I think I am a lousy programmer :toothy Haven't been coding in asm for quite some time. Coding mainly in C, solving programming qn.
Roticv--
While we have you in this vulnerable, contrite state, let me suggest another little change to your code, to make it faster on really short strings...
roticv2 proc lpstring:dword
;int 3
mov eax, [esp+4] ; removed your 1st test for zero, it's coming up soon most of the time anyway
test eax, 15 ; removed unnecessary code
jz aligned
@@:
cmp byte ptr [eax], 0 ; inc eax afterward so it's ready now
jz done
add eax, 1
test eax, 15 ; simplified...
jnz @B
aligned:
pxor xmm1, xmm1
align 16
@@:
movdqa xmm0, [eax]
pcmpeqb xmm0, xmm1
add eax, 16
pmovmskb ecx, xmm0
test ecx, ecx
jz @B
bsf ecx, ecx ; nice use of bsr !
lea eax, [eax+ecx-16]
done:
sub eax, [esp+4]
retn 4
roticv2 endp
Codewarp-
Unless this was a joke and I just didn't get it, the code you just posted doesn't give the correct answers. I changed it's name to roticv3 to avoid conflict with 2 that I'm still looking at. The results:
Test routines for correctness:
0 byte misalignment
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticv3 0 1 2 3 5 17 22 22 39 55 98 144 1255 999
1 byte misalignment
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticv3 0 1 2 3 5 8 13 22 48 64 98 144 239 1008
2 byte misalignment
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticv3 0 1 2 3 5 8 13 31 48 64 98 144 239 1008
3 byte misalignment
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticv3 0 1 2 3 5 8 13 31 48 64 98 144 239 1008
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
0 byte misalignment
szLength 9 8 9 9 12 16 17 24 34 48 89 132 204 783
roticv3 26 26 27 26 26 28 28 29 30 35 47 74 370 301
1 byte misalignment
szLength 13 14 14 15 15 19 21 26 40 52 91 134 207 785
roticv3 7 11 16 21 32 46 84 78 87 88 95 112 153 365
2 byte misalignment
szLength 14 14 16 16 15 19 20 29 40 52 91 136 207 784
roticv3 6 11 16 20 31 46 84 78 82 85 95 111 153 364
3 byte misalignment
szLength 14 16 16 16 19 20 24 30 42 51 94 135 209 790
roticv3 7 11 16 21 31 46 72 75 78 90 93 106 150 360
Press enter to exit...
Jimg--
No joke, but neither did I attempt to fix the sse2 issue (movdqa instruction). I just tightened up on the initial byte scan. Otherwise I don't see any errors in the code. I would never knowingly post bad code, but I might unknowingly do it... Are you using an sse2 capable machine.
Duh... Now I get it. Sorry. Even though Intel has 90% of market to AMD's 10% or so, and it's interesting to test this stuff out here in the laboratory, I wouldn't think sse2 would be a good choice for a general purpose rountine just yet.
I modified Jens_mmx a little, and am getting some incredible times on the longer strings. I've looked and can't find out how it's cheating on the rest of the routines. Is there something going on here I don't see?
Test routines for correctness:
0 byte misalignment
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
1 byte misalignment
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
2 byte misalignment
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
3 byte misalignment
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
0 byte misalignment
szLength 8 8 10 10 12 15 17 24 34 63 89 131 203 778
Ratch 8 11 11 13 14 14 20 30 64 77 100 143 229 853
Jens_fast 20 20 20 20 21 26 29 36 57 69 99 145 219 923
roticvSSE 4 28 28 29 28 32 32 35 39 52 80 111 166 563
lszLenSSE 25 25 25 25 25 28 28 32 39 47 83 122 167 587
Jens_mmx2 7 33 31 30 37 43 47 55 79 46 92 60 123 286
1 byte misalignment
szLength 13 14 14 16 15 19 20 26 56 67 91 134 206 784
Ratch 19 11 11 15 18 17 23 31 69 85 111 156 255 955
Jens_fast 20 20 23 20 24 28 33 41 61 76 105 158 239 1003
roticvSSE 3 7 10 12 18 58 58 51 94 71 94 125 175 583
lszLenSSE 28 28 26 28 28 40 29 34 40 56 92 126 179 625
Jens_mmx2 6 8 12 17 25 73 59 65 85 62 96 122 143 294
2 byte misalignment
szLength 14 14 16 16 15 20 20 31 40 51 92 136 210 786
Ratch 8 2 12 15 18 17 23 32 69 85 110 154 253 953
Jens_fast 20 20 20 20 24 28 32 40 63 75 105 154 237 999
roticvSSE 6 7 12 12 19 57 52 51 59 67 90 120 194 582
lszLenSSE 28 28 28 28 28 30 29 32 41 55 91 123 176 625
Jens_mmx2 8 9 12 16 26 56 54 68 86 61 95 120 140 292
3 byte misalignment
szLength 14 16 16 15 19 20 24 29 40 50 93 135 208 786
Ratch 7 10 24 15 18 16 26 32 68 87 109 157 253 951
Jens_fast 20 20 20 20 24 28 32 40 62 76 104 156 238 998
roticvSSE 4 7 10 12 18 45 46 49 54 67 89 124 171 582
lszLenSSE 28 28 28 28 28 31 30 35 43 56 89 123 173 635
Jens_mmx2 6 8 12 13 26 53 62 71 86 63 113 125 143 294
Press enter to exit...
Of course, Jens_mmx is no good for a general purpose routine as it needs 64 bytes past the possible end of the test string, but if you are writing the program and can assure enough headroom, it's fast. For general purpose, I'll stick with szLength.
[attachment deleted by admin]
Nice, I get similar results with AMD XP 2500+
Quote from: Jimg on June 27, 2005, 03:52:23 PM
I modified Jens_mmx a little, and am getting some incredible times on the longer strings. I've looked and can't find out how it's cheating on the rest of the routines. Is there something going on here I don't see?
Of course, Jens_mmx is no good for a general purpose routine as it needs 64 bytes past the possible end of the test string, but if you are writing the program and can assure enough headroom, it's fast. For general purpose, I'll stick with szLength.
There are a couple of reasons why it is possible to achieve such good speed.
1) Alignment. It ensures that strings are aligned to 8bytes before getting into the main loop that scans using mmx registers. (Maybe we can make use of Codewarp's improvements to speed it up)
2) Unrolling of loops. It speeds up the routine as it unroll all the data and fit it into the L1 code cache.
3) Usuage of lea is not found in the main loop. Instead it is only found in the second loop to determine where is the null terminator found. Maybe this could be improved by using pmovmskb.
4) Grouped read/compares and ors. (Rule no 2 of the advanced part of optimisation in mark larson's tips)
PS: I don't think MichaelW's timing marcos are as stable as I want it to be. Oh well.
Quote from: roticv on June 27, 2005, 04:19:05 PM
Quote from: Jimg on June 27, 2005, 03:52:23 PM
I modified Jens_mmx a little, and am getting some incredible times on the longer strings. I've looked and can't find out how it's cheating on the rest of the routines. Is there something going on here I don't see?
Of course, Jens_mmx is no good for a general purpose routine as it needs 64 bytes past the possible end of the test string, but if you are writing the program and can assure enough headroom, it's fast. For general purpose, I'll stick with szLength.
There are a couple of reasons why it is possible to achieve such good speed.
1) Alignment. It ensures that strings are aligned to 8bytes before getting into the main loop that scans using mmx registers. (Maybe we can make use of Codewarp's improvements to speed it up)
2) Unrolling of loops. It speeds up the routine as it unroll all the data and fit it into the L1 code cache.
3) Usuage of lea is not found in the main loop. Instead it is only found in the second loop to determine where is the null terminator found. Maybe this could be improved by using pmovmskb.
4) Grouped read/compares and ors. (Rule no 2 of the advanced part of optimisation in mark larson's tips)
PS: I don't think MichaelW's timing marcos are as stable as I want it to be. Oh well.
All those reasons are ok, but the big one is this--it's damn hard and awkward to find a single byte at any alignment in any dword using the normal cpu instructions. But mmx is designed to operate on bigger chunks like and it gets right down to it. You can unroll, align, lea or not lea, and reorder instructions all you want, I did, but the tripling in speed is a different animal. It gets better with each extension (mmx -->sse -->sse2 -->sse3...), but most of the improvement can be implemented with just mmx. PMOVMSKB is a very useful instruction here, but is SSE, not MMX. SSE requires a P3 or later, but PII's are still around.
Also, the scan overshoot is not a problem for a nondestructive operation like strlen( ), as long as it doesn't go off the end of the 4k page. That problem is easily remedied by processing 32-byte chunks, with 32-byte alignment--goodbye page faults...
Now, it seems to me that mmx is so standard that it could be used for "everyday" use without checking every time. However, it's host library start-up code should still abort if no mmx support exists. Can we consider the P1 and PPro dead, or are there other non-mmx pentiums out there?
Here's my 2 cents.
It is not right to take things for granted. It is better to first check whether cpuid exist by checking the EFLAG, then call CPUID. After that, set the flag for MMX/SSE/SSE2/SSE3 and then from then on just compare with the flag. We only need to figure out whether the processor supports certain extenstion once, then we can proceed to using the correct instruction set.
There's a reason why MMX/SSE/SSE2/SSE3 instruction sets are invented :toothy
Let's declare Jen's MMX variant of strlen the winner.
Quote from: roticv on June 28, 2005, 02:04:04 PM
It is not right to take things for granted. It is better to first check whether cpuid exist by checking the EFLAG, then call CPUID. After that, set the flag for MMX/SSE/SSE2/SSE3 and then from then on just compare with the flag. We only need to figure out whether the processor supports certain extenstion once, then we can proceed to using the correct instruction set.
I tend to agree, however, anything that destroys performance on string lengths of a few bytes is dead on arrival. In "real" applications, the bulk of the clock cycles spent in strlen( ) is usually on the short strings--not on 1000 bytes+ strings. Therefore, no cpuid and no eflags is ok with me. Would you be doing all this real-time conditional coding in the memchr( ) and in the memmove( ), etc...? No, this has to be performed at application start-up time, not inside these low level routines. That way, the decision overhead is reduced to a single memory test instruction.
Tested on a P4 2.66 GHz
Test routines for correctness:
0 byte misalignment
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
1 byte misalignment
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
2 byte misalignment
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
3 byte misalignment
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
0 byte misalignment
szLength 12 3 2 3 17 12 15 17 48 55 92 138 225 777
Ratch -1 1 35 8 9 8 15 21 50 58 97 134 231 881
Jens_fast 12 0 0 0 16 6 36 41 68 74 92 130 200 930
roticvSSE 8 20 17 15 26 19 20 23 40 68 57 132 184 623
lszLenSSE 53 12 11 11 22 16 15 19 25 51 53 129 183 609
Jens_mmx2 8 33 23 23 38 33 33 39 62 39 72 65 132 416
1 byte misalignment
szLength 10 20 10 13 9 26 20 21 40 60 123 139 232 783
Ratch 1 12 6 7 7 16 12 21 42 69 125 185 356 1132
Jens_fast 3 10 1 -1 4 16 35 40 49 109 163 162 243 1087
roticvSSE -2 12 7 7 13 53 40 45 59 91 100 149 206 664
lszLenSSE 11 22 12 12 11 26 15 19 25 113 120 158 214 729
Jens_mmx2 -3 12 3 9 17 70 73 70 85 78 102 122 140 445
2 byte misalignment
szLength 9 10 24 10 11 14 28 28 38 49 97 137 210 838
Ratch 1 1 5 8 7 5 39 20 54 58 124 185 296 1107
Jens_fast 3 0 10 2 3 5 79 43 57 60 115 168 278 1100
roticvSSE -3 1 14 10 13 33 35 41 47 52 65 106 200 625
lszLenSSE 13 11 22 13 11 15 60 21 24 37 84 159 224 726
Jens_mmx2 -3 1 14 11 17 94 62 69 113 62 102 114 136 430
3 byte misalignment
szLength 11 16 9 20 16 11 21 38 66 82 122 137 209 788
Ratch 1 1 5 8 7 5 15 30 48 59 124 211 332 1073
Jens_fast 3 -1 -1 10 4 5 36 42 86 92 114 180 262 1080
roticvSSE -2 1 5 11 9 59 37 70 79 85 103 145 201 631
lszLenSSE 15 14 11 22 13 15 15 30 39 114 73 184 241 746
Jens_mmx2 -2 1 3 20 18 48 85 72 99 60 95 118 164 424
Tested on 996 MHz P3 taken from timelen5.zip above. Only the timings are included here. I visually verified the correctness section and excluded it from the results.
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
0 byte misalignment
szLength 19 19 19 19 23 25 30 37 55 79 116 175 263 1023
Ratch 18 25 32 39 29 25 35 58 78 92 105 144 245 872
Jens_fast 12 12 12 12 15 18 19 38 48 63 88 130 199 871
roticvSSE 4 18 18 18 18 20 20 23 28 48 59 78 104 341
lszLenSSE 16 16 16 16 16 19 19 22 29 48 63 86 116 402
Jens_mmx2 4 31 30 31 35 38 44 63 73 45 85 60 121 287
1 byte misalignment
szLength 24 26 25 28 28 30 34 41 72 87 121 177 274 1033
Ratch 18 25 32 39 29 25 35 58 86 110 124 191 326 1149
Jens_fast 12 12 12 12 15 18 19 38 55 77 99 158 242 978
roticvSSE 4 10 13 16 22 48 49 50 59 78 93 108 143 387
lszLenSSE 16 16 16 16 16 19 19 22 32 60 92 117 185 650
Jens_mmx2 4 9 12 15 21 62 67 70 104 71 117 133 152 332
2 byte misalignment
szLength 25 25 28 28 28 31 35 45 72 87 119 177 274 1033
Ratch 18 25 32 39 29 25 47 81 85 97 135 183 303 1141
Jens_fast 12 12 12 12 15 18 30 62 57 66 107 148 231 1008
roticvSSE 4 10 13 16 22 46 46 50 57 75 87 106 133 374
lszLenSSE 16 16 16 15 17 26 26 40 36 54 82 114 171 630
Jens_mmx2 4 9 12 15 21 59 66 74 103 72 116 139 149 324
3 byte misalignment
szLength 25 28 28 28 30 31 39 45 88 89 128 178 272 1037
Ratch 18 25 32 39 29 25 35 58 86 110 125 192 327 1153
Jens_fast 12 12 12 12 15 18 19 42 55 77 99 158 243 982
roticvSSE 4 10 13 16 22 32 35 37 43 63 83 95 123 366
lszLenSSE 16 16 16 16 16 19 19 22 32 60 93 118 184 651
Jens_mmx2 4 9 12 15 21 45 56 59 88 55 101 115 136 305
Sorry to wake this dead thread but I just found another interesting strlen routine by r22.
align 16
strLenAlign16SSE:
mov ecx,[esp+4]
movdqa xmm2,dqword[filled]
lea eax,[ecx+16]
movdqa xmm0,dqword[ecx]
.lp:
movdqa xmm1,xmm0
pxor xmm0,xmm2 ;xor -1
paddb xmm1,xmm2 ;sub 1
movdqa xmm3,[eax] ;used for unroll
pand xmm0,xmm1
pmovmskb edx,xmm0
add eax,16
test dx,-1 ;1111 1111 1111 1111b
jnz .unrol
movdqa xmm1,xmm3
pxor xmm3,xmm2 ;xor -1
paddb xmm1,xmm2 ;sub 1
pand xmm3,xmm1
movdqa xmm0,[eax] ;back to first roll
pmovmskb edx,xmm3
add eax,16
test dx,-1 ;1111 1111 1111 1111b
jz .lp
.unrol:
add ecx,32
sub eax,ecx
xor ecx,ecx
sub ecx,edx
and edx,ecx
CVTSI2SD xmm0,edx
PEXTRW edx,xmm0,3
shr dx,4
add dx,0fc01h
; bsf edx,edx replaced by crazy SSE version
add eax,edx
ret 4
align 16
filled dq 0FFFFFFFFFFFFFFFFh,0FFFFFFFFFFFFFFFFh
Victor,
"Sorry to wake this dead thread but I just found another interesting strlen routine by r22."
I'm wondering what is so interesting... :bg
A lot of code in the main loop and slow exchange of the bsf... :'(
It is not a big deal to create something faster with 128-bit instructions
Here is the proof tested on my P4 3.6 GHz:
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
0 byte misalignment
szLength 16 15 14 15 20 22 24 32 56 79 144 245 250 861
Ratch 12 17 20 22 19 21 25 35 59 80 135 180 268 1046
Jens_fast 18 20 20 19 21 26 35 73 84 99 127 168 243 1097
roticvSSE 9 32 31 32 31 36 36 42 55 68 111 235 325 919
lszLenSSE 30 28 28 30 30 32 32 37 51 65 101 193 275 978
Jens_mmx2 9 48 45 45 50 51 54 62 79 64 98 90 152 460
slenLingo 14 14 15 15 15 15 15 22 27 36 48 72 96 405
1 byte misalignment
szLength 20 20 18 25 23 26 29 35 51 63 210 178 262 869
Ratch 13 15 18 20 19 19 25 37 59 91 171 257 420 1666
Jens_fast 19 19 19 18 20 24 29 72 85 131 155 207 322 1450
roticvSSE 7 13 16 19 24 57 57 62 79 94 122 169 319 967
lszLenSSE 29 29 29 29 28 32 32 42 74 113 116 230 329 1095
Jens_mmx2 9 13 17 21 30 76 82 88 113 97 134 147 179 564
slenLingo 15 15 15 15 15 15 15 22 27 37 49 71 101 407
2 byte misalignment
szLength 21 18 22 21 21 27 26 37 51 63 131 176 255 965
Ratch 13 17 19 22 21 22 36 34 59 79 166 246 422 1491
Jens_fast 19 23 18 20 22 23 88 101 119 101 156 211 330 1355
roticvSSE 9 13 16 18 24 56 55 63 76 92 169 260 299 939
lszLenSSE 29 29 29 28 28 33 45 38 56 69 115 221 330 1195
Jens_mmx2 9 14 17 21 32 77 77 88 118 89 120 151 183 502
slenLingo 15 15 15 15 14 15 15 22 28 37 49 71 102 407
3 byte misalignment
szLength 20 24 23 23 26 25 32 38 92 105 135 176 253 880
Ratch 13 18 20 22 21 20 24 34 72 95 169 318 410 1434
Jens_fast 20 21 19 20 22 24 27 74 119 135 159 237 331 1474
roticvSSE 8 14 16 18 24 51 56 60 72 86 114 160 296 1090
lszLenSSE 29 30 28 29 28 32 31 38 59 76 119 247 343 1124
Jens_mmx2 8 11 17 21 30 66 81 141 192 85 119 136 170 484
slenLingo 14 14 14 15 15 14 15 22 28 36 48 72 103 406
Press enter to exit...
Regards,
Lingo
[attachment deleted by admin]
Lingo,
Very cool solution to the strlen( ) implementation. :clap: Your method illustrates a different strategy for search alignment than any of the other algorithms seen in this thread. It avoids alignment complexity by scanning the first 16 bytes without regard for alignment, and only aligning for subsequent reads if necessary. I do have a few comments about it:
(1) The Prefetches appear to be unnecessary and removing them reduces clock count by a few points
(on Athlon64). Prefetches are good for multiple data streams, and/or enhancing random access. However,
this is a single sequential stream, and normal cpu prefetch for that is as good as it gets.
(2) Unless I am mistaken, this code requires SSE2, because of the movdqa instruction, making it less
widely applicable.
(3) Strings < 16 bytes can be sped up a few cycles, with simplification and avoiding jmps.
(4) An additional 12% cycle reduction resulted when I doubled-up the loop to process 32-bytes at a time,
but I left it out because of its potential 31-byte overreach, past the end of the string, and off the end of the last page.
(5) Your method and mine below can both access memory past the end of the string, and off the end of the last page,
by as much as 15 bytes (causing a possible page-fault). But it is a way too cool :8) algorithm to leave on the shelf...
My adaptation of your method is as follows:
mov eax, [esp + 4] ; eax = base address to start search
movdqu xmm1, [eax] ; load first 16 bytes, aligned or not
pxor xmm0, xmm0 ; xmm0 = 0's
and eax, -16 ; align eax down to base paragraph
pcmpeqb xmm1, xmm0 ; check 16 bytes for zeros
pmovmskb edx, xmm1
test edx, edx ; test edx for zero
jz again ; branch ahead if no zero bytes found
bsf eax, edx ; return the byte position as the length
ret
align 8
again: movdqa xmm1, [eax + 16]
add eax, 16 ; eax = address of 16-byte compare
pcmpeqb xmm1, xmm0 ; search the 16-bytes
pmovmskb edx, xmm1
test edx, edx
jz again
sub eax, [esp + 4] ; subtract original base address
bsf edx, edx ; get position of 1st zero byte
add eax, edx ; add to base address
ret
Codewarp, :bg
"2) Unless I am mistaken, this code requires SSE2, because of the movdqa instruction, making it less
widely applicable."
I have a new version with movups and movaps indeed movdqu and movdqa
Pls, reload the zip file and test it again
"(3) Strings < 16 bytes can be sped up a few cycles, with simplification and avoiding jmps."
I agree
"5) Your method and mine below can both access memory past the end of the string, and off the end of the last page,
by as much as 15 bytes (causing a possible page-fault). But it is a way too cool algorithm to leave on the shelf..."
I disagree
What is "the end of the string"?
We search the end of the string, hence the phrase "the end of the string" is undefined ?
Before the usage of the StrLen we need a buffer with ENOUGHT memory
for our string, hence if someone use my algo he needs to allocate ENOUGHT+ 32 bytes of memory
Example: The constant _MAX_PATH ->(Maximum length of full path) is 260 bytes long
If we search length of the string of the current path with file name
we need to allocate 260+32 bytes for buffer
or ENOUGHT+32 memory will be 292 bytes
"; align eax down to base paragraph"
Thanks for comments :bg
Regards,
Lingo
Lingo,
The bare SSE support is much appreciated. :thumbu
Sadly, the overreach is real. Suppose that you have a valid 2 byte string (3rd byte is zero), and these are the last three bytes at end of a 4k page, the last page in the block. This algorithm will initially read 16 bytes from the misaligned address, trespassing 13 bytes into the non-existant next page. The best way to handle this problem is to use only 16/aligned reads, then mask out the initial unwanted comparison bytes. But that is hard to do as fast as the method you have here.
Codewarp,
Why not just put a ALIGN 16 after the buffer? Then you would be guaranteed to be within a 16 byte boundary. Ratch
Putting an Align 16 anywhere is irrelevant. You are handed a pointer to 3 bytes, you read 16 bytes, you don't get to choose its alignment. This will lead to overrun. There are only three choices: live with it, backup to read 16 bytes from the beginning, or use GPRs instead of SSE.
Codewarp, :bg
"The best way to handle this problem is to use only 16/aligned reads,
then mask out the initial unwanted comparison bytes.
But that is hard to do as fast as the method you have here.
For me it is not a big deal (Pls, reload the zip file and see Lingo2)
but it is slower because we have additional code :(
Regards,
Lingo
Codewarp,
QuoteYou are handed a pointer to 3 bytes, you read 16 bytes, you don't get to choose its alignment.
Now you setting conditions that I did not know about.
Quote
...backup to read 16 bytes from the beginning
Do you mean back it up to a 16 byte boundary? It becomes even more complicated if there is a '0' lurking within the backed up data area. Ratch
I am not setting any conditions--I am merely reporting a perfectly normal case where the algorithm in question fails to meet desired objectives, i.e. where it reads past the end of its data, and off the end of the 4k page. By doing so, I have shown the assertion to be irrefutable.
Ratch, are you really suggesting that strlen( ) work only for 16-byte aligned strings? :eek Strlen( ) is a routine defined by the c-runtime library--it has no preconditions about alignment. Its users expect it to return a correct string length from any byte address, as long as its terminating zero byte follows within valid memory. We, as programmers, are bound to implement that, even if we don't like it. Now, if you want to have two versions, strlen( ) and strlen16( ), where everyone knows the limitations, that's fine. Or you can switch internally to a different method, upon detecting misaligned strings. However expecting strings to all be 16-byte aligned is unreasonable.
By backing up to the previous 16-byte boundary, comparing, then masking off the unwanted part of the comparison, you can completely avoid unaligned reads AND avoid the overreach. But that is more work than lingo's algorithm, and consequently slower, but perhaps more correct.
Codewarp,
Quote
Ratch, are you really suggesting that strlen( ) work only for 16-byte aligned strings?
Nope, I am saying that backing up and reading might pose difficulities with respect to time and logic. I think it's better to go forward to the next 16-byte boundary instead. See below. Ratch
http://www.masmforum.com/simple/index.php?topic=2442.0
Ratch,
Your prediction of "logic and time" consequences doesn't hold up. Here is a fully correct version that cannot ever overreach a string and its 4k page. It required exactly three additional GPR instructions, plus it reads only aligned blocks, which can more than pay for the 3 instructions, for a net zero cost. This is a practical sse version of Lingo's method for any application, and one that outperforms all GPR implementations:
lensse: mov eax, [esp + 4] ; eax = base address to start search
mov ecx, eax
and eax, not 15 ; pull down to aligned address
and ecx, 15 ; ecx = pre-read bytes to skip
movaps xmm1, [eax] ; load first 16 bytes, aligned
pxor xmm0, xmm0 ; xmm0 = 0's
pcmpeqb xmm1, xmm0 ; check 16 bytes for zeros
pmovmskb edx, xmm1 ; edx holds a 1-bit for each zero byte (in the low 16 bits)
shr edx, cl ; discard the pre-read
test edx, edx ; test edx for zero
jz again ; branch ahead if no zero bytes found
bsf eax, edx ; return the bit position as the length
ret
align 8
again: movaps xmm1, [eax + 16]
add eax, 16 ; eax = address of 16-byte compare
pcmpeqb xmm1, xmm0 ; search the 16-bytes
pmovmskb edx, xmm1
test edx, edx
jz again
bsf edx, edx ; get position of 1st zero byte
sub eax, [esp + 4] ; subtract original base address
add eax, edx ; add for base address
ret
Hi Codewarp
I was working on a similar approach like yours, but you came first. Since I have some problems compiling the movdqa instruction, I reduced the the routine to xmm instructions and shuffled a little bit some instructions to obtain a better performance. As expected, the performance is similar to Donkeys routine, but the real advantage comes out with misaligned strings.
StrLength proc pString:dword
.xmm
mov eax, [esp + 4] ; eax = base address to start search
mov ecx, eax
and eax, not 7 ; pull down to aligned address
and ecx, 7 ; ecx = pre-read bytes to skip
movq mm1, [eax] ; load first 8 bytes, aligned
pxor mm0, mm0 ; mm0 = 0's
pcmpeqb mm1, mm0 ; check 8 bytes for zeros
pmovmskb edx, mm1 ; edx holds a 1-bit for each zero byte (in the low 8 bits)
shr edx, cl ; discard the pre-read
test edx, edx ; test edx for zero
jz more ; branch ahead if no zero bytes found
emms
bsf eax, edx ; return the bit position as the length
ret 4
more:
add eax, 8
again:
movq mm1, [eax]
pcmpeqb mm1, mm0 ; search the 8-bytes
add eax, 8 ; eax = address of 8-byte compare
pmovmskb edx, mm1
test edx, edx
jz again
bsf edx, edx ; get position of 1st zero byte
emms
sub eax, [esp + 4] ; subtract original base address
lea eax, [eax + edx - 8] ; add for base address
ret 4
StrLength endp
Regards,
Biterider
Thank you, Biterider, your mmx adaptation will now replace the mmx version in my codebase (with some changes ::)). I was particularly pleased with how effortlessly I could dispose of the pre-read. You are quite correct about the misalignment performance, important now since the greater majority of strings we call this on are not 16-byte aligned.
Codewarp,
Quote
This is a practical sse version of Lingo's method for any application, and one that outperforms all GPR implementations:
It does not perform anything on my machine (AMD Athlon running Windows ME). Evidently my CPU chokes on 128-bit registers referenced by MOVAPS XMM1,[EAX]. Not all machines have the latest MMX hardware and OS support. That's one advantage of writing GPR code; it works on the 386 and up. Anyway, congratulations on finding a solution to the alignment problem. Ratch
Hello, I was bored so I disasm'd Borlands version and tweaked it a bit. Could someone bench this against some of the other fast ones posted up to see where it stands? I ran it through Olly with a whole lot of different strings and it always returned the proper length.
strlen proto :dword
.code
;##########################################################################
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
strlen proc xstr:dword
mov eax, dword ptr [esp+4]
d1: mov edx, dword ptr [eax]
mov ecx, edx
add eax, 4
sub edx, 16843009
and edx, 2155905152
jz d1
not ecx
and edx, ecx
jz d1
test dl, dl
jnz d2
test dh, dh
jnz d3
test edx, 16711680
jnz d4
jmp d5
d2: dec eax
d3: dec eax
d4: dec eax
d5: dec eax
sub eax, dword ptr [esp+4]
add esp, 8
jmp dword ptr [esp-8]
strlen endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
;##########################################################################
That's the slowest one yet!
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
0 byte misalignment
szLength 8 8 9 9 12 15 17 24 48 63 89 131 202 779
Ratch 9 11 12 15 14 14 20 31 64 78 100 143 231 855
Jens_fast 20 20 19 20 21 26 29 36 58 69 99 145 219 923
roticvSSE 4 28 29 28 28 32 32 35 39 53 79 112 160 576
lszLenSSE 25 25 25 25 25 28 28 32 38 47 84 116 166 591
Jens_mmx2 6 31 31 32 37 43 48 55 78 45 93 60 124 288
BiteRider 27 26 26 26 26 28 29 32 37 46 82 112 161 569
borland 9 10 11 9 14 18 22 29 58 74 109 162 249 1121
1 byte misalignment
szLength 13 13 14 16 15 19 20 26 56 67 91 134 206 781
Ratch 8 11 12 15 18 16 23 32 69 86 110 156 254 959
Jens_fast 20 20 21 20 24 28 32 41 64 75 105 154 243 1001
roticvSSE 3 7 10 12 18 53 48 51 60 73 94 125 176 589
lszLenSSE 32 28 28 28 31 30 30 37 45 55 92 126 178 623
Jens_mmx2 6 10 12 16 26 73 59 65 86 61 97 123 142 296
BiteRider 25 26 26 26 27 29 29 31 48 57 84 113 169 576
borland 8 9 11 10 15 20 25 36 71 90 135 205 323 1198
2 byte misalignment
szLength 14 14 16 16 15 20 20 31 41 51 92 136 208 787
Ratch 8 11 12 15 18 17 24 32 69 86 110 156 254 950
Jens_fast 20 20 20 19 24 28 32 40 62 75 104 153 237 997
roticvSSE 4 7 8 12 18 45 46 51 59 67 93 125 172 590
lszLenSSE 28 28 28 30 28 30 31 35 41 54 94 126 178 621
Jens_mmx2 7 8 12 17 26 59 54 58 84 59 96 120 141 295
BiteRider 26 26 26 26 24 29 29 33 48 57 84 112 164 575
borland 9 10 11 9 15 17 25 36 72 90 135 206 325 1203
3 byte misalignment
szLength 15 16 16 15 19 20 25 30 41 51 94 137 209 784
Ratch 9 11 12 15 18 16 23 32 70 85 108 156 253 947
Jens_fast 20 20 20 20 24 28 32 40 65 76 104 153 237 996
roticvSSE 6 7 10 35 18 43 46 49 53 66 89 119 172 583
lszLenSSE 28 28 29 28 28 30 31 32 41 57 91 124 176 624
Jens_mmx2 7 9 12 16 26 54 62 70 86 63 115 125 144 290
BiteRider 26 26 26 26 29 29 31 33 47 56 82 111 164 577
borland 8 10 12 9 15 19 25 36 69 91 134 207 325 1197
These are only the one that give the correct answers, the one's with movedqa don't work on my machine.
kunt0r,
Quote
I ran it through Olly with a whole lot of different strings and it always returned the proper length.
It should. It uses the same algo as most of the other GPR routines. For instance, its decimal SUB constant 16843009 decimal converts to 010101010H, and its AND constant 2155905152 decimal converts to 080808080H. It is sensitive to string alignment; 7575 ticks aligned, vs. 8856 ticks with 3 byte misalignment for a 10003 byte long string. It has no provision to prevent over reading its memory. Its code to search the last word for a zero byte is clunky, but that does not affect its speed too much, because to only executes it once. It uses two instructions instead of RET DWORD, and it leaves its return address vulnerable on the stack for a possible overwrite. It speed is comparable to my GPR routine, except my routine does not slow up for string misalignment on long strings. Not that it matters much, but my code is also shorter. Ratch
ah I didn't know about "ret 4" doing the same thing as those two lines of code, that sped it up slightly.
kunt0r,
Quote
ah I didn't know about "ret 4" doing the same thing as those two lines of code, that sped it up slightly.
Can't be much of a difference because it only gets executed once. The search loop is where the subprogram it spends most of its time. As I mentioned before, it leaves its return address vulnerable to a possible overwrite. One would expect something better from Borland. Ratch
Jimg,
Did you use my latest STRLEN? What was the switch was set for, the 8-bit or 7-bit seach? Ratch
http://www.masmforum.com/simple/index.php?topic=2442.0
(edit-Ratch, Borland didn't use that kind of stack stuff, I did it myself to shave a few cycles. Borlands actually has more code which checks for misaligned data with a test al, 3 line which leads to more code ect..I basically used Borlands as a basis and tried to make it faster then they did)
I updated my borland one and got these results:
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
borland 9 11 10 10 11 12 15 21 49 61 87 129 200 774
szLength 8 8 9 9 12 15 17 24 48 63 89 131 202 779
Ratch 9 11 12 15 14 14 20 31 64 78 100 143 231 855
Jens_fast 20 20 19 20 21 26 29 36 58 69 99 145 219 923
roticvSSE 4 28 29 28 28 32 32 35 39 53 79 112 160 576
lszLenSSE 25 25 25 25 25 28 28 32 38 47 84 116 166 591
Jens_mmx2 6 31 31 32 37 43 48 55 78 45 93 60 124 288
BiteRider 27 26 26 26 26 28 29 32 37 46 82 112 161 569
.686p
.model flat, stdcall
option casemap :none
include \masm32\include\windows.inc
include \masm32\include\kernel32.inc
includelib \masm32\lib\kernel32.lib
include \masm32\include\user32.inc
includelib \masm32\lib\user32.lib
include \masm32\kinc\strlen.inc
include \masm32\kinc\timer.inc
.data
align 4
teststr db 999 dup("a"), 0
msgstr db "Cycles: %d", 0
.data?
time dd ?
buff db 64 dup(?)
.code
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
counter_begin 1000000, REALTIME_PRIORITY_CLASS
invoke strlen, addr teststr
counter_end
invoke wsprintf, addr buff, addr msgstr, eax
invoke MessageBox, 0, addr buff, 0, 0
invoke ExitProcess, 0
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
strlen proto :dword
.code
;##########################################################################
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
strlen proc xstr:dword
mov eax, dword ptr [esp+4]
d1: mov edx, dword ptr [eax]
mov ecx, edx
add eax, 4
sub edx, 16843009
and edx, 2155905152
jz d1
not ecx
and edx, ecx
jz d1
test dl, dl
jnz d2
test dh, dh
jnz d3
test edx, 16711680
jnz d4
jmp d5
d2: dec eax
d3: dec eax
d4: dec eax
d5: dec eax
sub eax, dword ptr [esp+4]
ret 4
strlen endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
;##########################################################################
If anyone has the time to test this out, it would be appreciated, I needed a DWORD type strlen algo that had auto aligning code at its beginning so I added a front end to Agner Fog's strlen algo ad it appears to be working OK at the moment. Some rough benchmarking with a mixed set of samples from a few bytes to 16k shows this hybrid to be about 2.5 times faster than a classic byte scanner.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
align 4
slen2 proc item:DWORD
push edi
push esi
mov eax, [esp+12]
mov ecx, eax ; copy EAX to ECX
add ecx, 3 ; align up by 4
and ecx, -4
sub ecx, eax ; calculate any misalignment in ecx
mov esi, ecx ; store ECX in ESI
jz proceed
sub eax, 1
@@:
add eax, 1
cmp BYTE PTR [eax], 0 ; scan for terminator for
je quit ; up to the 1st 3 bytes
sub ecx, 1
jns @B
jmp proceed
quit:
sub eax, [esp+12] ; calculate length if terminator
jmp outa_here ; is found in 1st 3 bytes
; ----------------
proceed: ; proceed with the rest
lea edx, [eax+3] ; pointer+3 used in the end
align 4
@@:
mov edi, [eax] ; read first 4 bytes
add eax, 4 ; increment pointer
lea ecx, [edi-01010101h] ; subtract 1 from each byte
not edi ; invert all bytes
and ecx, edi ; and these two
and ecx, 80808080h
jz @B ; no zero bytes, continue loop
test ecx, 00008080h ; test first two bytes
jnz @F
shr ecx, 16 ; not in the first 2 bytes
add eax, 2
@@:
shl cl, 1 ; use carry flag to avoid branch
sbb eax, edx ; compute length
add eax, esi
outa_here:
pop esi
pop edi
ret 4
slen2 endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
Codewarp wrote: :P
" Re: Improved STRLEN
« Reply #1 on: August 08, 2005, 10:40:48 pm »
Ratch,
Isn't this a rehash of what we already covered this in the szLen thread, ad nausium? Also back in that thread, I suggested a super optimization that you are almost doing in your routine above.
The idea is this: search 7-bit ascii, since it's faster than 8-bit. But when you find a "zero", check it for bit7=1: 1=>resume the search for 8-bit ascii, 0=>you are done. In other words use 7-bit search as far as you can take it, then ride the rest of the way using 8-bit search as needed. In most text, the 8-bit part will never be needed, but when required, it covers 8-bit as well--the best of both worlds..."
I agree with bolded text and you can test the result (see timelen1.asm)
I improved my algos LingoSSE2 and LingoMMX too.. (see timelen.asm)
Here is the results on my P4 3.6 GHz Prescott:
A. Timelen.asm -> test
Test routines for correctness:
0 byte misalignment
Borland 0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Hutch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Biterider 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoMMX 0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoSSE2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
1 byte misalignment
Borland 0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Hutch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Biterider 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoMMX 0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoSSE2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
2 byte misalignment
Borland 0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Hutch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Biterider 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoMMX 0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoSSE2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
3 byte misalignment
Borland 0 1 2 3 5 8 13 22 39 55 89 144 239 999
szLength 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Ratch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Hutch 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_fast 0 1 2 3 5 8 13 22 39 55 89 144 239 999
lszLenSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
roticvSSE 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Biterider 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Jens_mmx2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoMMX 0 1 2 3 5 8 13 22 39 55 89 144 239 999
LingoSSE2 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
===========================================================
0 byte misalignment
Borland 19 20 20 20 22 25 32 42 70 122 167 232 344 1988
szLength 15 15 16 16 19 22 24 31 52 72 128 174 248 866
Ratch 12 15 18 20 19 21 24 35 59 82 140 182 284 993
Hutch 21 21 23 23 24 30 33 40 56 81 149 187 279 989
Jens_fast 19 19 19 19 20 23 43 80 88 101 126 168 252 981
lszLenSSE 28 29 28 28 28 32 32 37 52 64 101 200 289 925
roticvSSE 8 13 19 18 25 36 36 41 59 73 109 159 275 924
Biterider 14 15 15 15 14 25 23 37 49 61 102 219 272 922
Jens_mmx2 8 44 45 45 50 53 57 67 78 63 99 90 155 466
LingoMMX 14 14 14 14 14 24 24 36 47 66 100 117 143 402
LingoSSE2 14 14 14 14 14 14 14 25 39 46 69 98 111 301
1 byte misalignment
Borland 19 20 20 20 22 25 31 42 69 165 189 281 437 2535
szLength 19 19 19 23 23 27 29 35 50 63 131 176 256 882
Ratch 12 15 18 20 19 20 25 34 59 92 170 247 397 1465
Hutch 20 21 24 23 38 40 42 45 60 83 154 187 298 1002
Jens_fast 20 20 19 20 23 32 27 72 85 131 162 215 321 1362
lszLenSSE 28 28 28 28 28 32 32 36 51 76 118 226 331 1084
roticvSSE 10 14 16 18 24 58 60 64 83 97 122 168 308 978
Biterider 14 14 14 14 14 24 24 37 55 72 97 165 283 932
Jens_mmx2 8 12 17 21 31 77 84 95 119 100 131 153 183 490
LingoMMX 14 14 14 14 14 24 24 36 51 67 101 121 145 403
LingoSSE2 14 14 14 14 14 14 14 25 39 49 69 98 117 304
2 byte misalignment
Borland 19 20 20 20 22 26 43 42 69 121 192 284 455 2418
szLength 19 18 21 21 21 24 27 37 51 63 131 176 253 872
Ratch 12 15 19 21 20 20 37 36 59 81 166 247 401 1472
Hutch 19 21 20 22 33 36 41 51 60 80 159 212 321 1006
Jens_fast 19 19 19 19 20 27 62 77 85 99 156 209 328 1366
lszLenSSE 29 29 28 28 28 32 44 37 52 64 124 253 343 1091
roticvSSE 8 12 16 18 25 55 54 64 76 90 117 164 297 956
Biterider 14 14 15 14 15 24 24 44 56 73 97 160 284 936
Jens_mmx2 8 14 17 20 33 76 77 92 106 89 123 144 177 497
LingoMMX 13 15 15 15 14 24 24 38 56 69 97 118 145 403
LingoSSE2 14 14 14 14 14 14 14 25 39 50 74 101 117 301
3 byte misalignment
Borland 20 23 23 23 23 26 31 45 83 150 196 328 443 2381
szLength 20 21 21 22 24 24 31 37 92 104 137 176 254 872
Ratch 13 15 17 20 20 20 24 35 72 95 167 282 405 1438
Hutch 19 19 22 28 32 33 40 65 70 94 158 264 367 998
Jens_fast 19 19 19 19 20 27 59 86 117 128 150 198 301 1377
lszLenSSE 30 28 29 28 28 32 34 40 59 76 111 245 354 1128
roticvSSE 8 12 16 19 24 51 55 60 72 89 115 161 293 962
Biterider 14 14 15 15 24 24 37 44 55 73 97 156 283 935
Jens_mmx2 8 12 17 20 30 66 74 82 101 82 118 142 180 490
LingoMMX 13 14 14 14 14 24 25 41 57 71 97 117 146 402
LingoSSE2 14 14 14 14 14 14 14 25 39 46 69 98 117 301
Press enter to exit...
B. Timelen1.asm -> test
Test routines for correctness:
0 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
1 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
2 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
3 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
========================================================
0 byte misalignment
RatchN 12 18 22 23 21 20 26 36 65 80 135 178 258 1392
Lingo32 10 14 17 19 18 17 20 28 39 55 111 141 198 1047
1 byte misalignment
RatchN 9 15 21 32 38 38 44 57 75 101 146 192 270 1513
Lingo32 9 14 18 22 27 27 31 32 46 53 113 146 203 1054
2 byte misalignment
RatchN 9 15 25 29 32 32 37 48 70 93 144 189 268 1493
Lingo32 10 15 18 20 26 26 27 33 46 54 113 147 206 1052
3 byte misalignment
RatchN 9 22 22 23 23 31 30 42 64 87 139 184 257 1411
Lingo32 9 15 17 19 17 28 28 34 47 55 112 148 200 1053
Press enter to exit...
Regards,
Lingo
[attachment deleted by admin]
Lingo,
Ok, let's talk about string length subroutines. I am going to limit my remarks to GPR routines only. Well written MMX/SSE code is going to beat the pants off any GPR implementation. If MMX/SSE instructions did not, they would not exist.
The most important part of the subroutine is the core loop where most of the execution time is spent, especially on a long string. The code before and after this core loop is either executed once, or only on special conditions such as aligning on a dword, locating the zero byte within the word and testing for a legitimate zero byte. Let's define the core loops.
The following loop I will call the Agner Fog loop. It is widely attributed to Agner, but I have my doubts. I keep seeing it elsewhere, and I suspect it is old and was well known to computer science academics before Agner. It is a 7 line loop. http://www.cl.cam.ac.uk/~am/progtricks.html
.REPEAT ;searching string ....
MOV EDX,[EAX] ;next 4 byte gulp (DWORD)
ADD EAX,DWORD ;EAX=character pointer
LEA ECX,[EDX-01010101H] ;propagate if byte is zero
NOT EDX ;set up test pattern
AND EDX,ECX ;leftmost bit of zero byte should now be set
AND EDX,080808080H ;sieve out zero bytes
.UNTIL !ZERO? ;check the next DWORD
endif
This I will call the Lingo loop. It is a 7 line loop.
.REPEAT
LEA EDX,[ECX+0FEFEFEFFH]
NOT ECX
AND ECX,080808080H
ADD EAX,4
AND EDX,ECX
MOV ECX,[EAX]
.UNTIL !ZERO? ;check the next DWORD
Below is the Ratch8 loop. It is a 6 line loop, and used for 8-bit extended ASCII.
.REPEAT
MOV EDX,[EAX] ;next 4 byte gulp (DWORD)
AND EDX,07F7F7F7FH ;mask out bit 8
ADD EAX,DWORD ;EAX=character pointer
SUB EDX,01010101H ;make those zero bytes shine
AND EDX,ECX ;sieve out zero bytes
.UNTIL !ZERO? ;check the next DWORD
And finally is the Ratch7 loop. It is a 5 line loop used for 7-bit ASCII.
.REPEAT
MOV EDX,[EAX] ;next 4 byte gulp (DWORD)
ADD EAX,DWORD ;EAX=character pointer
SUB EDX,01010101H ;make those zero bytes shine
AND EDX,ECX ;sieve out zero bytes
.UNTIL !ZERO? ;check the next DWORD
Theoretically, the Ratch7 loop will execute the fastest, and according to my timings, it does. If 8-bit is needed, my timings show the Ratch8, Lingo, and Agner loops in a dead heat. I do not know why I do not get faster speeds on Ratch8 since it only has 6 lines in the loop vs. 7 for Lingo and Agner. It appears that timing is a tricky thing and I do not pretend to understand it. I executed your timelen1 routine on my 1 ghz AMD Athlon and as you can see, the results are different than yours were. By the way, you did not test my 7-bit version, and you transcribed the MOV ECX 80808080H in my 8-bit version to the wrong spot in timelen1. Also you used an old version of my STRLEN in timelen.
Test routines for correctness:
0 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
1 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
2 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
3 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
0 byte misalignment
RatchN 10 11 14 17 14 15 19 30 61 73 95 136 220 1029
Lingo32 8 8 9 10 12 13 15 22 53 60 80 120 179 1066
1 byte misalignment
RatchN 4 7 12 18 26 26 33 42 62 76 108 152 227 1050
Lingo32 9 13 17 17 18 20 23 28 58 66 91 136 199 1061
2 byte misalignment
RatchN 4 8 14 18 26 25 32 31 63 75 109 148 222 1048
Lingo32 9 12 14 14 16 18 24 28 57 66 85 120 184 1062
3 byte misalignment
RatchN 5 12 15 18 14 24 20 30 63 76 96 150 219 1028
Lingo32 8 12 14 14 14 17 19 24 53 65 86 124 186 1055
Press enter to exit...
Quote
Ratch,
Isn't this a rehash of what we already covered this in the szLen thread, ad nausium? Also back in that thread, I suggested a super optimization that you are almost doing in your routine above.
The idea is this: search 7-bit ascii, since it's faster than 8-bit. But when you find a "zero", check it for bit7=1: 1=>resume the search for 8-bit ascii, 0=>you are done. In other words use 7-bit search as far as you can take it, then ride the rest of the way using 8-bit search as needed. In most text, the 8-bit part will never be needed, but when required, it covers 8-bit as well--the best of both worlds..."
I agree with bolded text and you can test the result (see timelen1.asm)
If it was discussed before, I missed it. There are presently 10 pages to this thread. That's a lot of material to plow through. I guess my contribution to that idea is a 6 line implementation, if that is any achievement. Ratch
Ratch, :lol
"The following loop I will call the Agner Fog loop. It is widely attributed to Agner, but I have my doubts. I keep seeing it
elsewhere, and I suspect it is old and was well known to computer science academics before Agner. It is a 7 line loop"
Who cares about the first "author"?
We just use it
If we talk about "doubts" just for example
what about your "new" strlen algo and the similar very old
algo of the Paul Hsieh here:
http://www.azillionmonkeys.com/qed/asmexample.html
Let's compare them:
A. "Update!
While discussing sprite data copying (see next example) I realized that there is a significant improvement for 32-bit x86's that have
slow branching (P-IIs and Athlon.) "
; by Paul Hsieh
lea ecx,[ebx-1]
l1: inc ecx
test ecx,3
jnz l3
l2: mov edx,[ecx] ; U
mov eax,07F7F7F7Fh ; V
and eax,edx ; U
add ecx,4 ; V
add eax,07F7F7F7Fh ; U
or eax,edx ; U
and eax,080808080h ; U
cmp eax,080808080h ; U
je l2 ; V +1brt
sub ecx,4
l3: cmp byte ptr [ecx],0
jne l1
sub ecx,ebx
B. final version of the improved STRLEN by Ratch:
http://www.masmforum.com/simple/index.php?topic=2442.0
"OK, here is my final version of STRLEN, unless someone finds a bug. It can now detect the obscure 8-bit byte 080H. It does
this by checking the byte for 080H at the end of the subroutine, and returning to the beginning of the subroutine if that value is
detected. If a lot of 080H bytes are present in the string, a performance penalty will be incurred. Ratch"
004097D1 8B 44 24 04 mov eax,dword ptr [esp+4]
004097D5 B9 80 80 80 80 mov ecx,80808080h
Labe_0:
004097DA A8 03 test al,3
004097DC 74 08 je 004097E6
004097DE F6 00 FF test byte ptr [eax],0FFh
004097E1 74 26 je 00409809
Labe_Begin:
004097E3 40 inc eax
004097E4 EB F4 jmp 004097DA
;Labe_1:
;mov ecx, 80808080h -> From 1st version
Labe_2:
004097E6 8B 10 mov edx,dword ptr [eax]
004097E8 81E27F7F7F7F and edx,7F7F7F7Fh
004097EE 83 C0 04 add eax,4
004097F1 81EA01010101 sub edx,1010101h
004097F7 23 D1 and edx,ecx
004097F9 74 EB je 004097E6 ; Labe_2
004097FB 83 E8 05 sub eax,5
Labe_3:
004097FE 40 inc eax
004097FF C1 CA 08 ror edx,8
00409802 73 FA jae 004097FE ; Labe_3
00409804 F6 00 FF test byte ptr [eax],0FFh
00409807 78 DA js 004097E3 ; Labe_Begin
Labe_4:
00409809 2B 44 24 04 sub eax,dword ptr [esp+4]
0040980D C2 04 00 ret 4
"Theoretically, the Ratch7 loop will execute the fastest,
and according to my timings, it does"
Ratch7 is the buliaNaza's algo:
and some time ago I improved it:
http://board.win32asmcommunity.net/index.php?topic=8330.msg77056#msg77056
So "my 7-bit algo" is improved "buliaNaza's algo" and it is faster then Ratch7(buliaNaza)
Pls,try to change inc eax with add eax,1 in your algo too...
"...and according to my timings, it does"
Pls, attach your test files (like me) :lol
"By the way, you did not test my 7-bit version..."
Because I CAN'T...
It is the most important point in "my algo"
and it is the main reason for me to create the timelen1.asm file
It is the Codewarp's point of view too and I created timelen1.asm file
just as an answer to him rather then to "offend" your algo:
"In other words use 7-bit search as far as you can take it,
then ride the rest of the way using 8-bit search as needed.
In most text, the 8-bit part will never be needed, but when required,
it covers 8-bit as well--the best of both worlds..." by Codewarp
"My algo" starts the job with 7-bit code search part and if it failed
"automatically" switch to 8-bit code search part and tiil to end...
"Your algo" works with 7-bit code search OR with 8-bit code
search but can't switch "automatically" if we have "mixed" string with
7 AND and 8-bit simbols in it
...and you transcribed the MOV ECX 80808080H
in my 8-bit version to the wrong spot in timelen1 "
A. your 1st variant
STRLEN: ;it all begins here
MOV EAX,[ESP+DWORD] ;address of string
.WHILE TRUE ;check DWORD alignment
TEST AL,DWORD-1 ;is DWORD aligned
.BREAK .IF ZERO? ;yes, DWORD aligned
TEST BYTE PTR [EAX],0FFH ;not aligned, check for zero byte
JZ @F ;jmp if end of string
INC EAX ;prepare to check next byte
.ENDW ;around the horn
MOV ECX,080808080H ;sieve mask
.REPEAT
.......
B. your 2nd variant
STRLEN: ;it all begins here
MOV EAX,[ESP+DWORD] ;address of string
MOV ECX,80808080h
......
Ok, the technical error is mine but it is not so
important here because it isn't in the main loop...
It is an obvious example how the macros "hide"
the pure code..
"Also you used an old version of my STRLEN in timelen."
I downloaded timelen and added
Biterider, Hutch, LingoMMX and LingoSSE2 algos ONLY!
I didn't touch your algo (with macros) there
"I guess my contribution to that idea is a 6 line implementation..."
I agree with you that it is your contribution and I used it
as an idea in my new timelen1.asm file... :U
I preferred Codewarp's point of view about 5&7 line implementation
rather then 6 line implementation and just tried to proof it...
Right now I prefer new couple 5&6 line implementation (see my new timelen1.asm)
rather then 5&7 line implementation :lol
In conclusion please:
- feel free to edit the test files and algos in your way
and post them for us...(like me) :lol
- use the pure code rather then macros (if you can)
- answer the question who will uses my or your GPR algos with
5&7-5&6 or 6 line implementations if we have similar faster MMX/SSE/SSE2 algos :lol
- try to optimize my new lingo32 algo (if you can) (see my new timelen1.asm) :lol
Here are new results:
Test routines for correctness:
0 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
1 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
2 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
3 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ====
0 byte misalignment
RatchN 17 20 20 22 20 22 28 45 61 84 149 186 276 1546
Lingo32 12 15 16 20 17 21 21 31 43 59 128 148 226 1197
1 byte misalignment
RatchN 21 16 23 36 39 40 66 61 86 109 157 216 301 1650
Lingo32 9 15 18 23 24 28 27 39 49 67 128 179 234 1204
2 byte misalignment
RatchN 10 15 28 29 41 33 46 55 86 96 168 203 284 1676
Lingo32 12 15 20 22 26 29 32 32 51 55 125 174 235 1187
3 byte misalignment
RatchN 9 21 32 25 23 32 36 47 79 87 149 196 291 1550
Lingo32 10 17 18 19 18 28 26 31 43 54 125 157 225 1215
Press enter to exit...
Regards,
Lingo
[attachment deleted by admin]
Lingo,
Quote
If we talk about "doubts" just for example
what about your "new" strlen algo and the similar very old
algo of the Paul Hsieh here:
http://www.azillionmonkeys.com/qed/asmexample.html
The code you present from Paul Hsieh's site is about sprites (whatever they are). It is a spaghetti code of two or more internal loops that have no resemblance to my search loop.
Quote
Ratch7 is the buliaNaza's algo:
and some time ago I improved it:
http://board.win32asmcommunity.net/index.php?topic=8330.msg77056#msg77056
So "my 7-bit algo" is improved "buliaNaza's algo" and it is faster then Ratch7(buliaNaza)
Pls,try to change inc eax with add eax,1 in your algo too...
I believe your code referred to by the link was written as a 8-bit algo, because it returns to the loop if a zero is not found. And it uses a LEA instruction instead of a SUB like my 7-bit algo does, so it is not quite the same. They are both 5 line core loops, so they should show equal times. By the way, any byte over 081H will kick your code out of the core loop and slow it up greatly.
Quote
"By the way, you did not test my 7-bit version..."
Because I CAN'T...
It is the most important point in "my algo"
and it is the main reason for me to create the timelen1.asm file
It is the Codewarp's point of view too and I created timelen1.asm file
just as an answer to him rather then to "offend" your algo:
If you could test my 8-bit version, I don't see why you cannot test my 7-bit version also. But no matter.
Quote
"In other words use 7-bit search as far as you can take it,
then ride the rest of the way using 8-bit search as needed.
In most text, the 8-bit part will never be needed, but when required,
it covers 8-bit as well--the best of both worlds..." by Codewarp
"My algo" starts the job with 7-bit code search part and if it failed
"automatically" switch to 8-bit code search part and tiil to end...
"In other words use 7-bit search as far as you can take it,
then ride the rest of the way using 8-bit search as needed.
In most text, the 8-bit part will never be needed, but when required,
it covers 8-bit as well--the best of both worlds..." by Codewarp
"Your algo" works with 7-bit code search OR with 8-bit code
search but can't switch "automatically" if we have "mixed" string with
7 AND and 8-bit simbols in it
If you are referring to your code in timelen1, that appears to be a 8-bit search code period. My 7-bit code expects all characters to be 7-bit ASCII. If they are not, an error in counting will occur.
Quote
"Your algo" works with 7-bit code search OR with 8-bit code
search but can't switch "automatically" if we have "mixed" string with
7 AND and 8-bit simbols in it
That is absolutely wrong. Eight bit code by definition can be from value 0 to 0FFH. My program only goes out of the loop and returns when the value is 080H, which should be rare. It can evaluate all the 8-bit values in any order.
Quote
Ok, the technical error is mine but it is not so
important here because it isn't in the main loop...
It is an obvious example how the macros "hide"
the pure code..
It becomes important only if there are a lot of 080H bytes in the string.
Quote
In conclusion please:
- feel free to edit the test files and algos in your way
and post them for us...(like me)
- use the pure code rather then macros (if you can)
- answer the question who will uses my or your GPR algos with
5&7-5&6 or 6 line implementations if we have similar faster MMX/SSE/SSE2 algos
- try to optimize my new lingo32 algo (if you can) (see my new timelen1.asm)
Below are the results of the most recent timelin1.ece run on my machine. I don't know why your algo shows a little better time than mine, because they both use a 6 line loop. The difference is not as great as your machine, however. As I said previously, there is something about timing that is mysterious. You refer to my using macros. If you mean .REPEAT, .WHILE, etc., those are not macros. They are built in directives of MASM, but if they confuse you, I will translate them to jumps instead. The GPR implementations of STRLENs are only good for old CPUs that do not have MMX/SSE instructions, or if MMX is not available for some reason. Correct me if I am wrong, but doesn't MMX use the same registers as the FPU? Ratch
Test routines for correctness:
0 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
1 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
2 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
3 byte misalignment
RatchN 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Lingo32 0 1 2 3 5 8 13 22 39 55 89 144 239 999
Proc/Byte 0 1 2 3 5 8 13 22 39 55 89 144 239 999
========= ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
0 byte misalignment
RatchN 10 11 13 18 15 16 18 29 61 81 95 136 221 1000
Lingo32 7 7 10 13 12 12 15 22 48 56 77 118 175 883
1 byte misalignment
RatchN 5 10 15 17 23 24 29 39 60 72 106 145 216 1025
Lingo32 9 12 17 17 19 19 23 30 57 68 86 124 187 896
2 byte misalignment
RatchN 5 10 14 17 24 23 30 31 60 74 105 147 218 1021
Lingo32 9 13 13 13 19 18 22 23 54 66 93 123 187 919
3 byte misalignment
RatchN 5 12 14 17 15 24 21 29 62 74 96 148 220 999
Lingo32 8 11 12 13 13 16 19 23 53 65 86 121 185 900
Press enter to exit...
Guys,
Let me ask you this question, I know Lingo is developing on a very late PIV, what box are you using Ratch ? I ask the question because you bot appear not to agree on the benchmarking results and it just may be AMD/Intel differences.
Ratch,
To be honest I can't understand what is your problem
and what you expect from me...
Hutch,
" it just may be AMD/Intel differences.
f0dder was so kind to test the same files
on his box with Socket 939 AMD64 3500+,
TWINX1024-3200C2 DDR-400 (2x512MB),
MSI K8N-NEO4-Platinum nForce4 chipset.
http://board.win32asmcommunity.net/index.php?topic=21565.0
Regards,
Lingo
Lingo,
Quote
Ratch,
To be honest I can't understand what is your problem
and what you expect from me...
Why do you preceive there to be a problem between us? You first made a comment about my reply to Codewarp, which I answered. Then you answered back and so on. I do not expect anything from you.
Hutch,
Quote
Guys,
Let me ask you this question, I know Lingo is developing on a very late PIV, what box are you using Ratch ? I ask the question because you bot appear not to agree on the benchmarking results and it just may be AMD/Intel differences.
Both our algos use a 6 line core loop. I was wondering why there was so much discrepancy between the results when run on his and my machines, especially on long strings. This is true even when I run Lingo's timelen1.exe test on my machine. I fear there is something about timing that none of us know about, or worse, nothing we can do even if we did know. I use a 1 Ghz AMD Athlon which is a few years old now. But no matter how fast or slow the machine, the same length of loop should be somewhat the same timing. Any comments or speculation would be appreciated. Ratch
Ratch, Lingo-
I love this stuff myself, but it has become painfully obvious that athlons and pentiums do not work the same. You can optimize for one or the other. I have an athlon myself, and I can optimize all day to save a few cycles based on some carefully selected instructions or strategically placed nop's, and a pentium user would find that the new code runs slower than the old. It's really no use to argue with each other if we're not using the same cpu, just present your best and let the rest of us pick whichever we want.
That being said, I do think it's really important to have general purpose routines. There are very few cpus left which don't have mmx capability, but the number that can do sse2 is still a small percentage of the total. SSE2 is fun and very appealing, but I certainly wouldn't use it without checking if the cpu was capable first. And checking if the cpu is capable takes longer than any savings in the code itself on these little gp routines.
Also, I agree, I really think our timing assumptions are seriously flawed. Unfortunately, it's all we have. There needs to be some way to find the real world performance of a routine as it is normally used. We seldom call any of these routines over and over a million times in a tight loop. The real timing test is how long does your code take when it is called just once and is not in the cache. Does it take longer to execute a routine the first time because it is long and complicated than the time it saves by being tricky? How do we test such a thing? I've been playing with calling each routine once in sequence, then repeating the sequence and averaging the results, but I can't get any consistency at all. Anyone have any ideas?
But all together, most of us are enjoying the competition, just keep it civil and keep those great ideas coming!
Ratch, if I may butt in with my two cents, I've heard it mentioned on numerous occasions that trying to get a quantitive, definite value on CPU timings is a sure-fire way to drive yourself insane. (http://dukunbola.com/tagboard/e/spin.gif) This is due to the fact that virtually all processor architectures are different - some use more "pipelines" for concurrently executing or staging (or optimizing) instructions, others have a bigger/smaller L2 cache, some physically handle ALU/MMX/SSE differently (or not at all...) yadda, yadda, yadda. It's like measuring harmonic cabin vibrations of various models of Chrysler vehicles... :bg
The easiest solution is as Michael had said, simply time the proc on as many hardware(s) as you can, and just accept the results. :)
Of course if you're looking to optimize the code for a specific processor, check out Agner Fog's and Michael's optimization guides. Maybe if you compared Intel datasheets with AMD datasheets you might find a minimum and maximum execution time for a specific instruction, but even so, cache and optimization(s) are going to skew the actual timings. Keep It Simple. :)
Jimg
Quote
It's really no use to argue with each other if we're not using the same cpu, just present your best and let the rest of us pick whichever we want.....
Also, I agree, I really think our timing assumptions are seriously flawed. Unfortunately, it's all we have....
Thanks Jimg, your observations are the better than both Lingo's and mine put together. Ratch
Mark Jones,
Quote
The easiest solution is as Michael had said, simply time the proc on as many hardware(s) as you can, and just accept the results. :)
I agree with just about everything you said in your last post. Unfortunately I don't have the hardware or inclination to test my algo on a suite of platforms. But no matter. I usually try to just keep the highly used portions of the programs such as the loops as short as possible, and avoid the "bad" instructions like DIV, LOOP, REP SCASB, XLAT, etc. Ratch
I worked on an algo recently with Jim and we both use different hardware, mine is a 2.8 gig PIV and Jim was testing on a late model AMD pre 64 bit processor. We were getting different times on different code techniques which displayed the hardware differences so together we produced a version that averaged the best across both processors.
This is basically the joys of writing mixed model code where if you want general purpose code, you must test across a number of different machines and correlate the results.
Quote from: Jimg on August 18, 2005, 03:50:17 PM
...I do think it's really important to have general purpose routines. There are very few cpus left which don't have mmx capability, but the number that can do sse2 is still a small percentage of the total. SSE2 is fun and very appealing, but I certainly wouldn't use it without checking if the cpu was capable first. And checking if the cpu is capable takes longer than any savings in the code itself on these little gp routines.
Jimg,
As I have mentioned before in other postings here, none of these routines should be using cpuid directly--ever! Rather, the library startup code containing the routines in question should use cpuid to set feature-DWORDs testable from anywhere using a single memory test instruction. This method reduces the test overhead to a single cycle or less. My overhead for some of this is down to zero--because my library simply refuses to load if mmx and conditional moves are not supported, so no tests for these features are necessary. This affords me complete freedom to employ sse, sse2 and sse3 as I see fit anywhere in my supporting libraries.
Quote
Also, I agree, I really think our timing assumptions are seriously flawed. Unfortunately, it's all we have. There needs to be some way to find the real world performance of a routine as it is normally used. We seldom call any of these routines over and over a million times in a tight loop. The real timing test is how long does your code take when it is called just once and is not in the cache. Does it take longer to execute a routine the first time because it is long and complicated than the time it saves by being tricky? How do we test such a thing? I've been playing with calling each routine once in sequence, then repeating the sequence and averaging the results, but I can't get any consistency at all. Anyone have any ideas?
Clock cycles are the best measure we have of pure cpu cost of execution implied by a particular sequence of code. If you muddy the waters by injecting the differences in cache contents, page faults, motherboard design, chipsets, cpu clock multipliers and financial resources for obtaining the fastest memory sticks, clock cycles will tell you little more a State of the Union address--next to nothing.
Your desire for a real-world test is legitimate and shared by many others, but I am afraid that clock cycle counts are not it. If clock counts represent the best case of a run, why not set up the conditions for the worst case, call the routine exactly once, then report the cycles consumed? Unfortunately, every machine would report something different--not all that useful. Our testing is not flawed at all--you have requirements that go beyond this particular metric, that's all.
Furthermore, some aspects that greatly affect execution time--such as cache utilization and locality of reference--cannot be tested outside the context of an entire running application, nor tested with clock cycles. Real-world performance testing--look elsewhere.
One of my favorite techniques, is to determine the cost of a particular routine in a complete application context by weighting it down with additional known cost, then measuring the drag on overall performance in real time. From this data you can compute the percentage of overall execution time taken by this routine, and hence the real time spent in that routine. You can then compare the difference between your "idealized" clock performance and your real-time performance. However don't be fooled--this difference will vary across machines and other factors.
what do you think about this code? :
szBuffer db 'sdkjahgkyugkuygfkljashdgvlkasgdfkluygqweoiugalsdkf', 0
...
push offset szBuffer
call myStrLength
...
myStrLength:
pop edi
pop esi
push edi
mov edx, esi
ll_Loop:
mov al, byte ptr [esi]
inc esi
test al, al
jnz ll_Loop
dec esi
mov eax, esi
sub eax, edx
retn
Snouphruh, :bg
"what do you think about this code?"
You can use eax and ecx registers (without preserving)
rather than edi and esi
myStrLength:
or ecx, -1
mov eax, [esp+4] ; eax->szBuffer
sub ecx, eax ; ecx->-szBuffer-1
ll_Loop:
cmp byte ptr [eax], 0
lea eax,[eax+1]
jnz ll_Loop
add eax, ecx
ret 4
Regards,
Lingo
nice! very nice!
but I heard LEA is slow.
and MOV EAX, [ESP + 4] takes 4 bytes long.
what if...:
myStrLength:
or ecx, -1
pop edx ; edx = return address
pop eax ; eax = pointer to the buffer
sub ecx, eax
ll_Loop:
cmp byte ptr [eax], 0
lea eax, [eax + 1]
jnz ll_Loop
add eax, ecx
jmp edx
but my previous example has body loop:
mov al, byte ptr [esi]
inc esi
test al, al
jnz ll_Loop
which takes only 2 CPU clocks, 'cause MOV and INC are being paired as well as TEST and JNZ are.
Sorry for that old topic rivial :P
I just would like to have some timing results of these functions. Please, don't forget to write your processor AMD/Intel.
These ones a much slower than the others but I need to choose and I don't know which one is better for Intel.
Thanks a lot
AMD Athlon XP 1800+
Quote
lstrlenA return value : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
lstrlenA : 439 cycles
AzmtStrLen1A : 210 cycles
AzmtStrLen2A : 203 cycles
Press any key to exit...
[attachment deleted by admin]
Celeron M:
lstrlenA return value : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
lstrlenA : 431 cycles
AzmtStrLen1A : 283 cycles
AzmtStrLen2A : 224 cycles
q6600
lstrlenA : 255 cycles
AzmtStrLen1A : 191 cycles
AzmtStrLen2A : 192 cycles
Wow sinsi, I'm surprised that lstrlen does not perform more worstly than that.
PIII 500
lstrlenA return value : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
lstrlenA : 457 cycles
AzmtStrLen1A : 241 cycles
AzmtStrLen2A : 229 cycles
Biterider
Hi
After looking into your code, the only problem i see is that if the string pointer is not aligned to 4 and the string is at the end of an allocated memory page, the algo can produce a GPF.
I suggest to check the lower 2 bits of the string pointer and jump according to them into to comparison chain. Previously you have to set the lower 2 bits to zero and load ecx with the content of an aligned address.
Biterider
You're right, I completly forgot to make the string aligned of a 4 byte boundary.
This algo is made for aligned string so I not planning to change it... but thanks anyway.
:U
[attachment deleted by admin]
I added a (not very well tested) procedure that is essentially one posted by Mark Larson, roughly 3 years ago. I think my very early version of Windows 2000, even though it has the latest SP installed, has a slow version of lstrlen. This is running on a P3:
lstrlenA return value : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
markl_szlen return value : 191
lstrlenA : 826 cycles
AzmtStrLen1A : 238 cycles
AzmtStrLen2A : 227 cycles
markl_szlen : 177 cycles
[attachment deleted by admin]
Seeing the Michael's result, there is no doubt that coding our own string length algo is not worthless.
The problem with this algo of Mark Larsen is that the end of the string must be 4 null-char and not just one.
if mmx isn't a problem, this version (short, quite fast and reduce branch mispredictions to the minimum) :
ALIGN 16
;
; obtenir la taille réelle d'une chaîne de caractères
; note : la chaîne doit être alignée sur 4/8/16 octets (pour une meilleure vitesse)
;
; Syntax :
; mov esi,OFFSET {start address of the string}
; call StringLength_Mmx_Mini
;
; Return :
; eax = string length
;
StringLength_Mmx_Mini PROC
push edx ;; empiler edx
mov eax,esi ;; placer l'adresse de la chaîne dans ecx
; nop ;; ) aucun alignement nécessaire pour un meilleur rendement
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
pxor MM0,MM0 ;; effacer MM0 (ce sera notre registre de comparaison)
Label1: pcmpeqb MM0,QWORD PTR [eax] ;; comparer les octets en parallèle, pour voir si l'un de ceux en MM0 est égal à 0
pmovmskb edx,MM0 ;; générer le masque de MM0 dans edx
add eax,8 ;; ajouter 8 à eax (notre pas de progression)
test edx,edx ;; fixer les flags de edx
jz Label1 ;; si c'est égal à 0, aller Label1
sub eax,esi ;; soustraire l'adresse de départ à eax
bsf edx,edx ;; scanner le premier bit armé à partir de la droite
lea eax,[eax+edx-8] ;; placer eax+edx-8 dans eax
pop edx ;; désempiler edx
ret ;; retourner (sortir de la procédure)
StringLength_Mmx_Mini ENDP
hmm, i don't know the ratio of lamps i could have with this one, but it must be clearly good... (clearly, because of the numbers of lamps, of course... :lol)
That algo gets 47 cycles on mine - very fast! Is it only MMX though? I had to use ".xmm" to get it to compile.
you're right, pmovmskb has been implemented only with sse... :red
I've modified the testbed (variable alignment) and added an xmm-version:
on my Core2Duo:
lstrlenA return value : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
markl_szlen return value : 191
StringLength_Mmx_Min
(MMX/SSE2) return value : 191
StrSizeA(SSE2) value : 191
align 0
lstrlenA : 252 cycles
AzmtStrLen1A : 193 cycles
AzmtStrLen2A : 193 cycles
markl_szlen : 106 cycles
StringLength_Mmx_Min: 72 cycles
StrSizeA(SSE2): 38 cycles
align 1
lstrlenA : 247 cycles
AzmtStrLen1A : 216 cycles
AzmtStrLen2A : 221 cycles
markl_szlen : 152 cycles
StringLength_Mmx_Min: 103 cycles
StrSizeA(SSE2): 102 cycles
align 4
lstrlenA : 244 cycles
AzmtStrLen1A : 193 cycles
AzmtStrLen2A : 192 cycles
markl_szlen : 90 cycles
StringLength_Mmx_Min: 109 cycles
StrSizeA(SSE2): 92 cycles
align 7
lstrlenA : 246 cycles
AzmtStrLen1A : 203 cycles
AzmtStrLen2A : 202 cycles
markl_szlen : 126 cycles
StringLength_Mmx_Min: 96 cycles
StrSizeA(SSE2): 105 cycles
Press any key to exit...
[attachment deleted by admin]
I wrote a similar one a few years ago for my string library...
lszLenMMX FRAME pString
mov eax,[pString]
nop
nop ; fill in stack frame+mov to 8 bytes
pxor mm0,mm0
nop ; fill pxor to 4 bytes
pxor mm1,mm1
nop ; fill pxor to 4 bytes
: ; this is aligned to 16 bytes
movq mm0,[eax]
pcmpeqb mm0,mm1
add eax,8
pmovmskb ecx,mm0
or ecx,ecx
jz <
sub eax,[pString]
bsf ecx,ecx
sub eax,8
add eax,ecx
emms
RET
ENDF
lstrlen() is known not to be the fastest algo.
AFAIK strlen() from MSVCRT is significantly faster. You probably should compare your routine with that version as well.
Quote from: japheth on December 22, 2008, 12:34:26 PM
lstrlen() is known not to be the fastest algo.
AFAIK strlen() from MSVCRT is significantly faster. You probably should compare your routine with that version as well.
Yes, the strlen from the c runtime is quite fast but a little slower than a custom agner fog algo, though I think they use the same tricks. These functions takes speed with long strings but are easy to beat on small strings. But it is a good point that if you don't want to code your algo, the ansi version of strlen from msvcrt is the best alternative to lstrlen. For the unicode version, that's another story.
:U
hmm, if it's sse due to pmovmskb, then a fully sse version :toothy :
ALIGN 16
;
; obtenir la taille réelle d'une chaîne de caractères
; note : la chaîne doit être alignée sur 4/8/16 octets (pour une meilleure vitesse)
;
; Syntax :
; mov esi,OFFSET {start address of the string}
; call StringLength_Sse
;
; Return :
; eax = string length
;
StringLength_Sse PROC
push ecx ;; empiler ecx
push edx ;; empiler edx
mov edx,esi ;; placer l'adresse de départ dans edx
; nop ;; ) aucun alignement nécessaire pour un meilleur rendement
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
pxor XMM0,XMM0 ;; ) effacer XMM0 et XMM1 (ce sera nos registres de comparaison)
pxor XMM1,XMM1 ;; )
; ici, on teste un bloc de 32 caractères, pour voir s'il existe un 0
Label1: pcmpeqb XMM0,OWORD PTR [edx] ;; comparer le qword à l'adresse en edx à XMM0
pcmpeqb XMM1,OWORD PTR [edx+16] ;; comparer le qword à l'adresse en edx+16 à XMM1
por XMM1,XMM0 ;; fusionner XMM1 et XMM0
add edx,OWORD*2 ;; ajouter 32 (notre pas de progression) à edx
pmovmskb eax,XMM1 ;; générer le masque de XMM1 dans eax
test eax,eax ;; fixer les flags de eax
jz Label1 ;; si c'est égal à 0 (pas de 0 trouvé), aller Label1
; ici, on va chercher le 0 dans le bloc de 32 caractères
pmovmskb ecx,XMM0 ;; générer le masque de XMM0 dans ecx
shl eax,16 ;; décaler eax (qui contient déjà le masque XMM1) à gauche d'un dword/2
or eax,ecx ;; fusionner ecx à eax
sub edx,esi ;; enlever l'adresse de départ à edx
sub edx,OWORD*2 ;; enlever la dernière progression à edx
bsf eax,eax ;; scanner le premier bit armé de eax à partir de la droite
add eax,edx ;; ajouter edx à eax pour obtenir la taille finale
pop edx ;; désempiler edx
pop ecx ;; désempiler ecx
ret ;; retourner (sortir de la procédure)
StringLength_Sse ENDP
I use similar algo with masm64... :wink
My results->Windows Vista Ultimate 64bit - SP1
CPU-DualCore Intel Core 2 Duo E8500, 3.16 GHz
C:\My Documents\ASM\strlen>strlena
lstrlenA return value : 191
strlen64Lingo return value: 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
markl_szlen return value : 191
StringLength_Mmx_Min
(MMX/SSE2) return value : 191
StrSizeA(SSE2) value : 191
align 0
lstrlenA : 258 cycles
strlen64Lingo : 20 cycles
AzmtStrLen1A : 191 cycles
AzmtStrLen2A : 191 cycles
markl_szlen : 100 cycles
StringLength_Mmx_Min: 54 cycles
StrSizeA(SSE2): 39 cycles
align 1
lstrlenA : 254 cycles
strlen64Lingo : 20 cycles
AzmtStrLen1A : 220 cycles
AzmtStrLen2A : 218 cycles
markl_szlen : 151 cycles
StringLength_Mmx_Min: 112 cycles
StrSizeA(SSE2): 97 cycles
align 4
lstrlenA : 254 cycles
strlen64Lingo : 20 cycles
AzmtStrLen1A : 191 cycles
AzmtStrLen2A : 191 cycles
markl_szlen : 89 cycles
StringLength_Mmx_Min: 114 cycles
StrSizeA(SSE2): 113 cycles
align 7
lstrlenA : 254 cycles
strlen64Lingo : 20 cycles
AzmtStrLen1A : 200 cycles
AzmtStrLen2A : 200 cycles
markl_szlen : 119 cycles
StringLength_Mmx_Min: 99 cycles
StrSizeA(SSE2): 109 cycles
Press any key to exit...
[attachment deleted by admin]
Hi Lingo,
As usual, your algo beats the hell out of 'em... at least for long strings:
lstrlenA return value : 1024
strlen64Lingo return value: 1024
AzmtStrLen1A return value : 1024
AzmtStrLen2A return value : 1024
markl_szlen return value : 1024
StringLength_Mmx_Min
(MMX/SSE2) return value : 1024
StrSizeA(SSE2) value : 1024
_strlen return value : 1024
align 1k
lstrlenA return value: 1024
lstrlenA : 3032 cycle
strlen64Lingo : 370 cycles
AzmtStrLen1A : 1407 cycle
AzmtStrLen2A : 1407 cycle
markl_szlen : 581 cycles
StringLength_Mmx_Min: 908 cycles
StrSizeA(SSE2): 600 cycles
_strlen (Agner Fog): 635 cycles
align 0
lstrlenA return value: 191
lstrlenA : 630 cycles
strlen64Lingo : 369 cycles
AzmtStrLen1A : 301 cycles
AzmtStrLen2A : 307 cycles
markl_szlen : 276 cycles
StringLength_Mmx_Min: 224 cycles
StrSizeA(SSE2): 114 cycles
_strlen (Agner Fog): 103 cycles
align 1
lstrlenA return value: 191
lstrlenA : 632 cycles
strlen64Lingo : 370 cycles
AzmtStrLen1A : 383 cycles
AzmtStrLen2A : 382 cycles
markl_szlen : 276 cycles
StringLength_Mmx_Min: 304 cycles
StrSizeA(SSE2): 138 cycles
_strlen (Agner Fog): 111 cycles
align 4
lstrlenA return value: 191
lstrlenA : 628 cycles
strlen64Lingo : 371 cycles
AzmtStrLen1A : 301 cycles
AzmtStrLen2A : 304 cycles
markl_szlen : 251 cycles
StringLength_Mmx_Min: 339 cycles
StrSizeA(SSE2): 142 cycles
_strlen (Agner Fog): 114 cycles
align 7
lstrlenA return value: 191
lstrlenA : 628 cycles
strlen64Lingo : 369 cycles
AzmtStrLen1A : 384 cycles
AzmtStrLen2A : 387 cycles
markl_szlen : 274 cycles
StringLength_Mmx_Min: 321 cycles
StrSizeA(SSE2): 144 cycles
_strlen (Agner Fog): 115 cycles
I added the last algo, see attachment.
[attachment deleted by admin]
Thanks, but I can't understand why to use so big strings...
I included A.Fog's algo too, but it is slower... :wink
C:\My Documents\ASM\strlen>strlena
lstrlenA return value : 191
strlen64Lingo return value: 191
A.Fog StrLen return value : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
markl_szlen return value : 191
StringLength_Mmx_Min
(MMX/SSE2) return value : 191
StrSizeA(SSE2) value : 191
align 0
lstrlenA : 259 cycles
strlen64Lingo : 20 cycles
A.Fog StrLen : 36 cycles
AzmtStrLen1A : 192 cycles
AzmtStrLen2A : 191 cycles
markl_szlen : 100 cycles
StringLength_Mmx_Min: 50 cycles
StrSizeA(SSE2): 39 cycles
align 1
lstrlenA : 243 cycles
strlen64Lingo : 20 cycles
A.Fog StrLen : 49 cycles
AzmtStrLen1A : 222 cycles
AzmtStrLen2A : 218 cycles
markl_szlen : 151 cycles
StringLength_Mmx_Min: 113 cycles
StrSizeA(SSE2): 97 cycles
align 4
lstrlenA : 254 cycles
strlen64Lingo : 20 cycles
A.Fog StrLen : 44 cycles
AzmtStrLen1A : 192 cycles
AzmtStrLen2A : 191 cycles
markl_szlen : 89 cycles
StringLength_Mmx_Min: 112 cycles
StrSizeA(SSE2): 110 cycles
align 7
lstrlenA : 243 cycles
strlen64Lingo : 20 cycles
A.Fog StrLen : 49 cycles
AzmtStrLen1A : 200 cycles
AzmtStrLen2A : 200 cycles
markl_szlen : 119 cycles
StringLength_Mmx_Min: 99 cycles
StrSizeA(SSE2): 109 cycles
Press any key to exit...
[attachment deleted by admin]
Quote from: lingo on March 06, 2009, 05:00:36 PM
Thanks, but I can't understand why to use so big strings...
I included A.Fog's algo too, but it is slower... :wink
Strange. Here are my Celeron M results, and the Agner Fog algo is a lot faster on non-aligned short strings... ::)
align 0
lstrlenA return value: 191
lstrlenA : 429 cycles
strlen64Lingo : 198 cycles
AzmtStrLen1A : 283 cycles
AzmtStrLen2A : 223 cycles
markl_szlen : 113 cycles
StringLength_Mmx_Min: 72 cycles
StrSizeA(SSE2): 72 cycles
_strlen (Agner Fog): 91 cycles
align 1
lstrlenA return value: 191
lstrlenA : 422 cycles
strlen64Lingo : 198 cycles
AzmtStrLen1A : 282 cycles
AzmtStrLen2A : 230 cycles
markl_szlen : 144 cycles
StringLength_Mmx_Min: 118 cycles
StrSizeA(SSE2): 87 cycles
_strlen (Agner Fog): 64 cycles
Nothing strange for me...My results are the same as the results of qWord... :wink
On my old lapi with AMD Turion 64 ML-30, 1.6GHz
and Vista64bit Ultimate-SP1:
lstrlenA return value : 191
strlen64Lingo return value: 191
A.Fog StrLen return value : 191
AzmtStrLen1A return value : 191
AzmtStrLen2A return value : 191
markl_szlen return value : 191
StringLength_Mmx_Min
(MMX/SSE2) return value : 191
StrSizeA(SSE2) value : 191
align 0
lstrlenA : 425 cycles
strlen64Lingo : 55 cycles
A.Fog StrLen : 223 cycles
AzmtStrLen1A : 175 cycles
AzmtStrLen2A : 174 cycles
markl_szlen : 109 cycles
StringLength_Mmx_Min: 103 cycles
StrSizeA(SSE2): 101 cycles
align 1
lstrlenA : 425 cycles
strlen64Lingo : 55 cycles
A.Fog StrLen : 211 cycles
AzmtStrLen1A : 213 cycles
AzmtStrLen2A : 218 cycles
markl_szlen : 111 cycles
StringLength_Mmx_Min: 112 cycles
StrSizeA(SSE2): 106 cycles
align 4
lstrlenA : 426 cycles
strlen64Lingo : 55 cycles
A.Fog StrLen : 198 cycles
AzmtStrLen1A : 175 cycles
AzmtStrLen2A : 175 cycles
markl_szlen : 108 cycles
StringLength_Mmx_Min: 112 cycles
StrSizeA(SSE2): 106 cycles
align 7
lstrlenA : 425 cycles
strlen64Lingo : 55 cycles
A.Fog StrLen : 197 cycles
AzmtStrLen1A : 192 cycles
AzmtStrLen2A : 191 cycles
markl_szlen : 113 cycles
StringLength_Mmx_Min: 110 cycles
StrSizeA(SSE2): 104 cycles
Press any key to exit...
Quote from: lingo on March 07, 2009, 04:20:22 PM
Nothing strange for me...My results are the same as the results of qWord... :wink
On my old lapi with AMD Turion 64 ML-30, 1.6GHz
and Vista64bit Ultimate-SP1
Either there are dramatic differences between AMD and a Celeron M, or we are not talking about the same Agner Fog algo.
All of the routines, except Agner Fog's, fail on unaligned read beyond end of the buffer
invoke VirtualAlloc,0,1000h,MEM_COMMIT,PAGE_READWRITE
mov esi,eax
invoke RtlZeroMemory,esi,1000h
invoke AzmtStrLen1A,addr [esi+1000h-1]
invoke AzmtStrLen2A,addr [esi+1000h-1]
invoke markl_szlen,addr [esi+1000h-1]
invoke StringLength_Mmx_Min,addr [esi+1000h-1]
invoke StrSizeA,addr [esi+1000h-1]
lea ecx,[esi+1000h-1]
call strlen64
invoke _strlen,addr [esi+1000h-1]; *** working, not buggy
invoke VirtualFree,esi,0,MEM_RELEASE
zero should be returned, not access violation.
also markl_szlen function is buggy:
.data
teststr db 'ab',0,'a',0
.code
invoke markl_szlen,addr teststr
reports size 4
Hi jj2007:
lstrlenA return value : 1024
strlen64Lingo return value: 1024
AzmtStrLen1A return value : 1024
AzmtStrLen2A return value : 1024
markl_szlen return value : 1024
StringLength_Mmx_Min
(MMX/SSE2) return value : 1024
StrSizeA(SSE2) value : 1024
_strlen return value : 1024
align 1k
lstrlenA return value: 1024
lstrlenA : 1090 cycles
strlen64Lingo : 96 cycles
AzmtStrLen1A : 1071 cycles
AzmtStrLen2A : 1074 cycles
markl_szlen : 416 cycles
StringLength_Mmx_Min: 245 cycles
StrSizeA(SSE2): 226 cycles
_strlen (Agner Fog): 195 cycles
align 0
lstrlenA return value: 191
lstrlenA : 262 cycles
strlen64Lingo : 99 cycles
AzmtStrLen1A : 195 cycles
AzmtStrLen2A : 195 cycles
markl_szlen : 107 cycles
StringLength_Mmx_Min: 49 cycles
StrSizeA(SSE2): 40 cycles
_strlen (Agner Fog): 51 cycles
align 1
lstrlenA return value: 191
lstrlenA : 241 cycles
strlen64Lingo : 98 cycles
AzmtStrLen1A : 204 cycles
AzmtStrLen2A : 205 cycles
markl_szlen : 154 cycles
StringLength_Mmx_Min: 104 cycles
StrSizeA(SSE2): 91 cycles
_strlen (Agner Fog): 40 cycles
align 4
lstrlenA return value: 191
lstrlenA : 240 cycles
strlen64Lingo : 97 cycles
AzmtStrLen1A : 195 cycles
AzmtStrLen2A : 195 cycles
markl_szlen : 106 cycles
StringLength_Mmx_Min: 108 cycles
StrSizeA(SSE2): 104 cycles
_strlen (Agner Fog): 45 cycles
align 7
lstrlenA return value: 191
lstrlenA : 241 cycles
strlen64Lingo : 98 cycles
AzmtStrLen1A : 213 cycles
AzmtStrLen2A : 214 cycles
markl_szlen : 99 cycles
StringLength_Mmx_Min: 105 cycles
StrSizeA(SSE2): 129 cycles
_strlen (Agner Fog): 40 cycles
Result on my computer.
regards herge
Quote from: drizz on March 07, 2009, 05:24:12 PM
All of the routines, except Agner Fog's, fail on unaligned read beyond end of the buffer
...
zero should be returned, not access violation.
technically, -1 should be returned (0 for empty string, and obviously here it's not the case :wink)
anyway, agner's algo isn't (really) safe, he has just displaced the problem from the end of the area to the beginning. so yes, here it works on current systems coz of microsoft functions, but there is no warrantry it will work in the futur... anyway it's not the proper way to solve the problem, your functions dedicated to string should simply contain a "safe area"...
What are you talking about
?!?Quote from: NightWare on March 09, 2009, 12:29:11 AM
technically, -1 should be returned (0 for empty string, and obviously here it's not the case :wink)
but _it is_ an empty string
Quote from: NightWare on March 09, 2009, 12:29:11 AM
anyway, agner's algo isn't (really) safe, he has just displaced the problem from the end of the area to the beginning. so yes, here it works on current systems coz of microsoft functions, but there is no warrantry it will work in the futur...
"coz of microsoft functions"
?!??Which operating system allocates memory pages that are not 4kB or bigger?
Which operating system memory allocation functions return unaligned pointer?
facts please!
VirtualAlloc/NtAllocateMemory = 4kB aligned pointer
HeapAlloc/GlobalAlloc = 8-byte aligned on 32-bit platforms and 16-bytes on 64-bit
Windows, Linux, BSD or Mac, 32-bit x86Quote;************************* strlenSSE2.asm **********************************
; Author: Agner Fog
...
; Operating system: Windows, Linux, BSD or Mac, 32-bit x86
His function is safe, others are not!
AzmtStrLen1A - requires 4-byte aligned pointer
AzmtStrLen2A - requires 4-byte aligned pointer
markl_szlen - requires 16-byte aligned pointer
StringLength_Mmx_Min - requires 8-byte aligned pointer
StrSizeA - requires 16-byte aligned pointer
strlen64 - requires 32-byte aligned pointer
_strlen - no pointer alignment requirements
Quote from: NightWare on March 09, 2009, 12:29:11 AM
anyway, agner's algo isn't (really) safe, he has just displaced the problem from the end of the area to the beginning.
Agner's algo will fail, like all others, in the specific case more thoroughly described here (http://www.masm32.com/board/index.php?topic=10925.msg80375#msg80375). The issue is a non-issue for all real world applications; i.e. you must construct a test case where no null byte is being found near the page boundary,
and the memory is allocated with VirtualAlloc. HeapAlloc'ed memory does not throw an exception if you go some bytes beyond the page boundary.
The trick with his algo is that he starts on a safe boundary, and then uses a bsf to eliminate false hits. While bsf is listed as a very slow opcode in opcodes.chm, with 6-42 cycles, my tests show that it is now down to 2 cycles. The AF algo is pretty fast, and it seems only Lingo's routine can beat it, and for longer strings only.
Quote from: drizz on March 09, 2009, 01:20:35 AM
but _it is_ an empty string
0 must be returned only if the first byte is 0, and no other case, on a manipulated string area you can have no 0 (displacement of the 2nd part+insert) and the algo will fail... coz there is no size limit (+with this logic, you must take care of the possible fullfilled area for ALL of your string functions (copy, insert,...))
Quote from: drizz on March 09, 2009, 01:20:35 AM
"coz of microsoft functions" ?!??
ok here i've misread agner's algo, it's safe until what's previously said
Quote from: NightWare0 for empty string, and obviously here it's not the case
Quote from: NightWare on March 09, 2009, 02:29:59 AM
0 must be returned only if the first byte is 0
the first byte
IS 0 :dazzled: :dazzled: :dazzled:
I'm not talking nonsense situations like jj
Quote from: NightWare on March 09, 2009, 02:29:59 AM
on a manipulated string area you can have no 0 (displacement of the 2nd part+insert) and the algo will fail... coz there is no size limit (+with this logic, you must take care of the possible fullfilled area for ALL of your string functions (copy, insert,...))
:red oops correction... here the non 0 is possible because I USE a safe area... otherwise you're right drizz, no access violation from agner's algo. but i will continue to use my security area...
jj2007, If I am not wrong may be there is an error in your test
program for my results:
strlen64Lingo : 370 cycles -> align 1k vs
strlen64Lingo : 369 cycles ->align 0
and from herge's test:
strlen64Lingo : 96 cycles ->align 1k
strlen64Lingo : 99 cycles ->align 0
What is this: nonsense or manipulation... :lol
from your test:
align 1k
lstrlenA return value: 1024
lstrlenA : 3032 cycle
strlen64Lingo : 370 cycles
AzmtStrLen1A : 1407 cycle
AzmtStrLen2A : 1407 cycle
markl_szlen : 581 cycles
StringLength_Mmx_Min: 908 cycles
StrSizeA(SSE2): 600 cycles
_strlen (Agner Fog): 635 cycles
align 0
lstrlenA return value: 191
lstrlenA : 630 cycles
strlen64Lingo : 369 cycles
AzmtStrLen1A : 301 cycles
AzmtStrLen2A : 307 cycles
markl_szlen : 276 cycles
StringLength_Mmx_Min: 224 cycles
StrSizeA(SSE2): 114 cycles
_strlen (Agner Fog): 103 cycles
align 1k
lstrlenA return value: 1024
lstrlenA : 1090 cycles
strlen64Lingo : 96 cycles
AzmtStrLen1A : 1071 cycles
AzmtStrLen2A : 1074 cycles
markl_szlen : 416 cycles
StringLength_Mmx_Min: 245 cycles
StrSizeA(SSE2): 226 cycles
_strlen (Agner Fog): 195 cycles
align 0
lstrlenA return value: 191
lstrlenA : 262 cycles
strlen64Lingo : 99 cycles
AzmtStrLen1A : 195 cycles
AzmtStrLen2A : 195 cycles
markl_szlen : 107 cycles
StringLength_Mmx_Min: 49 cycles
StrSizeA(SSE2): 40 cycles
_strlen (Agner Fog): 51 cycles
I get the same numbers as herge did - looks like herge has a new computer :bg
Quote from: drizz on March 09, 2009, 02:47:48 AM
I'm not talking nonsense situations like jj
It was precisely my intention to prove that such algos fail only in nonsense situations. There was an earlier thread where people argued that SSE2 is bad because it may read 16 bytes "beyond", except only "one harmless byte".
Quote from: lingo on March 09, 2009, 03:37:18 AM
jj2007, If I am not wrong may be there is an error in your test
..
What is this: nonsense or manipulation... :lol
No manipulation. I posted the source above, so you can check. But the result is pretty odd indeed, worth investigating.
Hi All:
I am having trouble compiling StrLenaLingo.asm
C:\masm32\test>\masm32\bin\ml /c /coff /Zi /Zd /Fl "strlenalingo".asm
Assembling: strlenalingo.asm
strlenalingo.asm(471) : error A2008: syntax error : xmm
strlenalingo.asm(485) : error A2008: syntax error : xmm
strlenalingo.asm(563) : error A2008: syntax error : movdqa
movdqa xmm1, [eax] ; read from nearest preceding boundary << 471
movdqa xmm1, [eax] ; read 16 bytes aligned << 485
@@: movdqa xmm0,OWORD ptr [edx] ; << 563
I don't know much about xmm code.
So I don't know how to fix it.
Regards herge
drizz,
'strlen64 - requires 32-byte aligned pointer'
should be: strlen64 - requires 16-byte aligned pointer :wink
Quote from: jj2007 on March 09, 2009, 07:44:22 AM
Quote from: lingo on March 09, 2009, 03:37:18 AM
jj2007, If I am not wrong may be there is an error in your test
..
What is this: nonsense or manipulation... :lol
No manipulation. I posted the source above, so you can check. But the result is pretty odd indeed, worth investigating.
I investigated, and Lingo is right, there was an error: His code was always called with the address of the 1024 bytes string, which was kind of unfair :red
Now I took the best of two worlds, i.e. Lingo's speed and Agner's brilliant alignment scheme, and threw them together. The result (shown as strlen32) is, ehm, how to put it: just about good enough for my own private library: :bg
EDIT: Bug fixed - there was a "hole" of 16 bytes between the two parts.
Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)
align 1k
lstrlenA return value: 1024
strlen32 return value: 1024
lstrlenA : 3096 cycles
strlen32 : 347 cycles
strlen64Lingo : 371 cycles
StrSizeA(SSE2): 594 cycles
_strlen (Agner Fog): 633 cycles
align 0
lstrlenA return value: 191
strlen32 return value: 191
lstrlenA : 637 cycles
strlen32 : 79 cycles
strlen64Lingo : 92 cycles
StrSizeA(SSE2): 122 cycles
_strlen (Agner Fog): 105 cycles
align 1
lstrlenA return value: 191
strlen32 return value: 191
lstrlenA : 625 cycles
strlen32 : 78 cycles
strlen64Lingo : not possible
StrSizeA(SSE2): 139 cycles
_strlen (Agner Fog): 112 cycles
Here is the algo, full code is attached.
@Herge: It won't compile with Masm v614 - use JWasm instead.
strlen32 proc src:DWORD ; jj 9 March 2007, 92 (down from 103) bytes
mov eax, [esp+4] ; get pointer to string: -- this part taken from Agner Fog --------
mov ecx, eax ; copy pointer
pxor xmm0, xmm0 ; set to zero for comparison
and eax, -16 ; align pointer by 16
and ecx, 15 ; lower 4 bits indicate misalignment
pcmpeqb xmm0, [eax] ; read 16 from nearest preceding boundary and compare with zero
pmovmskb edx, xmm0 ; get one bit for each byte result
shr edx, cl ; shift out false bits
shl edx, cl ; shift back again
bsf edx, edx ; find first 1-bit
jnz fdr1 ; found in round 1
add eax, 16 ; correct aligned pointer for bytes already treated above
pxor xmm0, xmm0 ; reset to zero for comparisons below
pxor xmm1, xmm1
; align 16 ; no good, costs about one cycle extra
@@: pcmpeqb xmm0, [eax] ; -------------- this part taken from Lingo --------------
pcmpeqb xmm1, [eax+16] ; ecx is pointer to initial string, 16-byte aligned
por xmm1, xmm0
pmovmskb edx, xmm1
add eax, 32 ; len counter (moving up costs 3 cycles for the 191 byte string)
test edx, edx
jz @B
pmovmskb ecx, xmm0
shl edx, 16
or edx, ecx
bsf edx, edx
sub eax, 32 ; subtract initial 16 bytes
fdr1: sub eax, [esp+4]
add eax, edx
ret 4
strlen32 endp
[attachment deleted by admin]
Probably a stupid question:
pxor xmm0, xmm0 ; reset to zero for comparisons below
pxor xmm1, xmm1
if 1 ; crashtest - some values will be incorrect
movdqa xmm1, Minus1
endif
; align 16 ; no good, costs about one cycle extra
@@: pcmpeqb xmm0, [eax] ; -------------- this part taken from Lingo --------------
pcmpeqb xmm1, [eax+16] ; ecx is pointer to initial string, 16-byte aligned
por xmm1, xmm0
pmovmskb edx, xmm1
add eax, 32 ; len counter (moving up costs 3 cycles for the 191 byte string)
test edx, edx
jz @B
Why is it apparently not necessary to reset xmm0 and xmm1 inside the loop? If I insert a movdqa xmm1, Minus1 before the loop, the algo will not work correctly for some strings; but although xmm1 changes a lot inside the loop, results seem not to be affected :dazzled:
0.183 cycles per byte seems quite acceptable - a factor 10 faster than lstrlen. In contrast to the P4, here Lingo's algo is a little bit faster for short strings.
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
strlen32 codesize=92
align 4k
lstrlenA return value: 4096
strlen32 return value: 4096
strlen32 : 749 cycles
strlen64Lingo : 761 cycles
_strlen (Agner Fog): 1095 cycles
align 1k
lstrlenA return value: 1024
strlen32 return value: 1024
strlen32 : 199 cycles
strlen64Lingo : 200 cycles
_strlen (Agner Fog): 271 cycles
align 0
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 48 cycles
strlen64Lingo : 44 cycles
_strlen (Agner Fog): 91 cycles
align 1
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 48 cycles
strlen64Lingo : not possible
_strlen (Agner Fog): 64 cycles
align 4
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 48 cycles
strlen64Lingo : not possible
_strlen (Agner Fog): 64 cycles
align 7
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 49 cycles
strlen64Lingo : not possible
_strlen (Agner Fog): 64 cycles
Herge,
You could use MASM 6.15 or later also.
Hi Greg:
I tried the ML.EXE that you can get if you have c++ 2005 from Microsoft.
It compiles okay, but you get a C...5 error access violation and you
send a message to Microsoft when you run the EXE.
lstrlenA return value : 1024
strlen64Lingo return value: 1024
AzmtStrLen1A return value : 1024
AzmtStrLen2A return value : 1024
markl_szlen return value : 1024
StringLength_Mmx_Min
(MMX/SSE2) return value : 1024
StrSizeA(SSE2) value : 1024
_strlen return value : 1024
align 1k
lstrlenA return value: 1024
lstrlenA : 1082 cycles
strlen64Lingo : 84 cycles
AzmtStrLen1A : 1061 cycles
AzmtStrLen2A : 1061 cycles
markl_szlen : 415 cycles
StringLength_Mmx_Min: 275 cycles
StrSizeA(SSE2): 168 cycles
_strlen (Agner Fog): 183 cycles
align 0
lstrlenA return value: 191
lstrlenA : 264 cycles
strlen64Lingo : 85 cycles
AzmtStrLen1A : 194 cycles
AzmtStrLen2A : 194 cycles
markl_szlen : 107 cycles
StringLength_Mmx_Min: 71 cycles
StrSizeA(SSE2): 27 cycles
_strlen (Agner Fog): 37 cycles
align 1
lstrlenA return value: 191
lstrlenA : 240 cycles
strlen64Lingo : 86 cycles
AzmtStrLen1A : 203 cycles
AzmtStrLen2A : 203 cycles
markl_szlen : 154 cycles
StringLength_Mmx_Min: 109 cycles
StrSizeA(SSE2): ; It Blows up HERE!
Micosoft writes a report.
C:\DOCUME~1\User\LOCALS~1\Temp\a488_appcompat.txt
Which for reasons I don't understand I can't find.
It does a dump in a list box you can Not Copy.
Which I must Say is Most helpful!
I believe we get a C5 error access violation.
Attachments StrLenaLingo ASM OBJ EXE PDB
Regards herge
[attachment deleted by admin]
Hi Herge,
There is a new version towards the bottom of page 13 of this thread, in this post (http://www.masm32.com/board/index.php?topic=1807.msg81053#msg81053). You have a previous one with a tiny bug:
StrSizeA proc lpStrA:DWORD
@@: mov edx,DWORD ptr [esp+4]
pxor xmm1,xmm1
mov ecx,edx
neg ecx
align 16
@@: movdqu xmm0,OWORD ptr [edx]
lea edx,[edx+16]
pcmpeqb xmm0,xmm1
pmovmskb eax,xmm0
test eax,eax
jz @B
@@: lea ecx,[edx+ecx-16]
xor edx,edx
bsf edx,eax
lea eax,[ecx+edx]
ret 4
StrSizeA endp
The new version strlen32 is faster and shorter and does not crash.
Hi jj2007:
We Have Lift Off!
lstrlenA return value : 1024
strlen64Lingo return value: 1024
AzmtStrLen1A return value : 1024
AzmtStrLen2A return value : 1024
markl_szlen return value : 1024
StringLength_Mmx_Min
(MMX/SSE2) return value : 1024
StrSizeA(SSE2) value : 1024
_strlen return value : 1024
align 1k
lstrlenA return value: 1024
lstrlenA : 1077 cycles
strlen64Lingo : 84 cycles
AzmtStrLen1A : 1056 cycles
AzmtStrLen2A : 1056 cycles
markl_szlen : 413 cycles
StringLength_Mmx_Min: 275 cycles
StrSizeA(SSE2): 224 cycles
_strlen (Agner Fog): 182 cycles
align 0
lstrlenA return value: 191
lstrlenA : 259 cycles
strlen64Lingo : 83 cycles
AzmtStrLen1A : 194 cycles
AzmtStrLen2A : 193 cycles
markl_szlen : 105 cycles
StringLength_Mmx_Min: 71 cycles
StrSizeA(SSE2): 38 cycles
_strlen (Agner Fog): 37 cycles
align 1
lstrlenA return value: 191
lstrlenA : 238 cycles
strlen64Lingo : 84 cycles
AzmtStrLen1A : 201 cycles
AzmtStrLen2A : 202 cycles
markl_szlen : 152 cycles
StringLength_Mmx_Min: 109 cycles
StrSizeA(SSE2): 91 cycles
_strlen (Agner Fog): 49 cycles
align 4
lstrlenA return value: 191
lstrlenA : 239 cycles
strlen64Lingo : 84 cycles
AzmtStrLen1A : 191 cycles
AzmtStrLen2A : 191 cycles
markl_szlen : 105 cycles
StringLength_Mmx_Min: 95 cycles
StrSizeA(SSE2): 104 cycles
_strlen (Agner Fog): 44 cycles
align 7
lstrlenA return value: 191
lstrlenA : 235 cycles
strlen64Lingo : 84 cycles
AzmtStrLen1A : 211 cycles
AzmtStrLen2A : 210 cycles
markl_szlen : 98 cycles
StringLength_Mmx_Min: 106 cycles
StrSizeA(SSE2): 138 cycles
_strlen (Agner Fog): 49 cycles
Thank you jj2007.
Regards herge
Quote from: jj2007 on March 09, 2009, 04:42:39 PM
Why is it apparently not necessary to reset xmm0 and xmm1 inside the loop?
because xmm0 and xmm1 are defined as zero during the comparisons (until a 0 is found)
here a new one, but must be tested (i've just made few test during conception) :
ALIGN 16
;
; syntax :
; mov esi,OFFSET String
; call NWStrLen
;
; Return :
; eax = String Length
;
NWStrLen PROC
push ecx ;; empiler ecx
push edx ;; empiler edx
mov edx,esi ;; placer l'adresse de départ dans edx
pxor XMM0,XMM0 ;; ) effacer XMM0 et XMM1 (ce sera nos registres de comparaison)
pxor XMM1,XMM1 ;; )
; ici, on teste un bloc de x caractères (dépend de l'alignement), pour voir s'il existe un 0
movdqu XMM2,OWORD PTR [edx] ;; placer l'oword à l'adresse en edx dans XMM2
and edx,0FFFFFFF0h ;; conserver l'alignement 16 précédant dans edx
pcmpeqb XMM0,XMM2 ;; comparer XMM2 à XMM0
pcmpeqb XMM1,OWORD PTR [edx+16] ;; comparer l'oword à l'adresse en edx+16 à XMM1
por XMM1,XMM0 ;; fusionner XMM1 et XMM0
pmovmskb eax,XMM1 ;; générer le masque de XMM1 dans eax
test eax,eax ;; fixer les flags de eax
jz Label1 ;; si c'est égal à 0 (pas de 0 trouvé), aller Label1
; ici, on va chercher le 0 dans le bloc de x caractères
shl eax,16 ;; décaler eax (qui contient déjà le masque XMM1) à gauche d'un dword/2
mov ecx,esi ;; placer l'adresse originelle dans ecx
sub ecx,edx ;; soustraire l'alignement précédant
shr eax,cl ;; décaler eax à droite, correspondant au décalage de l'alignement
pmovmskb ecx,XMM0 ;; générer le masque de XMM0 dans ecx
or eax,ecx ;; fusionner ecx à eax
bsf eax,eax ;; scanner le premier bit armé de eax à partir de la droite
pop edx ;; désempiler edx
pop ecx ;; désempiler ecx
ret ;; retourner (sortir de la procédure)
nop ;; ) alignement nécessaire pour un meilleur rendement
nop ;; )
nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; nop ;; )
; ici, on teste un bloc de 32 caractères, pour voir s'il existe un 0
Label1: add edx,OWORD*2 ;; ajouter 32 (notre pas de progression) à edx
pcmpeqb XMM0,OWORD PTR [edx] ;; comparer l'oword à l'adresse en edx à XMM0
pcmpeqb XMM1,OWORD PTR [edx+16] ;; comparer l'oword à l'adresse en edx+16 à XMM1
por XMM1,XMM0 ;; fusionner XMM1 et XMM0
pmovmskb eax,XMM1 ;; générer le masque de XMM1 dans eax
test eax,eax ;; fixer les flags de eax
jz Label1 ;; si c'est égal à 0 (pas de 0 trouvé), aller Label1
; ici, on va chercher le 0 dans le bloc de 32 caractères
pmovmskb ecx,XMM0 ;; générer le masque de XMM0 dans ecx
shl eax,16 ;; décaler eax (qui contient déjà le masque XMM1) à gauche d'un dword/2
or eax,ecx ;; fusionner ecx à eax
sub edx,esi ;; enlever l'adresse de départ à edx
bsf eax,eax ;; scanner le premier bit armé de eax à partir de la droite
add eax,edx ;; ajouter edx à eax pour obtenir la taille finale
pop edx ;; désempiler edx
pop ecx ;; désempiler ecx
ret ;; retourner (sortir de la procédure)
NWStrLen ENDP
I modified a bit my strlen64 and created new strlen64A (Thanks to NightWare for movdqu idea) :wink
I used jj's test program and have new results:
Intel(R) Core(TM)2 Duo CPU E8500 @ 3.16GHz (SSE4)
strlen32 retval: 5, 10, 15, 20, 25, 30, 35, 40, 45
strlen32 retval: 5, 10, 15, 20, 25, 30, 35, 40, 45
lstrlenA return value : 1024
strlen64Lingo return value: 1024
strlen32 return value: 1024
StrSizeA(SSE2) value : 1024
_strlen return value : 1024
strlen64A return value : 1024
align 1k
lstrlenA return value: 1024
strlen32 return value: 1024
strlen32 : 105 cycles
strlen64Lingo : 84 cycles
strlen64LingoA: 83 cycles
_strlen (Agner Fog): 180 cycles
align 0
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 26 cycles
strlen64Lingo : 18 cycles
strlen64LingoA: 19 cycles
_strlen (Agner Fog): 40 cycles
align 1
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 26 cycles
strlen64Lingo : not possible
strlen64LingoA: 22 cycles
_strlen (Agner Fog): 50 cycles
align 4
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 26 cycles
strlen64Lingo : not possible
strlen64LingoA: 23 cycles
_strlen (Agner Fog): 50 cycles
align 7
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 26 cycles
strlen64Lingo : not possible
strlen64LingoA: 23 cycles
_strlen (Agner Fog): 50 cycles
Press any key to exit...
[attachment deleted by admin]
This is getting good...
Intel(R) Core(TM)2 Quad CPU Q6600 @ 2.40GHz (SSE4)
strlen32 retval: 5, 10, 15, 20, 25, 30, 35, 40, 45
strlen32 retval: 5, 10, 15, 20, 25, 30, 35, 40, 45
lstrlenA return value : 1024
strlen64Lingo return value: 1024
strlen32 return value: 1024
StrSizeA(SSE2) value : 1024
_strlen return value : 1024
strlen64A return value : 1024
align 1k
lstrlenA return value: 1024
strlen32 return value: 1024
strlen32 : 97 cycles
strlen64Lingo : 84 cycles
strlen64LingoA: 78 cycles
_strlen (Agner Fog): 178 cycles
align 0
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 24 cycles
strlen64Lingo : 19 cycles
strlen64LingoA: 20 cycles
_strlen (Agner Fog): 40 cycles
align 1
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 29 cycles
strlen64Lingo : not possible
strlen64LingoA: 23 cycles
_strlen (Agner Fog): 49 cycles
align 4
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 25 cycles
strlen64Lingo : not possible
strlen64LingoA: 23 cycles
_strlen (Agner Fog): 49 cycles
align 7
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 24 cycles
strlen64Lingo : not possible
strlen64LingoA: 23 cycles
_strlen (Agner Fog): 49 cycles
Hey jj, the CPU identification is good, now add the Windows version to it as well. :bg
Quote from: sinsi on March 10, 2009, 05:45:06 AM
This is getting good...
...
Hey jj, the CPU identification is good, now add the Windows version to it as well. :bg
XP unless otherwise specified. Speedwise, it should not make any difference. You may check this thread (http://www.masm32.com/board/index.php?topic=8802.msg64219#msg64219), but warning, what M$ expects us to do to detect the version is no good for your mental health.
I have incorporated Lingo's new algo, and replaced lstrlen with crt_strlen because lstrlen is no longer a serious competitor for these algos.
Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)
codesizes: strlen32=92, strlen64A=117, _strlen=66
-- test 16k return values jj, Lingo, Agner: 16384, 16384, 16384
crt_strlen : 16155 cycles
strlen32 : 4819 cycles
strlen64LingoA : 6208 cycles
_strlen (Agner Fog): 10044 cycles
-- test 4k return values jj, Lingo, Agner: 4096, 4096, 4096
crt_strlen : 3973 cycles
strlen32 : 1144 cycles
strlen64LingoA : 1137 cycles
_strlen (Agner Fog): 2308 cycles
-- test 1k return values jj, Lingo, Agner: 1024, 1024, 1024
crt_strlen : 1046 cycles
strlen32 : 362 cycles
strlen64LingoA : 357 cycles
_strlen (Agner Fog): 651 cycles
-- test 0 return values jj, Lingo, Agner: 191, 191, 191
crt_strlen : 260 cycles
strlen32 : 73 cycles
strlen64LingoA : 78 cycles
_strlen (Agner Fog): 108 cycles
-- test 1 return values jj, Lingo, Agner: 191, 191, 191
crt_strlen : 255 cycles
strlen32 : 84 cycles
strlen64LingoA : 91 cycles
_strlen (Agner Fog): 115 cycles
-- test 4 return values jj, Lingo, Agner: 191, 191, 191
crt_strlen : 242 cycles
strlen32 : 78 cycles
strlen64LingoA : 80 cycles
_strlen (Agner Fog): 116 cycles
-- test 7 return values jj, Lingo, Agner: 191, 191, 191
crt_strlen : 257 cycles
strlen32 : 79 cycles
strlen64LingoA : 80 cycles
_strlen (Agner Fog): 111 cycles
[attachment deleted by admin]
Hi jj2007:
Good Morning here are my results for
strlenSSE2.exe
Intel(R) Core(TM)2 Duo CPU E4600 @ 2.40GHz (SSE4)
codesizes: strlen32=92, strlen64A=117, _strlen=66
-- test 16k return values Lingo, jj, Agner: 16384, 16384, 16384
crt_strlen : 9666 cycles
strlen32 : 1479 cycles
strlen64LingoA : 1139 cycles
_strlen (Agner Fog): 2817 cycles
-- test 4k return values Lingo, jj, Agner: 4096, 4096, 4096
crt_strlen : 2427 cycles
strlen32 : 405 cycles
strlen64LingoA : 333 cycles
_strlen (Agner Fog): 720 cycles
-- test 1k return values Lingo, jj, Agner: 1024, 1024, 1024
crt_strlen : 648 cycles
strlen32 : 101 cycles
strlen64LingoA : 98 cycles
_strlen (Agner Fog): 197 cycles
-- test 0 return values Lingo, jj, Agner: 191, 191, 191
crt_strlen : 123 cycles
strlen32 : 26 cycles
strlen64LingoA : 20 cycles
_strlen (Agner Fog): 56 cycles
-- test 1 return values Lingo, jj, Agner: 191, 191, 191
crt_strlen : 122 cycles
strlen32 : 26 cycles
strlen64LingoA : 33 cycles
_strlen (Agner Fog): 40 cycles
-- test 4 return values Lingo, jj, Agner: 191, 191, 191
crt_strlen : 122 cycles
strlen32 : 26 cycles
strlen64LingoA : 23 cycles
_strlen (Agner Fog): 46 cycles
-- test 7 return values Lingo, jj, Agner: 191, 191, 191
crt_strlen : 119 cycles
strlen32 : 26 cycles
strlen64LingoA : 23 cycles
_strlen (Agner Fog): 40 cycles
Press any key to exit...
Regards herge
On my old lapi with Vista64 Ultimate SP1: :wink
AMD Turion(tm) 64 Mobile Technology ML-30 (SSE3)
strlen32 retval: 5, 10, 15, 20, 25, 30, 35, 40, 45
strlen32 retval: 5, 10, 15, 20, 25, 30, 35, 40, 45
lstrlenA return value : 1024
strlen64Lingo return value: 1024
strlen32 return value: 1024
StrSizeA(SSE2) value : 1024
_strlen return value : 1024
strlen64A return value : 1024
align 1k
lstrlenA return value: 1024
strlen32 return value: 1024
strlen32 : 285 cycles
strlen64Lingo : 236 cycles
strlen64LingoA: 236 cycles
_strlen (Agner Fog): 942 cycles
align 0
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 109 cycles
strlen64Lingo : 53 cycles
strlen64LingoA: 54 cycles
_strlen (Agner Fog): 223 cycles
align 1
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 74 cycles
strlen64Lingo : not possible
strlen64LingoA: 64 cycles
_strlen (Agner Fog): 197 cycles
align 4
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 74 cycles
strlen64Lingo : not possible
strlen64LingoA: 64 cycles
_strlen (Agner Fog): 198 cycles
align 7
lstrlenA return value: 191
strlen32 return value: 191
strlen32 : 74 cycles
strlen64Lingo : not possible
strlen64LingoA: 64 cycles
_strlen (Agner Fog): 197 cycles
Press any key to exit...
AMD Turion(tm) 64 Mobile Technology ML-30 (SSE3)
codesizes: strlen32=92, strlen64A=117, _strlen=66
-- test 16k return values Lingo, jj, Agner: 16384, 16384, 16384
crt_strlen : 16537 cycles
strlen32 : 3182 cycles
strlen64LingoA : 3126 cycles
_strlen (Agner Fog): 14014 cycles
-- test 4k return values Lingo, jj, Agner: 4096, 4096, 4096
crt_strlen : 4132 cycles
strlen32 : 867 cycles
strlen64LingoA : 815 cycles
_strlen (Agner Fog): 3537 cycles
-- test 1k return values Lingo, jj, Agner: 1024, 1024, 1024
crt_strlen : 1051 cycles
strlen32 : 288 cycles
strlen64LingoA : 236 cycles
_strlen (Agner Fog): 939 cycles
-- test 0 return values Lingo, jj, Agner: 191, 191, 191
crt_strlen : 222 cycles
strlen32 : 113 cycles
strlen64LingoA : 54 cycles
_strlen (Agner Fog): 225 cycles
-- test 1 return values Lingo, jj, Agner: 191, 191, 191
crt_strlen : 217 cycles
strlen32 : 76 cycles
strlen64LingoA : 65 cycles
_strlen (Agner Fog): 197 cycles
-- test 4 return values Lingo, jj, Agner: 191, 191, 191
crt_strlen : 214 cycles
strlen32 : 76 cycles
strlen64LingoA : 63 cycles
_strlen (Agner Fog): 197 cycles
-- test 7 return values Lingo, jj, Agner: 191, 191, 191
crt_strlen : 211 cycles
strlen32 : 76 cycles
strlen64LingoA : 64 cycles
_strlen (Agner Fog): 198 cycles
Press any key to exit...
Thanks, very interesting. It seems the two algos are roughly equivalent, with Lingo's a bit stronger on AMD and Core2 (Herge) and mine stronger on P4's and (marginally) on Celeron M. In any case, Hutch faces a difficult choice for the next Masm32 version:
-- test 1k --
Masm32 lib szLen : 2215 cycles
crt_strlen : 1042 cycles
strlen32 : 354 cycles
strlen64LingoA : 354 cycles
_strlen (Agner Fog): 648 cycles
-- test aligned 1, 191 bytes --
Masm32 lib szLen : 515 cycles
crt_strlen : 262 cycles
strlen32 : 73 cycles
strlen64LingoA : 105 cycles
_strlen (Agner Fog): 111 cycles
A factor 6-7 on one of the most popular functions is not so bad :green2
I can't understand what happen with your PC or with you... :lol
New nonsense about the same program and test:
'strlen64LingoA : 105 cycles !!!'
Pls, take a look of your previous messages about the same test and program..
Where is the true?
Quote from: lingo on March 10, 2009, 02:03:40 PM
I can't understand what happen with your PC or with you... :lol
New nonsense about the same program and test:
'strlen64LingoA : 105 cycles !!!'
Pls, take a look of your previous messages about the same test and program..
Where is the true?
The truth is that timings tend to be not 100% accurate, and that I have a P4 in office, and a Celeron M at home. Your algo is marginally slower than mine
on a P4 for short unaligned strings... no need to panic, dear friend :thumbu
Here are some more timings with a higher LOOP_COUNT:
-- test 0 0=perfectly aligned on 16-byte boundary
crt_strlen : 243 cycles
strlen32 : 74 cycles
strlen64LingoA : 71 cycles
_strlen (Agner Fog): 105 cycles
-- test 1 1=misaligned 1 byte
crt_strlen : 247 cycles
strlen32 : 75 cycles
strlen64LingoA : 90 cycles
_strlen (Agner Fog): 111 cycles
-- test 4 return values
crt_strlen : 240 cycles
strlen32 : 76 cycles
strlen64LingoA : 81 cycles
_strlen (Agner Fog): 130 cycles
-- test 7 return values
crt_strlen : 243 cycles
strlen32 : 74 cycles
strlen64LingoA : 83 cycles
_strlen (Agner Fog): 114 cycles
Your algo seems faster on AMD and Core Duo. In any case, you should be proud of having found an algo that is 5 times as fast as the fastest M$ algo, and (for longer strings) twice as fast as the latest Agner Fog algo. My own one is a minor adaption of yours, so the credits go to you anyway :U
:bg
> In any case, Hutch faces a difficult choice for the next Masm32 version:
Yeah ?
No I don't, I have been watching musical chairs on string length algos for at least the last 10 years, in about 99.9999999999999999999999999% of cases the slow byte scanner is more than fast enough and in the .0 --- 0001% of other cases Agner Fog's algo is even more than fast enough. Speed is greate but it must also be useful gains and string length algos are rarely ever a big deal.
On a native 64 bit box it should be a toss between native 64 bit and emulated 128 bit SSE3/4/? on paragraph alignment, shame most string data is aligned to 1.
Quote from: hutch-- on March 10, 2009, 02:38:50 PM
:bg
... paragraph alignment, shame most string data is aligned to 1.
Shame you don't read the posts in your own forum. Both 'winner' algos have no problem with misalignment.
:(
I was not satisfied with the performance of my algo for short strings, so I fumbled together a variant, strlen32b. It is now almost on par with Lingo's algo for short strings, and about 2% faster for very long strings. Of course, AMD, Core2 and P4 might look different again - the Celeron M is "Core" but not "Core Duo" ::)
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
codesizes: strlen32=92, strlen32b=114, strlen64A=117, _strlen=66
ERROR in strlen64A at ct 16: 14 bytes instead of 15
-- test 16k return values Lingo, jj, Agner: 16384, 16384, 16384
strlen32 : 2881 cycles
strlen32b : 2936 cycles
strlen64LingoA : 3024 cycles
_strlen (Agner Fog): 4250 cycles
-- test 4k return values Lingo, jj, Agner: 4096, 4096, 4096
crt_strlen : 3800 cycles
strlen32 : 743 cycles
strlen32b : 744 cycles
strlen64LingoA : 774 cycles
_strlen (Agner Fog): 1103 cycles
-- test 0 return values Lingo, jj, Agner: 95, 95, 95
crt_strlen : 101 cycles
strlen32 : 31 cycles
strlen32b : 29 cycles
strlen64LingoA : 25 cycles
_strlen (Agner Fog): 30 cycles
-- test 1 return values Lingo, jj, Agner: 95, 95, 95
crt_strlen : 111 cycles
strlen32 : 31 cycles
strlen32b : 35 cycles
strlen64LingoA : 37 cycles
_strlen (Agner Fog): 34 cycles
-- test 3 return values Lingo, jj, Agner: 11, 14, 14
crt_strlen : 23 cycles
strlen32 : 20 cycles
strlen32b : 16 cycles
strlen64LingoA : 14 cycles
_strlen (Agner Fog): 14 cycles
-- test 15 return values Lingo, jj, Agner: -1, 14, 14
crt_strlen : 23 cycles
strlen32 : 20 cycles
strlen32b : 16 cycles
strlen64LingoA : 14 cycles
_strlen (Agner Fog): 14 cycles
[attachment deleted by admin]
JJ,
hmmm? No kidding? Misalignment of a 'one byte aligned' string... I do not know whether to laugh or cry. I do not think you 'got' the point Hutch was trying to make.
Paul
hi lingo,
por xmm1, xmm0
pxor xmm2, xmm2 ; why ? why do you want to use xmm2 ?
pmovmskb edx, xmm1
pxor xmm1, xmm1 ; why ? if the mask in edx = 0 then both xmm0 and xmm1 = 0
test edx, edx
>>jnz<< Ex_1
hi jj, same thing for you in your algo, why do you want to clean the simd registers when it's not needed ?
plus, if you really want to take care of the speed for small strings, you should avoid the jump (it's a potential misprediction).
Hi jj2007:
Here is my latest results:
Intel(R) Core(TM)2 Duo CPU E4600 @ 2.40GHz (SSE4)
codesizes: strlen32=92, strlen32b=114, strlen64A=117, _strlen=66
ERROR in TestAlgo at ct 16: 14 bytes instead of 15
-- test 16k return values Lingo, jj, Agner: 16384, 16384, 16384
strlen32 : 1503 cycles
strlen32b : 1512 cycles
strlen64LingoA : 1138 cycles
_strlen (Agner Fog): 2814 cycles
-- test 4k return values Lingo, jj, Agner: 4096, 4096, 4096
crt_strlen : 2425 cycles
strlen32 : 410 cycles
strlen32b : 403 cycles
strlen64LingoA : 325 cycles
_strlen (Agner Fog): 716 cycles
-- test 0 return values Lingo, jj, Agner: 95, 95, 95
crt_strlen : 61 cycles
strlen32 : 16 cycles
strlen32b : 15 cycles
strlen64LingoA : 14 cycles
_strlen (Agner Fog): 19 cycles
-- test 1 return values Lingo, jj, Agner: 95, 95, 95
crt_strlen : 61 cycles
strlen32 : 15 cycles
strlen32b : 18 cycles
strlen64LingoA : 26 cycles
_strlen (Agner Fog): 20 cycles
-- test 3 return values Lingo, jj, Agner: 11, 14, 14
crt_strlen : 14 cycles
strlen32 : 10 cycles
strlen32b : 8 cycles
strlen64LingoA : 6 cycles
_strlen (Agner Fog): 7 cycles
-- test 15 return values Lingo, jj, Agner: -1, 14, 14
crt_strlen : 14 cycles
strlen32 : 10 cycles
strlen32b : 8 cycles
strlen64LingoA : 6 cycles
_strlen (Agner Fog): 7 cycles
Press any key to exit...
Regards herge
Quote from: PBrennick on March 11, 2009, 12:48:59 AM
JJ,
hmmm? No kidding? Misalignment of a 'one byte aligned' string... I do not know whether to laugh or cry. I do not think you 'got' the point Hutch was trying to make.
Paul
Explain, please. I don't get your point.
Well, jj, put it this way...
ALIGN 1
Unless we can ALIGN 0.5, byte align is...everything...'get' it?
Quote from: NightWare on March 11, 2009, 01:25:59 AM
hi jj, same thing for you in your algo, why do you want to clean the simd registers when it's not needed ?
plus, if you really want to take care of the speed for small strings, you should avoid the jump (it's a potential misprediction).
Hi NightWare, thanks a lot for reading this thoroughly :thumbu
I have wondered myself whether clearing is not needed in some places, but was not sure. In fact, I took one out tonight, see below, ; pxor xmm0, xmm0. Could you please indicate where you consider it not needed?
As to the nullptr jump, I had to introduce it because it failed for null strings (and I admit I ws too tired to analyse the reason; plus, my Olly version here does not display xmm registers :dazzled:).
strlen32b proc src:DWORD ; jj 9 March 2007, 92 (down from 103) bytes; 0.176 cycles/byte at 16k
mov ecx, [esp+4] ; get pointer to string: -- this part taken from Agner Fog ----
mov al, [ecx] ; test for Null$
test al, al
je nullptr
pxor xmm0, xmm0 ; set to zero for comparison
mov eax, ecx ; copy pointer
pxor xmm1, xmm1
and eax, -16 ; align pointer by 16
and ecx, 15 ; lower 4 bits indicate misalignment
pcmpeqb xmm1, [eax] ; read 16 from nearest preceding boundary and compare with zero
; lea eax, [eax+16]
add eax, 16
pcmpeqb xmm0, [eax] ; ecx is pointer to initial string, 16-byte aligned
por xmm1, xmm0
pmovmskb edx, xmm1 ; get one bit for each byte result; - OK, isnull, stays null, Z=1, Bad: notnull, willbenull, z=0
shr edx, cl ; shift out false bits ** compliments to Agner, **
shl edx, cl ; shift back again ** this is called genius ;-) **
test edx, edx
jnz fdr1
add eax, 16 ; correct aligned pointer for bytes already treated above (lea exactly same cycles)
; pxor xmm0, xmm0 (must be 0) ; reset to zero for comparisons below
pxor xmm1, xmm1 ; align 16 no good, costs about one cycle extra
@@: pcmpeqb xmm0, [eax] ; ------ this part taken from Lingo, with adaptions ------
pcmpeqb xmm1, [eax+16] ; ecx is pointer to initial string, 16-byte aligned
por xmm1, xmm0
; add eax, 32 ; is marginally slower than lea
lea eax, [eax+32] ; len counter (moving up lea or add costs 3 cycles for the 191 byte string)
pmovmskb edx, xmm1
test edx, edx
jz @B
sub eax, 32 ; subtract initial bytes
fdr1: pmovmskb ecx, xmm0
shl edx, 16 ; bswap works, too, but one cycle slower
or edx, ecx
bsf edx, edx
add eax, edx
sub eax, [esp+4]
nullptr: ret 4
strlen32b endp
Quote from: sinsi on March 11, 2009, 07:03:23 AM
Well, jj, put it this way...
ALIGN 1
Unless we can ALIGN 0.5, byte align is...everything...'get' it?
Yeah, of course :boohoo: I was desperately trying to find
align 1 in the code I posted, but now I realise Paul means Hutch's statement. But again, this is deliberately trying to misunderstand him: What he meant (apparently - I also risk to misinterpret him) is that strings are in general
not aligned on a paragraph or even dword border, and that algos for aligned strings are therefore pretty useless. Lingo's first version had that problem, but he fixed it (but still has a minor problem for very short strings). So both "fast" algos are general purpose,
if you assume that the user has a modern CPU
I think that strings > MAX_PATH are rare - most strings (that you need to get the length of) are short (filenames etc.) so there is no need for a 'long' string scanner.
Mind you, for a long string scanner, you have possibly opened up a small niche, and the replies re optimisation can often apply to other bits of code.
Thanks to you, I've been looking at SSEx instructions and broadening my asm horizons (at least I think 'thanks, but' since they're a bit hard atm :bdg)
Hi jj2007:
Use windbg from Microsoft. It does display the xmm registers.
Regards herge
Quote from: sinsi on March 11, 2009, 07:26:47 AM
I think that strings > MAX_PATH are rare - most strings (that you need to get the length of) are short (filenames etc.) so there is no need for a 'long' string scanner.
Mind you, for a long string scanner, you have possibly opened up a small niche, and the replies re optimisation can often apply to other bits of code.
Thanks to you, I've been looking at SSEx instructions and broadening my asm horizons (at least I think 'thanks, but' since they're a bit hard atm :bdg)
Long strings are rare, that's correct. But the new algos are a factor 3 faster than
len(My$):
-- test 0 return values Lingo, jj, Agner: 62, 62, 62
Masm32 lib szLen : 88 cycles
crt_strlen : 81 cycles
strlen32b : 23 cycles
strlen64LingoA : 24 cycles
_strlen (Agner Fog): 26 cycles
-- test 1 return values Lingo, jj, Agner: 62, 62, 62
Masm32 lib szLen : 89 cycles
crt_strlen : 81 cycles
strlen32b : 23 cycles
strlen64LingoA : 23 cycles
_strlen (Agner Fog): 22 cycles
D:\masm32\examples\exampl10\timer_demos\unroll\unroll_test.exe has 62 bytes ;-)
:bg
> Shame you don't read the posts in your own forum. Both 'winner' algos have no problem with misalignment.
Years of reading posts leave you with a reasonably good idea of the value of an "atom cracking" string length algo. Let me think, "As useful as a hip pocket in a singlet", wjhat about the world's fastest "MessageBoxA" algo ? How about a hobbling horse in the Kentucky Derby ? :P
Quote from: herge on March 11, 2009, 07:29:29 AM
Hi jj2007:
Use windbg from Microsoft. It does display the xmm registers.
Regards herge
Thanks, herge. I will look into it. Olly has the same capacity, but it is somewhat hidden in the options.
Hi jj2007:
See attachment a picture of windbg in action.
Regards herge
[attachment deleted by admin]
Hutch,
... llike a screen door on a submarine.
Paul
"why do you want to use xmm2 ?"
Thanks NightWare, it was from other similar algos...
IMO we may need several strlen algos to use in the application.
For example: strlenA for bigger strings and strlenB for short strings...
Intel(R) Core(TM)2 Duo CPU E8500 @ 3.16GHz (SSE4)
codesizes: strlen32=92, strlen32b=114, strlen64A=112, strlen64B=87, _strlen=66
-- test 16k return values LingoA,LingoB, jj, Agner: 16384, 16384, 163
4, 16384
strlen32 : 1577 cycles
strlen32b : 1585 cycles
strlen64LingoA : 1553 cycles
strlen64LingoB : 1604 cycles
_strlen (Agner Fog): 2793 cycles
-- test 4k return values LingoA,LingoB, jj, Agner: 4096, 4096, 4096, 4096
crt_strlen : 2727 cycles
strlen32 : 420 cycles
strlen32b : 421 cycles
strlen64LingoA : 405 cycles
strlen64LingoB : 412 cycles
_strlen (Agner Fog): 716 cycles
-- test 0 return values LingoA,LingoB, jj, Agner: 95, 95, 95, 95
crt_strlen : 77 cycles
strlen32 : 17 cycles
strlen32b : 15 cycles
strlen64LingoA : 11 cycles
strlen64LingoB : 13 cycles
_strlen (Agner Fog): 19 cycles
-- test 1 return values LingoA,LingoB, jj, Agner: 95, 95, 95, 95
crt_strlen : 79 cycles
strlen32 : 17 cycles
strlen32b : 19 cycles
strlen64LingoA : 28 cycles
strlen64LingoB : 25 cycles
_strlen (Agner Fog): 20 cycles
-- test 3 return values LingoA,LingoB, jj, Agner: 14, 14, 14, 14
crt_strlen : 17 cycles
strlen32 : 10 cycles
strlen32b : 8 cycles
strlen64LingoA : 6 cycles
strlen64LingoB : 4 cycles
_strlen (Agner Fog): 7 cycles
-- test 15 return values LingoA,LingoB, jj, Agner: 14, 14, 14, 14
crt_strlen : 16 cycles
strlen32 : 10 cycles
strlen32b : 8 cycles
strlen64LingoA : 6 cycles
strlen64LingoB : 3 cycles
_strlen (Agner Fog): 7 cycles
Press any key to exit...
[attachment deleted by admin]
I imagine this because there are lots of timings posted on this and other topics.
Wouldnt it be real nice to be able to write
code and is profiled as your writing it...youd get timings instantly !
Timings that would be identical to what youd get as you do timings now, manually.
Or even written and profiled simultaneously
as if your on a different processor altogether. Clusters ? Parallel ?
Code would be profiled by speed, security, or memory...
just daydreaming. I know this kind of editor
would have to be partially if not fully written
in assembler, and not in my lifetime ? Open source ?
IT PROBABLY IS NOT AS DIFFICULT AS IT SEEMS, ON SOME LEVELS.
I know you think I am going toward 'the super optimizing compiler' direction.
More like 'the supervising optimizing compiler'.
Quote from: jj2007 on March 11, 2009, 07:05:49 AM
I have wondered myself whether clearing is not needed in some places, but was not sure. In fact, I took one out tonight, see below, ; pxor xmm0, xmm0. Could you please indicate where you consider it not needed?
pxor xmm1,xmm1 just after is also useless coz you have jumped to fdr1 if it's not equal to 0. :wink
Hi lingo:
Results from my computer.
Intel(R) Core(TM)2 Duo CPU E4600 @ 2.40GHz (SSE4)
codesizes: strlen32=92, strlen32b=114, strlen64A=112, strlen64B=87, _strlen=66
-- test 16k return values LingoA,LingoB, jj, Agner: 16384, 16384, 16384, 16384
strlen32 : 1491 cycles
strlen32b : 1521 cycles
strlen64LingoA : 1140 cycles
strlen64LingoB : 1297 cycles
_strlen (Agner Fog): 2862 cycles
-- test 4k return values LingoA,LingoB, jj, Agner: 4096, 4096, 4096, 4096
crt_strlen : 2443 cycles
strlen32 : 401 cycles
strlen32b : 410 cycles
strlen64LingoA : 353 cycles
strlen64LingoB : 325 cycles
_strlen (Agner Fog): 730 cycles
-- test 0 return values LingoA,LingoB, jj, Agner: 95, 95, 95, 95
crt_strlen : 66 cycles
strlen32 : 17 cycles
strlen32b : 14 cycles
strlen64LingoA : 12 cycles
strlen64LingoB : 14 cycles
_strlen (Agner Fog): 23 cycles
-- test 1 return values LingoA,LingoB, jj, Agner: 95, 95, 95, 95
crt_strlen : 62 cycles
strlen32 : 18 cycles
strlen32b : 18 cycles
strlen64LingoA : 31 cycles
strlen64LingoB : 25 cycles
_strlen (Agner Fog): 21 cycles
-- test 3 return values LingoA,LingoB, jj, Agner: 14, 14, 14, 14
crt_strlen : 15 cycles
strlen32 : 11 cycles
strlen32b : 10 cycles
strlen64LingoA : 6 cycles
strlen64LingoB : 2 cycles
_strlen (Agner Fog): 7 cycles
-- test 15 return values LingoA,LingoB, jj, Agner: 14, 14, 14, 14
crt_strlen : 14 cycles
strlen32 : 10 cycles
strlen32b : 8 cycles
strlen64LingoA : 6 cycles
strlen64LingoB : 3 cycles
_strlen (Agner Fog): 7 cycles
Press any key to exit...
Regards herge
Quote from: NightWare on March 11, 2009, 10:43:53 PM
Quote from: jj2007 on March 11, 2009, 07:05:49 AM
I have wondered myself whether clearing is not needed in some places, but was not sure. In fact, I took one out tonight, see below, ; pxor xmm0, xmm0. Could you please indicate where you consider it not needed?
pxor xmm1,xmm1 just after is also useless coz you have jumped to fdr1 if it's not equal to 0. :wink
I thought so, too. But the
shr edx, cl (shift out false bits) trick has one nasty side effect: You might have an FF somewhere in xmm1 because there was a zero byte
before your misaligned string:
align 16
db 15 dup (0)
szTest_Fail db "my other brother darryl my other brother darryl"
db 255, 255, 255, 0
Now one might argue that no sane person has a string with FF/255 bytes. But it fails exactly for this case (I tested it) :wink
Quote from: jj2007 on March 11, 2009, 11:09:04 PM
I thought so, too. But the shr edx, cl (shift out false bits) trick has one nasty side effect: You might have an FF somewhere in xmm1 because there was a zero byte before your misaligned string:
hmm, for example you could use (in your strlen32 algo) :
pxor xmm0,xmm0
movdqu xmm1,[eax]
pcmpeqb xmm1,xmm0 ; <- here you will have the same result as pxor xmm1,xmm1 if there is no 0
and eax,0FFFFFFF0h
pmovmskb edx,xmm1
...
and no need for shr/shl edx,cl
Quote from: NightWare on March 11, 2009, 11:49:15 PM
Quote from: jj2007 on March 11, 2009, 11:09:04 PM
I thought so, too. But the shr edx, cl (shift out false bits) trick has one nasty side effect: You might have an FF somewhere in xmm1 because there was a zero byte before your misaligned string:
hmm, for example you could use (in your strlen32 algo) :
pxor xmm0,xmm0
movdqu xmm1,[eax]
pcmpeqb xmm1,xmm0 ; <- here you will have the same result as pxor xmm1,xmm1 if there is no 0
and eax,0FFFFFFF0h
pmovmskb edx,xmm1
...
and no need for shr/shl edx,cl
Thanks, NightWare. In the meantime, I had found a different way to overcome this, a repeated
pcmpeqb xmm0, [eax]:
strlen32s proc src:DWORD ; jj 12 March 2007, 89 bytes; 0.176 cycles/byte at 16k
mov ecx, [esp+4] ; get pointer to string: -- this part taken from Agner Fog ----
pxor xmm0, xmm0 ; zero for comparison
movups xmm1, [ecx] ; move 16 bytes into xmm1, unaligned (adapted from Lingo)
pcmpeqb xmm1, xmm0 ; set bytes in xmm2 to FF if nullbytes found in xmm1
pmovmskb edx, xmm1 ; set byte mask in edx
bsf eax, edx ; bit scan forward
jne Le16 ; return bsf index if a bit was set
mov eax, ecx ; copy pointer
and eax, -16 ; align pointer by 16
pxor xmm1, xmm1 ; zero for comparison
and ecx, 15 ; lower 4 bits indicate misalignment
je @F ; jumping is a few cycles faster
pcmpeqb xmm0, [eax] ; force FF's into false positives (the SSE2 equivalent to Agner's shr/shl trick)
@@: pcmpeqb xmm0, [eax] ; ------ this part taken from Lingo, with adaptions ------
pcmpeqb xmm1, [eax+16] ; ecx is pointer to initial string, 16-byte aligned
por xmm1, xmm0
lea eax, [eax+32] ; len counter (moving up lea or add costs 3 cycles for the 191 byte string)
pmovmskb edx, xmm1
test edx, edx
jz @B
pmovmskb ecx, xmm0
shl edx, 16 ; bswap works, too, but one cycle slower
or edx, ecx
bsf edx, edx
lea eax, [eax+edx-32] ; add scan index, subtract initial bytes
sub eax, [esp+4]
Le16: ret 4
strlen32s endp
New Timings:
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
codesizes: strlen32s=89, strlen64B=87, _strlen=66
-- test 16k return values Lingo, jj, Agner: 16384, 16384, 16384
crt_strlen : 15288 cycles
strlen32s : 2890 cycles
strlen64LingoB : 2904 cycles
_strlen (Agner Fog): 4253 cycles
-- test 1k return values Lingo, jj, Agner: 1024, 1024, 1024
crt_strlen : 977 cycles
strlen32s : 199 cycles
strlen64LingoB : 193 cycles
_strlen (Agner Fog): 272 cycles
-- test 0 return values Lingo, jj, Agner: 95, 95, 95
crt_strlen : 101 cycles
strlen32s : 29 cycles
strlen64LingoB : 28 cycles
_strlen (Agner Fog): 30 cycles
-- test 1 return values Lingo, jj, Agner: 95, 95, 95
crt_strlen : 112 cycles
strlen32s : 40 cycles
strlen64LingoB : 33 cycles
_strlen (Agner Fog): 34 cycles
-- test 3 return values Lingo, jj, Agner: 15, 15, 15
crt_strlen : 25 cycles
strlen32s : 5 cycles
strlen64LingoB : 6 cycles
_strlen (Agner Fog): 14 cycles
-- test 15 return values Lingo, jj, Agner: 15, 15, 15
crt_strlen : 24 cycles
strlen32s : 5 cycles
strlen64LingoB : 6 cycles
_strlen (Agner Fog): 14 cycles
The new version includes also a correctness test for all algos. My new favourite is strlen32s: For long strings, it is 14 cycles faster than No. 2, strlen64LingoB, while for very short strings it is a whopping 16% faster than the latter. Lingo, you have a challenge!
[attachment deleted by admin]
Hi jj2007:
Even More Results from herge.
Intel(R) Core(TM)2 Duo CPU E4600 @ 2.40GHz (SSE4)
codesizes: strlen32s=89, strlen64B=87, _strlen=66
-- test 16k return values Lingo, jj, Agner: 16384, 16384, 16384
crt_strlen : 9628 cycles
strlen32s : 1489 cycles
strlen64LingoB : 1185 cycles
_strlen (Agner Fog): 2854 cycles
-- test 1k return values Lingo, jj, Agner: 1024, 1024, 1024
crt_strlen : 649 cycles
strlen32s : 101 cycles
strlen64LingoB : 99 cycles
_strlen (Agner Fog): 193 cycles
-- test 0 return values Lingo, jj, Agner: 95, 95, 95
crt_strlen : 64 cycles
strlen32s : 15 cycles
strlen64LingoB : 14 cycles
_strlen (Agner Fog): 19 cycles
-- test 1 return values Lingo, jj, Agner: 95, 95, 95
crt_strlen : 91 cycles
strlen32s : 31 cycles
strlen64LingoB : 25 cycles
_strlen (Agner Fog): 20 cycles
-- test 3 return values Lingo, jj, Agner: 15, 15, 15
crt_strlen : 17 cycles
strlen32s : 3 cycles
strlen64LingoB : 3 cycles
_strlen (Agner Fog): 7 cycles
-- test 15 return values Lingo, jj, Agner: 15, 15, 15
crt_strlen : 15 cycles
strlen32s : 2 cycles
strlen64LingoB : 3 cycles
_strlen (Agner Fog): 7 cycles
Press any key to exit...
Regards herge
?
mov eax, ecx ; copy pointer why ?
and eax, -16 ; align pointer by 16
pxor xmm1, xmm1 ; zero for comparison why ?
you don't need the following lines anymore... whith movups the possible 0 before can't exist...
and ecx, 15 ; lower 4 bits indicate misalignment
je @F ; jumping is a few cycles faster
pcmpeqb xmm0, [eax] ; force FF's into false positives (the SSE2 equivalent to Agner's shr/shl trick)
you just need to modify the end of the algo to obtain the correct result...
EDIT :
Quote from: jj2007 on March 12, 2009, 12:49:21 AM
for very short strings it is a whopping 16% faster than the latter. Lingo, you have a challenge!
:bg, but i remember you there is a jump, so a (certainly) branch misprediction, and
QuoteThe cost of a branch misprediction ranges from 12 to more than 50 clock cycles, depending on the length of the pipeline and other details of the microarchitecture.
(taken fom agner fog's last optimizations pdf file). so 50 cycles... it could be 1000% slower... :bg
jj,
Let's see what you "have": :wink
1. strlen32 - it is 1st half of code from A.Fog end the rest from Lingo - just the name strlen32 is from you
Proof:
"Now I took the best of two worlds, i.e. Lingo's speed and Agner's brilliant alignment scheme, and threw them together. The result (shown as strlen32) is, ehm, how to put it: just about good enough for my own private library: "
2. strlens32s - it is your top of the ice cream... :lol
It is code without nothing from A.Fog and 100 % from Lingo's strlenLingoB code...What happen with "Agner's brilliant alignment scheme"? :lol
Of course the new name- strlens32s and the test program is from you again.
Proof:"My new favorite is strlen32s: bla,blah,bla..." and Lingo's code insight :lol
3. "Lingo, you have a challenge!"
Actually you don't have your own code or ideas to "compete" here and I am not interested to fight with myself ...so there is no challenge for me to try to continue...
against my own code and ideas.
Proof: I have new faster strlen algo based on the new Nehalem string instructions but it is other story and challenge.
Hence, don't hurry up and read and think about NightWare notes carefully because I don't want to publish it yet... :lol
Quote from: lingo on March 12, 2009, 03:44:43 AM
Proof:
"Now I took the best of two worlds, i.e. Lingo's speed and Agner's brilliant alignment scheme, and threw them together. The result (shown as strlen32) is, ehm, how to put it: just about good enough for my own private library: "
Lingo, you don't have to prove something that is openly stated. This code has evolved over time, and you, Nightware and myself, we have produced the two fastest algos ever, despite of certain trolls pretending that a fast
len algo is a waste of time (but argue endlessly elsewhere about bad practices wasting cycles and damaging registers etc.). We are here because assembler can produce lean and mean code, and because it's fun testing the limits. You are excellent in testing these limits, and therefore your name does appear twice in the 30 lines of my current favourite called strlen32s. And if I find the time today, Nightware's corrections will also be tested, and his name will be added somewhere. Take it easy :U
Quote from: NightWare on March 12, 2009, 02:37:16 AM
Quote from: jj2007 on March 12, 2009, 12:49:21 AM
for very short strings it is a whopping 16% faster than the latter. Lingo, you have a challenge!
:bg, but i remember you there is a jump, so a (certainly) branch misprediction, and QuoteThe cost of a branch misprediction ranges from 12 to more than 50 clock cycles, depending on the length of the pipeline and other details of the microarchitecture.
(taken fom agner fog's last optimizations pdf file). so 50 cycles... it could be 1000% slower... :bg
:bg Thanks for your hints, it's now shorter and a bit faster. But Lingo's algo is equally good. New testbed attached below.
align 16 ; jj2007, 12 March 2007, 85 bytes; 0.176 cycles/byte at 16k on Celeron M (0.3 on P4)
strlen32s proc src:DWORD ; with lots of inspiration from Lingo, NightWare and Agner Fog
mov eax, [esp+4] ; get pointer to string
movups xmm1, [eax] ; move 16 bytes into xmm1, unaligned (adapted from Lingo/NightWare)
pxor xmm0, xmm0 ; zero for comparison (no longer needed for xmm1 - thanks, NightWare)
pcmpeqb xmm1, xmm0 ; set bytes in xmm1 to FF if nullbytes found in xmm1
pmovmskb eax, xmm1 ; set byte mask in eax
bsf eax, eax ; bit scan forward
jne Lt16 ; less than 16 bytes, we can return the index in eax
@@: push ecx ; all registers preserved, except eax = return value
push edx ; eax will be pointer to initial string, 16-byte aligned
mov ecx, [esp+12] ; get pointer to string
and ecx, -16 ; align initial pointer to 16-byte boundary
lea eax, [ecx+16] ; aligned pointer + 16 (first 0..15 dealt with by movups above)
@@: pcmpeqb xmm0, [eax] ; ---- inner loop inspired by Lingo, with adaptions -----
pcmpeqb xmm1, [eax+16] ; compare packed bytes in [m128] and xmm1 for equality
por xmm1, xmm0 ; or them: one of the mem locations may contain a nullbyte
lea eax, [eax+32] ; len counter (moving up lea or add costs 3 cycles for the 191 byte string)
pmovmskb edx, xmm1 ; set byte mask in edx
test edx, edx
jz @B
pmovmskb ecx, xmm0 ; set byte mask in ecx (has to be repeated, sorry)
shl edx, 16 ; create space for the ecx bytes
or edx, ecx ; combine xmm0 and xmm1 results
bsf edx, edx ; bit scan for the index
lea eax, [eax+edx-32] ; add scan index, subtract initial bytes
pop edx
sub eax, [esp+8]
pop ecx
Lt16: ret 4
strlen32s endp
Timings:
Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)
ERROR in strlen64A at ebx=11: 16 bytes instead of 11
codesizes: strlen32s=77, strlen64A=120, strlen64B=87, _strlen=66
-- test 16k, misaligned 0, 16384 bytes
strlen32s : 4634 cycles
strlen64LingoB : 4978 cycles
_strlen (Agner Fog): 10152 cycles
-- test 4k, misaligned 11, 4096 bytes
crt_strlen : 3955 cycles
strlen32s : 1130 cycles
strlen64LingoB : 1126 cycles
_strlen (Agner Fog): 2235 cycles
-- test 1k, misaligned 0, 1024 bytes
strlen32s : 345 cycles
strlen64LingoB : 349 cycles
_strlen (Agner Fog): 636 cycles
-- test 0, misaligned 0, 95 bytes
crt_strlen : 231 cycles
strlen32s : 80 cycles
strlen64LingoB : 80 cycles
_strlen (Agner Fog): 100 cycles
-- test 1, misaligned 1, 95 bytes
crt_strlen : 203 cycles
strlen32s : 91 cycles
strlen64LingoB : 58 cycles
_strlen (Agner Fog): 64 cycles
-- test 3, misaligned 3, 15 bytes
crt_strlen : 35 cycles
strlen32s : 11 cycles
strlen64LingoB : 13 cycles
_strlen (Agner Fog): 23 cycles
-- test 15, misaligned 15, 15 bytes
crt_strlen : 32 cycles
strlen32s : 12 cycles
strlen64LingoB : 14 cycles
_strlen (Agner Fog): 23 cycles
EDIT: Attached new version with minor modifications.
[attachment deleted by admin]
Hi JJ2007:
The latest results from herge.
Intel(R) Core(TM)2 Duo CPU E4600 @ 2.40GHz (SSE4)
ERROR in strlen64A at ebx=11: 16 bytes instead of 11
codesizes: strlen32s=77, strlen64A=120, strlen64B=87, _strlen=66
-- test 16k, misaligned 0, 16384 bytes
strlen32s : 1457 cycles
strlen64LingoB : 1260 cycles
_strlen (Agner Fog): 2797 cycles
-- test 4k, misaligned 11, 4096 bytes
crt_strlen : 2401 cycles
strlen32s : 387 cycles
strlen64LingoB : 340 cycles
_strlen (Agner Fog): 731 cycles
-- test 1k, misaligned 0, 1024 bytes
strlen32s : 97 cycles
strlen64LingoB : 95 cycles
_strlen (Agner Fog): 178 cycles
-- test 0, misaligned 0, 95 bytes
crt_strlen : 60 cycles
strlen32s : 20 cycles
strlen64LingoB : 14 cycles
_strlen (Agner Fog): 18 cycles
-- test 1, misaligned 1, 95 bytes
crt_strlen : 63 cycles
strlen32s : 32 cycles
strlen64LingoB : 25 cycles
_strlen (Agner Fog): 20 cycles
-- test 3, misaligned 3, 15 bytes
crt_strlen : 15 cycles
strlen32s : 4 cycles
strlen64LingoB : 3 cycles
_strlen (Agner Fog): 7 cycles
-- test 15, misaligned 15, 15 bytes
crt_strlen : 15 cycles
strlen32s : 4 cycles
strlen64LingoB : 3 cycles
_strlen (Agner Fog): 7 cycles
Press any key to exit...
Regards herge
Here's my compulsatory submission for the latest evolution.
AMD Athlon(tm) 64 X2 Dual Core Processor 4000+ (SSE3)
ERROR in strlen64A at ebx=11: 16 bytes instead of 11
codesizes: strlen32s=85, strlen64A=120, strlen64B=87, _strlen=66
-- test 16k, misaligned 0, 16384 bytes
crt_strlen : 12338 cycles
strlen32s : 3135 cycles
strlen64LingoB : 3120 cycles
_strlen (Agner Fog): 13916 cycles
-- test 4k, misaligned 11, 4096 bytes
crt_strlen : 3229 cycles
strlen32s : 828 cycles
strlen64LingoB : 814 cycles
_strlen (Agner Fog): 3496 cycles
-- test 1k, misaligned 15, 1024 bytes
crt_strlen : 826 cycles
strlen32s : 252 cycles
strlen64LingoB : 237 cycles
_strlen (Agner Fog): 900 cycles
-- test 0, misaligned 0, 95 bytes
crt_strlen : 93 cycles
strlen32s : 57 cycles
strlen64LingoB : 40 cycles
_strlen (Agner Fog): 122 cycles
-- test 1, misaligned 1, 95 bytes
crt_strlen : 102 cycles
strlen32s : 59 cycles
strlen64LingoB : 43 cycles
_strlen (Agner Fog): 101 cycles
-- test 3, misaligned 3, 15 bytes
crt_strlen : 20 cycles
strlen32s : 20 cycles
strlen64LingoB : 20 cycles
_strlen (Agner Fog): 34 cycles
-- test 15, misaligned 15, 15 bytes
crt_strlen : 20 cycles
strlen32s : 20 cycles
strlen64LingoB : 20 cycles
_strlen (Agner Fog): 34 cycles
Ya know, tools such as these should also show the OS version and bit-width. It could be assumed erroniously that this box is running 64-bit XP when in fact it is running 32-bit XP. (Wasteful, perhaps, but I cannot afford to upgrade in the foreseeable future.)
Quote from: Mark Jones on March 12, 2009, 03:39:16 PM
Here's my compulsatory submission for the latest evolution.
AMD Athlon(tm) 64 X2 Dual Core Processor 4000+ (SSE3)
-- test 0, misaligned 0, 95 bytes
crt_strlen : 93 cycles
strlen32s : 57 cycles
strlen64LingoB : 40 cycles
_strlen (Agner Fog): 122 cycles
:bg Thanxalot. It seems Lingo has a little edge here. Interesting that Agner's algo gets beaten by crt_strlen, though.
Quote
Ya know, tools such as these should also show the OS version and bit-width. It could be assumed erroniously that this box is running 64-bit XP when in fact it is running 32-bit XP. (Wasteful, perhaps, but I cannot afford to upgrade in the foreseeable future.)
Good idea in principle, but showing the OS with GetVersionEx is so hilariously clumsy that I get an allergy when I even think of it :red
"It seems Lingo has a little edge here."
madjj is never tired to convert my code in lame code... :lol
but, 'Lingo has a BIG edge here' :lol....
Intel(R) Core(TM)2 Duo CPU E8500 @ 3.16GHz (SSE4)
100000000 bytes allocated
codesizes: strlen32sLAME=85, strlen64A=120, strlen64B=84, _strlen=66
-- test 16k, misaligned 0, 16384 bytes
strlen32sLAME 1575 cycles
strlen64LingoB 1532 cycles
_strlen (Agner Fog) 2761 cycles
-- test 4k, misaligned 11, 4096 bytes
strlen32sLAME 427 cycles
strlen64LingoB 404 cycles
_strlen (Agner Fog) 708 cycles
-- test 1k, misaligned 15, 1024 bytes
strlen32sLAME 100 cycles
strlen64LingoB 78 cycles
_strlen (Agner Fog) 193 cycles
-- test 0, misaligned 0, 95 bytes
Masm32 lib szLen 99 cycles
crt strlen 75 cycles
strlen32sLAME 19 cycles
strlen64LingoB 10 cycles
_strlen (Agner Fog) 19 cycles
-- test 1, misaligned 1, 95 bytes
Masm32 lib szLen 99 cycles
crt strlen 79 cycles
strlen32sLAME 19 cycles
strlen64LingoB 10 cycles
_strlen (Agner Fog) 20 cycles
-- test 3, misaligned 3, 15 bytes
Masm32 lib szLen 19 cycles
crt strlen 17 cycles
strlen32sLAME 3 cycles
strlen64LingoB 1 cycles
_strlen (Agner Fog) 7 cycles
-- test 15, misaligned 15, 15 bytes
Masm32 lib szLen 19 cycles
crt strlen 19 cycles
strlen32sLAME 20 cycles
strlen64LingoB 17 cycles
_strlen (Agner Fog) 7 cycles
Press any key to exit...
[attachment deleted by admin]
Intel(R) Core(TM)2 Quad CPU Q6600 @ 2.40GHz (SSE4)
100000000 bytes allocated
codesizes: strlen32sLAME=85, strlen64A=120, strlen64B=84, _strlen=66
-- test 16k, misaligned 0, 16384 bytes
strlen32sLAME 1454 cycles
strlen64LingoB 1183 cycles
_strlen (Agner Fog) 2759 cycles
-- test 4k, misaligned 11, 4096 bytes
strlen32sLAME 393 cycles
strlen64LingoB 330 cycles
_strlen (Agner Fog) 707 cycles
-- test 1k, misaligned 15, 1024 bytes
strlen32sLAME 101 cycles
strlen64LingoB 78 cycles
_strlen (Agner Fog) 193 cycles
-- test 0, misaligned 0, 95 bytes
Masm32 lib szLen 99 cycles
crt strlen 75 cycles
strlen32sLAME 19 cycles
strlen64LingoB 13 cycles
_strlen (Agner Fog) 19 cycles
-- test 1, misaligned 1, 95 bytes
Masm32 lib szLen 99 cycles
crt strlen 80 cycles
strlen32sLAME 19 cycles
strlen64LingoB 11 cycles
_strlen (Agner Fog) 21 cycles
-- test 3, misaligned 3, 15 bytes
Masm32 lib szLen 19 cycles
crt strlen 19 cycles
strlen32sLAME 3 cycles
strlen64LingoB 1 cycles
_strlen (Agner Fog) 7 cycles
-- test 15, misaligned 15, 15 bytes
Masm32 lib szLen 19 cycles
crt strlen 19 cycles
strlen32sLAME 21 cycles
strlen64LingoB 18 cycles
_strlen (Agner Fog) 7 cycles
Press any key to exit...
This is getting ridiculous. "1 cycles" - now we need logic to print "1 cycle" :P
Since we are on the cutting-edge here, can we have some native 64-bit code for 64-bit windows please? I guess we'll need 'timers64.inc' or something...
horse>die>flog :bg
edit: what horrible code in the .asm - most doesn't even get used, what a nightmare to follow...
Quote; this is a comment with trailing blanks
is that zen? :bdg
"can we have some native 64-bit code for 64-bit windows please?"
Thanks, it works for me fine...: :wink
Usage:
lea rax, szBuffer
call strlen64
align 16
db 8Dh,0A4h,24h,0,0,0,0,8Dh,48h,0,10h
strlen64:
pop rcx
movdqu xmm2, [rax]
pxor xmm0, xmm0
pcmpeqb xmm2, xmm0
pxor xmm1, xmm1
pmovmskb edx, xmm2
test edx, edx
jz @f
bsf eax, edx
jmp rcx
@@:
lea rcx, [rax+16]
and rax, -16
@@:
pcmpeqb xmm0, [rax+16]
pcmpeqb xmm1, [rax+32]
por xmm1, xmm0
add rax, 32
pmovmskb edx, xmm1
test edx, edx
jz @b
shl edx, 16
sub rax, rcx
pmovmskb ecx, xmm0
or edx, ecx
mov rcx, [rsp-8]
bsf edx, edx
add rax, rdx
jmp rcx
Quote from: sinsi on March 13, 2009, 06:34:06 AM
edit: what horrible code in the .asm - most doesn't even get used, what a nightmare to follow...
You are perfectly right. I took over some "organically grown" code, and certainly have not added to its readability. But the purpose was not beauty but rather to find those 88 bytes or so that would be considered good enough to replace
len() for the next ten years. I know it is a bad habit to reopen old threads, but this one was unfinished business - now we have two algos that do the job. Lingo's is an edge faster, but I hate paying royalties to people who call me madjj :toothy
Quote; this is a comment with trailing blanks
is that zen? :bdg
Well, kind of. It is a leftover of my attempt to teach RichMasm to autoformat code that comes along with spaces and varying tab sizes. Example:
strlen64:
pop rcx
movdqu xmm2, [rax]
pxor xmm0, xmm0
pcmpeqb xmm2, xmm0
pxor xmm1, xmm1
pmovmskb edx, xmm2
test edx, edx
jz @f
bsf eax, edx
jmp rcx
This is actually nicely formatted, in comparison to many other snippets I have seen, but it forces the eye to jump a lot from left to right. Here is the autoformatted version:
strlen64:
pop rcx
movdqu xmm2, [rax]
pxor xmm0, xmm0
pcmpeqb xmm2, xmm0
pxor xmm1, xmm1
pmovmskb edx, xmm2
test edx, edx
jz @f
bsf eax, edx
jmp rcx
The first one looks more beautiful, the second one is less tiresome for the eyes. A matter of taste, I guess.
jj, you are a mad bastard mate :bg
Switch to using ml64, no worries about cpu types - don't they all support sse3 at least?
Opening old threads? I have no problem with that if it's relevant (I do it with threads I've started - saves remembering) and saves getting 6 million search results...
lingo, the trouble is building a native pe64 and getting some timings from it. If MichaelW doesn't mind, maybe someone can change the timers.asm...when I'm sober I might try (oops then it will never happen).
Quote from: lingo on March 13, 2009, 06:01:51 AM
"It seems Lingo has a little edge here."
madjj is never tired to convert my code in lame code... :lol
The only "conversion" I have ever made to your code is to pass the source over the stack, to make the benchmarks comparable to the others.
Quote
but, 'Lingo has a BIG edge here' :lol....
Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)
-- test 16k, misaligned 0, 16384 bytes
strlen32sLAME 4797 cycles <**********
strlen64LingoB 5045 cycles <**********
_strlen (Agner Fog) 9599 cycles
...
-- test 3, misaligned 3, 15 bytes
Masm32 lib szLen 47 cycles
crt strlen 35 cycles
strlen32sLAME 11 cycles <**********
strlen64LingoB 15 cycles <**********
_strlen (Agner Fog) 23 cycles
Ever heard about hardware differences?
:lol
I fix a lot of computers, and most of them are as the following:
AMD Athlon(tm) XP 2600+ (SSE1)
100000000 bytes allocated
ERROR in StrSizeA at ebx=4096: 4101 bytes instead of 4096
ERROR in strlen32c at ebx=4096: 4101 bytes instead of 4096
ERROR in strlen32sLAME at ebx=4096: 4101 bytes instead of 4096
ERROR in strlen64B at ebx=4096: 4101 bytes instead of 4096
codesizes: strlen32sLAME=85, strlen64A=120, strlen64B=84, _strlen=66
-- test 16k, misaligned 0, 16384 bytes
strlen32sLAME 22 cycles
strlen64LingoB 2758 cycles
_strlen (Agner Fog) 22683 cycles
-- test 4k, misaligned 11, 4096 bytes
strlen32sLAME 26 cycles
strlen64LingoB 729 cycles
_strlen (Agner Fog) 5713 cycles
-- test 1k, misaligned 15, 1024 bytes
strlen32sLAME 240 cycles
strlen64LingoB 220 cycles
_strlen (Agner Fog) 1476 cycles
-- test 0, misaligned 0, 95 bytes
Masm32 lib szLen 158 cycles
crt strlen 107 cycles
strlen32sLAME 68 cycles
strlen64LingoB 40 cycles
_strlen (Agner Fog) 192 cycles
-- test 1, misaligned 1, 95 bytes
Masm32 lib szLen 158 cycles
crt strlen 114 cycles
strlen32sLAME 71 cycles
strlen64LingoB 39 cycles
_strlen (Agner Fog) 161 cycles
-- test 3, misaligned 3, 15 bytes
Masm32 lib szLen 26 cycles
crt strlen 25 cycles
strlen32sLAME 60 cycles
strlen64LingoB 32 cycles
_strlen (Agner Fog) 50 cycles
-- test 15, misaligned 15, 15 bytes
Masm32 lib szLen 26 cycles
crt strlen 25 cycles
strlen32sLAME 26 cycles
strlen64LingoB 33 cycles
_strlen (Agner Fog) 73 cycles
Press any key to exit...
I realise that we're looking at sse2+ for ourselves, but most of them I fix are like this (p4's even without hypershite, athlons/durons).
Curious, why do I not get c0000097 (invalid opcode) running on the athlon?
Quote from: sinsi on March 13, 2009, 09:33:48 AM
I fix a lot of computers, and most of them are as the following:
AMD Athlon(tm) XP 2600+ (SSE1)
I realise that we're looking at sse2+ for ourselves, but most of them I fix are like this (p4's even without hypershite, athlons/durons).
Curious, why do I not get c0000097 (invalid opcode) running on the athlon?
Good question indeed. A library version should include a check for the SSE version.
For the madmen, here a new version. It includes now the pretty competitive algo posted by Nightware, 10.03.2009 (http://www.masm32.com/board/index.php?topic=1807.msg81075#msg81075), and Lingo's latest strlen64B with that cute pop ecx, jmp ecx trick. What a pity that it is a bit lame on an ordinary P4 :bg
EDIT: See below for current version.
[attachment deleted by admin]
Hi jj2007:
The latest results from herge.
Intel(R) Core(TM)2 Duo CPU E4600 @ 2.40GHz (SSE4)
codesizes: strlen32s=85, strlen64A=120, strlen64B=84, _strlen=66
-- test 16k, misaligned 0, 16384 bytes
strlen32s 1516 cycles
strlen64LingoB 1221 cycles
NWStrLen 1193 cycles
_strlen (Agner Fog) 2804 cycles
-- test 4k, misaligned 11, 4096 bytes
strlen32s 424 cycles
strlen64LingoB 322 cycles
NWStrLen 333 cycles
_strlen (Agner Fog) 722 cycles
-- test 1k, misaligned 15, 1024 bytes
strlen32s 122 cycles
strlen64LingoB 79 cycles
NWStrLen 102 cycles
_strlen (Agner Fog) 196 cycles
-- test 0, misaligned 0, 95 bytes
Masm32 lib szLen 101 cycles
crt strlen 62 cycles
strlen32s 43 cycles
strlen64LingoB 11 cycles
NWStrLen 15 cycles
_strlen (Agner Fog) 19 cycles
-- test 1, misaligned 1, 95 bytes
Masm32 lib szLen 99 cycles
crt strlen 100 cycles
strlen32s 19 cycles
strlen64LingoB 11 cycles
NWStrLen 21 cycles
_strlen (Agner Fog) 20 cycles
-- test 7, misaligned 7, 15 bytes
Masm32 lib szLen 19 cycles
crt strlen 15 cycles
strlen32s 21 cycles
strlen64LingoB 18 cycles
NWStrLen 25 cycles
_strlen (Agner Fog) 6 cycles
-- test 15, misaligned 15, 15 bytes
Masm32 lib szLen 20 cycles
crt strlen 15 cycles
strlen32s 2 cycles
strlen64LingoB 2 cycles
NWStrLen 9 cycles
_strlen (Agner Fog) 7 cycles
Press any key to exit...
Regards herge
Sorry jj but this .asm is getting way too complex and hard to follow. How about using .inc's for each algo, then we can get rid of the old, slow ones.
Cmon man, there are 16(?) procs in there and we are testing 4 of them...a 64k source file is too much. My brain gets lost in the labyrinth :bdg
Quote from: sinsi on March 13, 2009, 10:10:47 AM
Sorry jj but this .asm is getting way too complex and hard to follow. How about using .inc's for each algo, then we can get rid of the old, slow ones.
Cmon man, there are 16(?) procs in there and we are testing 4 of them...a 64k source file is too much. My brain gets lost in the labyrinth :bdg
OK, I put an effort into simplifying it. 27k instead of 64...
For the legal department: My latest version contains ideas ruthlessly stolen from Lingo:
strlen32s proc src:DWORD ; with lots of inspiration from Lingo, NightWare and Agner Fog
pop eax ; trash the return address to pop...
pop eax ; ...the src pointer
...
jmp dword ptr [esp-8] ; Lingo style equivalent to ret 4 ;-)
strlen32s endp
Although I don't quite understand why he wastes opcodes:
mov ecx, [esp-8] < not needed
bsf edx, edx
add eax, edx
jmp [esp-8] ; ecx < not needed
strlen64B endp
EDIT: Replaced attachment, with very minor modifications (2 bytes less for strlen32s :bg).
EDIT(2): New version - 80 bytes, now as fast as Lingo's algo even on modern hardware... :green
[attachment deleted by admin]
Quote"It seems Lingo has a little edge here."
madjj is never tired to convert my code in lame code...
The only "conversion" I have ever made to your code is to pass the source over the stack, to make the benchmarks comparable to the others.
Really?
Do you want to read together?..."to make it comparable to the others" :lol
Lingo's code:
align 16
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ; align 15
strlen64xB proc szBuffer : dword ; old 85 bytes version, before [url=http://www.masm32.com/board/index.php?topic=1807.msg81266#msg81266]this post[/url]; commented by jj
;db 0cch
mov eax, [esp+4] ; pointer to src
movdqu xmm2, [eax] ; move 16 bytes into xmm2
pxor xmm0, xmm0 ; zero for comparison
pcmpeqb xmm2, xmm0 ; set bytes in xmm2 to FF if nullbytes found in xmm2
pxor xmm1, xmm1 ; zero for comparison
pmovmskb edx, xmm2 ; set byte mask in edx
lea ecx, [eax+16] ; ecx=eax+16
test edx, edx
jnz Ex_0
and eax, -16 ; align eax to para
@@:
add eax, 32
pcmpeqb xmm0, [eax-16]
pcmpeqb xmm1, [eax]
por xmm1, xmm0
pmovmskb edx, xmm1
test edx, edx
jz @B
shl edx, 16
sub eax, ecx
pmovmskb ecx, xmm0
;sub eax, [esp+4]
or edx, ecx
;sub eax, 16
bsf edx, edx
add eax, edx
ret 4
align 16
Ex_0:
bsf eax, edx
ret 4
strlen64xB endp
and LAME code:
align 16 ; jj2007, 12 March 2007, 85 bytes; 0.176 cycles/byte at 16k on Celeron M (0.3 on P4)
strlen32s proc src:DWORD ; with lots of inspiration from Lingo, NightWare and Agner Fog
mov eax, [esp+4] ; get pointer to string
movups xmm1, [eax] ; move 16 bytes into xmm1, unaligned (adapted from Lingo/NightWare)
pxor xmm0, xmm0 ; zero for comparison (no longer needed for xmm1 - thanks, NightWare)
pcmpeqb xmm1, xmm0 ; set bytes in xmm1 to FF if nullbytes found in xmm1
pmovmskb eax, xmm1 ; set byte mask in eax
bsf eax, eax ; bit scan forward
jne Lt16 ; less than 16 bytes, we can return the index in eax
@@:
push ecx ; all registers preserved, except eax = return value
push edx ; eax will be pointer to initial string, 16-byte aligned
mov ecx, [esp+12] ; get pointer to string
and ecx, -16 ; align initial pointer to 16-byte boundary
lea eax, [ecx+16] ; aligned pointer + 16 (first 0..15 dealt with by movups above)
@@: pcmpeqb xmm0, [eax] ; ---- inner loop inspired by Lingo, with adaptions -----
pcmpeqb xmm1, [eax+16] ; compare packed bytes in [m128] and xmm1 for equality
por xmm1, xmm0 ; or them: one of the mem locations may contain a nullbyte
lea eax, [eax+32] ; len counter (moving up lea or add costs 3 cycles for the 191 byte string)
pmovmskb edx, xmm1 ; set byte mask in edx
test edx, edx
jz @B
pmovmskb ecx, xmm0 ; set byte mask in ecx (has to be repeated, sorry)
shl edx, 16 ; create space for the ecx bytes
or edx, ecx ; combine xmm0 and xmm1 results
bsf edx, edx ; bit scan for the index
lea eax, [eax+edx-32] ; add scan index, subtract initial bytes
pop edx
sub eax, [esp+8]
pop ecx
Lt16: ret 4
strlen32s endp
" ; all registers preserved, except eax = return value"Why except eax ? You must preserve and eax too...to make the diagnosis pretty clear :lol
"Although I don't quite understand why he wastes opcodes:"I like your RichMasm editor and want to see more functionality in it..for instance: asm code highlighting,
so IMO will be better to spend your efforts to get there rather than to "understand why he wastes opcodes"... :lol
and later again...: :lol
Lingo's code:
align 16
db 8Dh,0A4h,24h,0,0,0,0,8Dh,48h,0,10h
strlen64B proc szBuffer : dword
pop ecx
pop eax
movdqu xmm2, [eax]
pxor xmm0, xmm0
pcmpeqb xmm2, xmm0
pxor xmm1, xmm1
pmovmskb edx, xmm2
test edx, edx
jz @f
bsf eax, edx
jmp ecx
@@:
lea ecx, [eax+16]
and eax, -16
@@:
pcmpeqb xmm0, [eax+16]
pcmpeqb xmm1, [eax+32]
por xmm1, xmm0
add eax, 32
pmovmskb edx, xmm1
test edx, edx
jz @B
shl edx, 16
sub eax, ecx
pmovmskb ecx, xmm0
or edx, ecx
mov ecx, [esp-8]
bsf edx, edx
add eax, edx
jmp ecx
strlen64B endp
and LAME code:
align 16 ; jj2007, 13 March 2007, 82 bytes; 0.176 cycles/byte at 16k on Celeron M (0.3 on P4)
strlen32s proc src:DWORD ; with lots of inspiration from Lingo, NightWare and Agner Fog
pop eax ; trash the return address to pop...
pop eax ; ...the src pointer
movups xmm1, [eax] ; move 16 bytes into xmm1, unaligned (adapted from Lingo/NightWare)
pxor xmm0, xmm0 ; zero for comparison (no longer needed for xmm1 - thanks, NightWare)
pcmpeqb xmm1, xmm0 ; set bytes in xmm1 to FF if nullbytes found in xmm1
pmovmskb eax, xmm1 ; set byte mask in eax
bsf eax, eax ; bit scan forward
jne Lt16 ; less than 16 bytes, we are done
mov edx, [esp-4] ; get pointer to string
and edx, -16 ; align initial pointer to 16-byte boundary
lea eax, [edx+16] ; aligned pointer + 16 (first 0..15 dealt with by movups above)
@@: pcmpeqb xmm0, [eax] ; ---- inner loop inspired by Lingo, with adaptions -----
pcmpeqb xmm1, [eax+16] ; compare packed bytes in [m128] and xmm1 for equality
por xmm1, xmm0 ; or them: one of the mem locations may contain a nullbyte
lea eax, [eax+32] ; len counter (moving up lea or add costs 3 cycles for the 191 byte string)
pmovmskb edx, xmm1 ; set byte mask in edx
test edx, edx
jz @B
sub eax, [esp-4] ; subtract original src pointer
push ecx ; all registers preserved, except edx and eax = return value
shl edx, 16 ; create space for the ecx bytes
pmovmskb ecx, xmm0 ; set byte mask in ecx (has to be repeated, sorry)
or edx, ecx ; combine xmm0 and xmm1 results
bsf edx, edx ; bit scan for the index
pop ecx
lea eax, [eax+edx-32] ; add scan index
Lt16:
jmp dword ptr [esp-8] ; Lingo style equivalent to ret 4 ;-)
strlen32s endp
"We don't live in a perfect world so we cannot garrantee that the forum will always be "idiot free"
but we do our best to keep it friendly..." by Hutch :lol
Quote from: lingo on March 13, 2009, 02:07:30 PM
Quote
madjj is never tired to convert my code in lame code...
The only "conversion" I have ever made to your code is to pass the source over the stack, to make the benchmarks comparable to the others.
Really?
Do you want to read together?..."to make it comparable to the others"
Lingo, I had understood that you had accused me of modifying your algos to make them slower. Misunderstanding, sorry. I have never denied that I stole ideas from you. Where else should I get good ideas? :bg
(although some of your code looks suspiciously similar to what NightWare posted a long time ago ::))
Quote
" ; all registers preserved, except eax = return value"
Why except eax ? You must preserve and eax too...to make the diagnosis pretty clear :lol
Finally a glimpse of humour, voilà! NightWare preserves ecx and edx, too. But I dropped support for "safe edx". IMHO, ecx should be preserved because it is so often used as a counter.
Quote
"Although I don't quite understand why he wastes opcodes:"
I like your RichMasm editor and want to see more functionality in it..for instance: asm code highlighting,
so IMO will be better to spend your efforts to get there rather than to "understand why he wastes opcodes"... :lol
Automatic highlighting is a matter of taste. I prefer to see (mostly) plain text, so that I can highlight myself the few areas where I still have a problem to solve. And then, it seems as if there is a conflict between automatic highlighting and making full use of the RichEdit features, such as hyperlinks ("before this post (http://www.masm32.com/board/index.php?topic=1807.msg81266#msg81266); ", see below)
As to the wasted opcodes, check yourself (even with Notepad :wink); two bytes less, and one or two cycles faster.
(http://www.webalice.it/jj2006/pics/strlen64.png)
Okay, Lingo,
A little help here. Who is the better coder, Lingo or Lingo?
Paul
"NightWare preserves ecx and edx, too..."
but Hutch doesn't... :lol
"And then, it seems as if there is a conflict between automatic highlighting and making full use of the RichEdit features, such as hyperlinks .."
It seems... you prefer to steal other's ideas and algos (for example: from lesson 35, Iczelion) rather than to use your own automatic highlighting algo for .RTF files to resolve the problems. :wink
"and one or two cycles faster."
Read A.Fog: Which one is faster - jump to register or jump to memory
For me: mov ecx, [esp-8] ; this instruction is for free!!! :lol
.......
jmp ecx
is faster than
jmp dword ptr [esp-8]
If you disagree just ask herge or sinsi to make tests for you (due to archaic type of your CPUs). :lol
Theirs CPUs are OK.
Quote from: lingo on March 13, 2009, 04:03:00 PM
It seems... you prefer to steal other's ideas and algos (for example: from lesson 35, Iczelion) rather than to use your own automatic highlighting algo for .RTF files to resolve the problems. :wink
Tut 35 has 1265 lines, my RichMasm source has over 9500. No need to steal. Besides, I also wrote already that I don't like Xmas trees. I am beyond that age :bg
Quote
"and one or two cycles faster."
Read A.Fog: Which one is faster - jump to register or jump to memory
For me: mov ecx, [esp-8] ; this instruction is for free!!! :lol
.......
jmp ecx
is faster than
jmp dword ptr [esp-8]
If you disagree just ask herge or sinsi to make tests for you (due to archaic type of your CPUs). :lol
Theirs CPUs are OK.
I am a fan of Agner, but I am an even greater fan of MichaelW's timer.asm :U
47 cycles jmp directly
48 cycles mov+jmp
47 cycles jmp directly
48 cycles mov+jmp
47 cycles jmp directly
48 cycles mov+jmp
Divide by ten. And before you shoot from the hip again: I never said that jumping directly is
much faster.
EDIT: Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3): 40 for both of them, i.e. 4.00 cycles
hmm... someone here said once "reading asm related posts, is better than smoking marijuna" or something like that... seriously he was under the truth...
just few things, before it degenerate more :
concerning the algos, even if 1000+ instructions in x86, the simd instructions to compare byte are not numberous, so using it is quite logical. since we generally use the same programming schemes, it seams quite logical to obtain a similar conception/algo/result... nothing "strange".
plus, if we post an algo we give to others the possibility to improve it... we implicitly encourage the "copy/use" of the original algo as basement... (it's the purpose of the laboratory, no ?).
to finish, like said by someone else, the benefit of this sort of algo is quite limited... nothing serious to fight for...
Quote from: NightWare on March 14, 2009, 04:12:12 AM
concerning the algos, even if 1000+ instructions in x86, the simd instructions to compare byte are not numberous, so using it is quite logical. since we generally use the same programming schemes, it seams quite logical to obtain a similar conception/algo/result... nothing "strange".
Indeed, I was not hinting at any copyleft issues :wink - it was merely an observation that apparently we (you, Lingo, myself) have pushed the CPU to its limits; so our algos must look almost identical. Tonight, I managed to squeeze out a few cycles by moving a line up or down, and then had the bright idea to unroll the inner loop, but nope, not a single cycle less,
this is the limit. What counts in the end is a factor 5 improvement on szLen and crt_strlen, and a factor 10 on lstrlenA. For my part, this thread can be closed peacefully.
Hi jj2007:
We can't close yet we havn't got to 20 yet?
We at 18 we can do it!
Regards herge
"so using it is quite logical. since we generally use the same programming schemes, it seams quite logical to obtain a similar conception/algo/result... nothing "strange".
plus, if we post an algo we give to others the possibility to improve it... we implicitly encourage the "copy/use" of the original algo as basement... (it's the purpose of the laboratory, no ?)."
I implicitly encourage everyone to improve it too, but not to make it bad to worse.. :wink
In this case as a free human being I have a human right to tell my opinion too... :lol
What about criteria who is right or wrong?
The results!
But we have different results on different CPUs
as a jj respectfully stated "Ever heard about hardware differences?" :wink
Who makes code optimization for archaic CPUs? IMO sick people... :lol
Who preserves ecx and edx registers in "this sort of algo"? IMO lame people...
I can continue with who and IMO... :wink
"the benefit of this sort of algo is quite limited ..."
A lot of people have similar opinion but fortunately some people from Intel
created new faster instructions exactly for "this sort of algo"...
The speed is never enough.
"nothing serious to fight for..."
As an engineer I believe in numbers rather than in emotions and empty words as a serious,
unserious, etc...
For those who have followed this thread, here finally a "library package". All you really need is to extract the file slenSSE2.inc to \masm32\include\slenSSE2.inc
Here is the most basic usage example:
include \masm32\include\masm32rt.inc
include \masm32\include\slenSSE2.inc
.code
ShortString db "My short string", 0
start:
print offset ShortString, " has "
print str$(len(offset ShortString)), " bytes"
print chr$(13, 10, 10, "-- hit any key --")
getkey
exit
end start
If you use the len macro in your code, then the only difference to ordinary Masm32 code is line 2, i.e. you can make entire projects a bit faster just by adding this line.
By default, my own strlen32s algo will be used for len. Lingo's and Nightware's algos can be forced by adding...
SlenUseAlgos = 2 ; Lingo
SlenUseAlgos = 4 ; NightWare
...before the include (see strlenSSE2.asm for more detail, and benchmarks comparing all three).
These two are equally fast; however, only the default algo (strlen32s) has a check if the CPU allows SSE2 code. If that check fails, len will revert to crt_strlen - slow but still a factor 2 faster than the standard Masm32lib szLen.
Cheers, jj
EDIT: I removed the attachment in favour of the new version posted on page 19 (http://www.masm32.com/board/index.php?topic=1807.msg81400#msg81400). See remarks on preserving edx.
Nice work, bit artisans :bg
Hi jj2207:
Eh "Houston we Have Liftoff!".
Great work jj2007.
I almost used the wrong assembler, you have to
use the assembler that comes with VC2005 Express.
Regards herge
Quote from: herge on March 14, 2009, 11:20:47 PM
Hi jj2207:
Eh "Houston we Have Liftoff!".
Great work jj2007.
I almost used the wrong assembler, you have to
use the assembler that comes with VC2005 Express.
Regards herge
Thanxalot, herge. The credits go also to NightWare and Lingo, of course, whose algos can be activated easily as shown above.
@NightWare & Lingo: If you consider adding the CheckSSE2 to your algos, please let me know. The check costs only about one cycle (see below, bottom of tests: 5 instead of 4 cycles for the 15 byte string), and makes sure that code works fine on whatever archaic CPU the user runs :green
Re VC2005 Express: SSE2 code should also work with Masm 6.15, and it definitely works fine with JWasm (http://www.japheth.de/JWasm.html).
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
codesizes: strlen32s=124strlen64B=84NWStrLen=118, _strlen=66 bytes
-- test 16k, misaligned 0, 16434 bytes
strlen32s 2918 cycles
strlen64LingoB 2921 cycles
NWStrLen 2935 cycles
_strlen (Agner Fog) 4264 cycles
-- test 4k, misaligned 11, 4096 bytes
strlen32s 753 cycles
strlen64LingoB 740 cycles
NWStrLen 757 cycles
_strlen (Agner Fog) 1096 cycles
-- test 1k, misaligned 15, 1024 bytes
Masm32 lib szLen 1308 cycles
crt strlen 971 cycles
strlen32s 198 cycles
strlen64LingoB 192 cycles
NWStrLen 208 cycles
_strlen (Agner Fog) 272 cycles
-- test 0, misaligned 0, 100 bytes
Masm32 lib szLen 132 cycles
crt strlen 110 cycles
strlen32s 27 cycles
strlen64LingoB 25 cycles
NWStrLen 32 cycles
_strlen (Agner Fog) 34 cycles
-- test 1, misaligned 1, 100 bytes
Masm32 lib szLen 132 cycles
crt strlen 132 cycles
strlen32s 28 cycles
strlen64LingoB 25 cycles
NWStrLen 32 cycles
_strlen (Agner Fog) 34 cycles
-- test 5, misaligned 5, 15 bytes
Masm32 lib szLen 24 cycles
crt strlen 28 cycles
strlen32s 5 cycles
strlen64LingoB 4 cycles
NWStrLen 15 cycles
_strlen (Agner Fog) 14 cycles
-- test 15, misaligned 15, 15 bytes
Masm32 lib szLen 25 cycles
crt strlen 25 cycles
strlen32s 5 cycles
strlen64LingoB 4 cycles
NWStrLen 15 cycles
_strlen (Agner Fog) 14 cycles
Hello,
That's a very good work,in masm syntax.
just a few words about compiled it.
It need a masm32rt_586.inc ,the one in masm32 is .486.
Write " include slenSSE2.inc ;include it in your masm32\include directory " in the lensse2.asm,avoid to search it.
Compile it in a console application with at least ml 7.0
That's all.
Hi jj2007:
I seem to have problems debugging it in windbg.
The EXE works great from dos. But I don't think
windbg likes CPUID for some reason.
strslensse2!start+0x1ab [C:\Program Files\Microsoft Visual Studio 8\VC\bin\strslensse2.asm @ 235]:
00401330 33c0 xor eax,eax
00401332 0fa2 cpuid
00401334 0f31 rdtsc
00401336 52 push edx
00401337 50 push eax
00401338 c705dcb7400020a10700 mov dword ptr [strslensse2!__counter__loop__counter__ (0040b7dc)],7A120h
00401342 33c0 xor eax,eax
00401344 0fa2 cpuid
It's not your code it's windbg acting up?
It's screwing up on either a T or P ?
C:\Documents and Settings\User\My Documents\My Pictures\401332.zip
See attachment JPG EXE ASM
Regards herge
[attachment deleted by admin]
Quote from: herge on March 15, 2009, 11:06:10 AM
Hi jj2007:
I seem to have problems debugging it in windbg.
The EXE works great from dos. But I don't think
windbg likes CPUID for some reason.
I remember having the same problem with OllyDbg, but right now I can't reproduce it. Any Olly experts around who could explain what's going on?
Hi jj2007:
We got the wrong EXE there oops!
Attachment EXE ASM
Regards herge
hi jj2007:
Will try that agan
Let me know if you got it.
I am not having much luck with
Winrar today ir's a pain in the butt.
Regards herge
[attachment deleted by admin]
Hi jj2007:
I am making some progress but I could be going backwards?
Application popup: windbg.exe - Application Error : The instruction at "0x65e36abb" referenced memory at "0x00106130".
The memory could not be "read".
Click on OK to terminate the program
Application popup: windbg.exe - Application Error : The instruction at "0x65e36abb" referenced memory at "0x000fe190".
The memory could not be "read".
And when you put a breakpoint on 65e36abb you get a 299 error.
whick get's you this.
Details
Product: Windows Operating System
ID: 26
Source: Application Popup
Version: 5.2
Symbolic Name: STATUS_LOG_HARD_ERROR
Message: Application popup: %1 : %2
Explanation
The program could not load a driver because the program user doesn't have sufficient privileges to access
the driver or because the drive is missing or corrupt.
User Action
To correct this problem:
Ensure that the program user has sufficient privileges to access the directory in which the driver is installed.
Reinstall the program to restore the driver to the correct location.
If these solutions do not work, contact Product Support Services.
//
// MessageId: STATUS_SHARED_POLICY
//
// MessageText:
//
// The policy object is shared and can only be modified at the root
//
#define STATUS_SHARED_POLICY ((NTSTATUS)0xC0000299L)
Unable to insert breakpoint 10000 at 65e36abb, Win32 error 0n299
"Only part of a ReadProcessMemory or WriteProcessMemory request was completed."
The breakpoint was set with BP. If you want breakpoints
to track module load/unload state you must use BU.
go bp10000 at 65e36abb failed
I will keep you posted if we can get help from Microsoft,
but I won't hold my breath.
Regards herge
Runs fine under Olly for me.
Good and bad news:
First, the bad news: The "fast len() with SSE2" package attached below will not work with the ml.exe version 6.14 that gets installed when you download the Masm32 package. The reason is simply that the old Masm 6.14 (Copyright (C) Microsoft Corp 1981-1997) does not yet understand SSE2.
Now the good news:
1. It will work perfectly with JWasm (http://www.japheth.de/JWasm.html) (freeware), and with any later Masm version that comes along with the various VC express etc. downloads (see masm 6.14 or 6.15? (http://www.masm32.com/board/index.php?topic=10863.msg79596#msg79596) - I have tested it only on ml.exe versions 6.15 and 9.0).
2. The default algo is now fully compatible with the Masm32lib len() macro. This means in practice that you can speed up existing projects that use len() simply by adding the include line:
include \masm32\include\masm32rt.inc
include \masm32\include\slenSSE2.inc
I should explain why I put now in red. There was an exchange of views between Lingo and myself on the value of preserving edx and ecx (Lingo: Who preserves ecx and edx registers in "this sort of algo"? (http://www.masm32.com/board/index.php?topic=1807.msg81342#msg81342)). In the end, I kept saving ecx (a valuable counter register) and trashed edx. And, bang, my RichMasm project misbehaved. Intense bug chasing revealed that I had previously and unwillingly relied on a non-documented feature of the Masm32lib szLen routine - the one that is behind the len() macro. It does preserve ecx and edx. Therefore, the new version attached below does the same, in order not to break existing code: ecx and edx are preserved. The same applies to NightWare's version (SlenUseAlgos = 4) but not for Lingo's version (SlenUseAlgos = 2).
Enjoy,
jj2007
[attachment deleted by admin]
JJ,
The szLen algo is correct in its register usage. It only uses EAX and the stack pointer. If you have had problems using it with RichMASM it is because your register usage is non standard.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.486
.model flat, stdcall ; 32 bit memory model
option casemap :none ; case sensitive
.code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
align 4
szLen proc src:DWORD
mov eax, [esp+4]
sub eax, 4
@@:
add eax, 4
cmp BYTE PTR [eax], 0
je lb1
cmp BYTE PTR [eax+1], 0
je lb2
cmp BYTE PTR [eax+2], 0
je lb3
cmp BYTE PTR [eax+3], 0
jne @B
sub eax, [esp+4]
add eax, 3
ret 4
lb3:
sub eax, [esp+4]
add eax, 2
ret 4
lb2:
sub eax, [esp+4]
add eax, 1
ret 4
lb1:
sub eax, [esp+4]
ret 4
szLen endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end
Quote from: hutch-- on March 16, 2009, 01:36:56 PM
JJ,
The szLen algo is correct in its register usage. It only uses EAX and the stack pointer. If you have had problems using it with RichMASM it is because your register usage is non standard.
Hutch,
1. My register usage is standard,
2. I did not say that szLen was incorrect:
Quote from: jj2007 on March 15, 2009, 10:33:08 PM
I had previously and unwillingly relied on a non-documented feature of the Masm32lib szLen routine - the one that is behind the len() macro. It does preserve ecx and edx.
In contrast to most other Masm32lib functions, len() does preserve ecx and edx.
But it is not documented. And when I wrote "unwillingly", it means that I had forgotten (in only one of 55 uses of len) to follow the ABI convention saying you must preserve ecx and edx yourself if you need them after an API or library call. It also means that, not knowing that len preserves ecx and edx, I reflected 54 times unnecessarily whether I needed to preserve them myself :(
"Feature" is a positive word, and there was no irony involved. For version 11, you might consider mentioning this in the documentation. It's good that a function so frequently used does preserve the registers, that's why in the end I chose to do the same in my implementation of
len().
So can you please accept my friendly clap on the shoulder?
:U
JJ,
I am trying to understand so please help. Do you mean that szLen preserves ECX and EDX by virtue of the fact it does not use them? I am a little confused here.
Paul
There is only one rule,esi edi and ebx must be preserved when a proc used them.That's all.
If your code use others registers than this one ,you must preserve them ,before a call to a subroutine.
If he don't made this,modify your code ,not the subroutine.It's a bad practice.
Quote from: ToutEnMasm on March 16, 2009, 03:40:46 PM
There is only one rule,esi edi and ebx must be preserved when a proc used them.That's all.
If your code use others registers than this one ,you must preserve them ,before a call to a subroutine.
If he don't made this,modify your code ,not the subroutine.It's a bad practice.
You are right, in principle. However, since I wrote code that claims to be a replacement for
len() aka
invoke szLen, offset My$, and since there a lots of newbies and oldbies around who might have written code that relies on this undocumented feature of szLen, I think it's better to modify the subroutine rather than the code. I have added include \masm32\include\slenSSE2.inc as line 2 of my 9,500 lines of RichMasm source, and it works perfectly. That was the goal: give SSE2 speed to an existing application without rewriting it.
JJ,
There are multiple procedures in the MASM32 library that like szLen alter only EAX. Why should they be documented as preserving ECX and EDX when they are following the documented register-preservation conventions? If your code is depending on EAX, ECX, or EDX to be preserved, then your register usage is non-standard by the conventions of the mainstream 32-bit x86 world.
Quote from: MichaelW on March 16, 2009, 04:14:01 PM
JJ,
There are multiple procedures in the MASM32 library that like szLen alter only EAX. Why should they be documented as preserving ECX and EDX when they are following the documented register-preservation conventions? If your code is depending on EAX, ECX, or EDX to be preserved, then your register usage is non-standard by the conventions of the mainstream 32-bit x86 world.
Michael,
You are right. However, my normal register usage
is standard. I had a bug in my source, but I would never had noticed it if my new version of len() had not trashed edx.
However, my goal was to be compatible with the current len() implementation, and be sure that it won't break any existing code.
I invite everybody who uses the len() macro to add a few lines at the top of their biggest source:
len MACRO ptr
invoke szLen, ptr
xor ecx, ecx ; trash two registers that can be legally trashed
xor edx, edx ; according to the convention
EXITM <eax>
ENDM
According to the convention, nobody should experience any problems :boohoo:
P.S.: In \masm32\include\slenSSE2.inc, I added a TestMasmVersion for those who try to assemble with ml 614 (it would assemble with 614, but the code may fail unexpectedly, so I decided to throw an error).
New code attached above.
A little publicite for my ide,If someone had too much trouble modifying a few lines,there is a tool in my ide who can help with this.
His name is cherche (search in engilsh).For example he can find a word in each header file of the sdk and give a result with the name of the file and the line(s) where he found the word.A right clic on the named file,is enough to view the file with notepad and modify it.
The search take about,30 seconds.
There is about 1200 header files in the sdk and i haven't make a count of the lines.
Hi There:
Some intresting results:
Intel(R) Core(TM)2 Duo CPU E4600 @ 2.40GHz (SSE4)
codesizes: strlen32s=124strlen64B=84NWStrLen=118, _strlen=66 bytes
-- test 16k, misaligned 0, 16434 bytes
strlen32s 1467 cycles
strlen64LingoB 1213 cycles
NWStrLen 1323 cycles
_strlen (Agner Fog) 2804 cycles
-- test 4k, misaligned 11, 4096 bytes
strlen32s 394 cycles
strlen64LingoB 321 cycles
NWStrLen 342 cycles
_strlen (Agner Fog) 712 cycles
-- test 1k, misaligned 15, 1024 bytes
Masm32 lib szLen 1055 cycles
crt strlen 618 cycles
strlen32s 114 cycles
strlen64LingoB 85 cycles
NWStrLen 113 cycles
_strlen (Agner Fog) 197 cycles
-- test 0, misaligned 0, 100 bytes
Masm32 lib szLen 106 cycles
crt strlen 69 cycles
strlen32s 17 cycles
strlen64LingoB 11 cycles
NWStrLen 20 cycles
_strlen (Agner Fog) 21 cycles
-- test 1, misaligned 1, 100 bytes
Masm32 lib szLen 106 cycles
crt strlen 105 cycles
strlen32s 17 cycles
strlen64LingoB 11 cycles
NWStrLen 18 cycles
_strlen (Agner Fog) 21 cycles
-- test 5, misaligned 5, 15 bytes
Masm32 lib szLen 19 cycles
crt strlen 17 cycles
strlen32s 4 cycles
strlen64LingoB 1 cycles
NWStrLen 9 cycles
_strlen (Agner Fog) 7 cycles
-- test 15, misaligned 15, 15 bytes
Masm32 lib szLen 19 cycles
crt strlen 16 cycles
strlen32s 3 cycles
strlen64LingoB 2 cycles
NWStrLen 10 cycles
_strlen (Agner Fog) 7 cycles
-- hit any key --
And Under Windbg I can't wait till it finishes?
See Attachment.
It;s VERy VERY SLOW!
Regards herge
[attachment deleted by admin]
Latest:
AMD Athlon(tm) 64 X2 Dual Core Processor 4000+ (SSE3)
codesizes: strlen32s=132strlen64B=84NWStrLen=118, _strlen=66 bytes
-- test 16k, misaligned 0, 16434 bytes
strlen32s 3206 cycles
strlen64LingoB 3188 cycles
NWStrLen 3198 cycles
_strlen (Agner Fog) 14239 cycles
-- test 4k, misaligned 11, 4096 bytes
strlen32s 842 cycles
strlen64LingoB 826 cycles
NWStrLen 842 cycles
_strlen (Agner Fog) 3560 cycles
-- test 1k, misaligned 15, 1024 bytes
Masm32 lib szLen 1240 cycles
crt strlen 843 cycles
strlen32s 254 cycles
strlen64LingoB 240 cycles
NWStrLen 255 cycles
_strlen (Agner Fog) 917 cycles
-- test 0, misaligned 0, 100 bytes
Masm32 lib szLen 140 cycles
crt strlen 99 cycles
strlen32s 55 cycles
strlen64LingoB 40 cycles
NWStrLen 53 cycles
_strlen (Agner Fog) 139 cycles
-- test 1, misaligned 1, 100 bytes
Masm32 lib szLen 140 cycles
crt strlen 109 cycles
strlen32s 58 cycles
strlen64LingoB 43 cycles
NWStrLen 56 cycles
_strlen (Agner Fog) 103 cycles
-- test 5, misaligned 5, 15 bytes
Masm32 lib szLen 22 cycles
crt strlen 26 cycles
strlen32s 25 cycles
strlen64LingoB 22 cycles
NWStrLen 38 cycles
_strlen (Agner Fog) 36 cycles
-- test 15, misaligned 15, 15 bytes
Masm32 lib szLen 23 cycles
crt strlen 21 cycles
strlen32s 24 cycles
strlen64LingoB 21 cycles
NWStrLen 40 cycles
_strlen (Agner Fog) 35 cycles
AMD Athlon(tm) XP 3000+ (SSE1)
ERROR in StrSizeA at ebx=4096: 20535 bytes instead of 4096
ERROR in strlen32c at ebx=4096: 0 bytes instead of 4096
ERROR in strlen64B at ebx=4096: 20535 bytes instead of 4096
ERROR in NWStrLen at ebx=4096: 20535 bytes instead of 4096
codesizes: strlen32s=132strlen64B=84NWStrLen=118, _strlen=66 bytes
-- test 16k, misaligned 0, 16434 bytes
strlen32s 14573 cycles
strlen64LingoB 2782 cycles
NWStrLen 2783 cycles
_strlen (Agner Fog) 22914 cycles
-- test 4k, misaligned 11, 4096 bytes
strlen32s 3661 cycles
strlen64LingoB 3453 cycles
NWStrLen 3470 cycles
_strlen (Agner Fog) 28603 cycles
-- test 1k, misaligned 15, 1024 bytes
Masm32 lib szLen 1574 cycles
crt strlen 931 cycles
strlen32s 944 cycles
strlen64LingoB 225 cycles
NWStrLen 227 cycles
_strlen (Agner Fog) 1487 cycles
-- test 0, misaligned 0, 100 bytes
Masm32 lib szLen 169 cycles
crt strlen 112 cycles
strlen32s 125 cycles
strlen64LingoB 40 cycles
NWStrLen 51 cycles
_strlen (Agner Fog) 193 cycles
-- test 1, misaligned 1, 100 bytes
Masm32 lib szLen 169 cycles
crt strlen 121 cycles
strlen32s 132 cycles
strlen64LingoB 40 cycles
NWStrLen 50 cycles
_strlen (Agner Fog) 161 cycles
-- test 5, misaligned 5, 15 bytes
Masm32 lib szLen 26 cycles
crt strlen 29 cycles
strlen32s 40 cycles
strlen64LingoB 32 cycles
NWStrLen 38 cycles
_strlen (Agner Fog) 50 cycles
-- test 15, misaligned 15, 15 bytes
Masm32 lib szLen 26 cycles
crt strlen 26 cycles
strlen32s 37 cycles
strlen64LingoB 33 cycles
NWStrLen 46 cycles
_strlen (Agner Fog) 72 cycles
-- hit any key --
Quote from: Jimg on March 17, 2009, 04:29:48 PM
AMD Athlon(tm) XP 3000+ (SSE1)
ERROR in StrSizeA at ebx=4096: 20535 bytes instead of 4096
ERROR in strlen32c at ebx=4096: 0 bytes instead of 4096
ERROR in strlen64B at ebx=4096: 20535 bytes instead of 4096
ERROR in NWStrLen at ebx=4096: 20535 bytes instead of 4096
Yeah, it's mostly SSE2 only. My strlen32
s is not in the error list because it reverts to crt_strlen for SSE<2 (compare the timings :bg)
But I wonder whether it would run on an SSE1 CPU...? The instructions I used (movups, pmovmskb, pcmpeqb) seem to be SSE1 ::)
Could you please make a test by adding CheckSSE2 = 0 before the include line, i.e.
CheckSSE2 =0
include \masm32\include\slenSSE2.inc
include \masm32\macros\timers.asm
in slen_timings.asm?
Sure-
AMD Athlon(tm) XP 3000+ (SSE1)
ERROR in StrSizeA at ebx=4096: 20535 bytes instead of 4096
ERROR in strlen32c at ebx=4096: 0 bytes instead of 4096
ERROR in strlen32s at ebx=4096: 2 bytes instead of 4096
ERROR in strlen64B at ebx=4096: 20535 bytes instead of 4096
ERROR in NWStrLen at ebx=4096: 20535 bytes instead of 4096
codesizes: strlen32s=88strlen64B=84NWStrLen=118, _strlen=66 bytes
-- test 16k, misaligned 0, 0 bytes
strlen32s 25 cycles
strlen64LingoB 2782 cycles
NWStrLen 2785 cycles
_strlen (Agner Fog) 22940 cycles
-- test 4k, misaligned 11, 0 bytes
strlen32s 29 cycles
strlen64LingoB 3453 cycles
NWStrLen 3474 cycles
_strlen (Agner Fog) 28719 cycles
-- test 1k, misaligned 15, 0 bytes
Masm32 lib szLen 1577 cycles
crt strlen 933 cycles
strlen32s 29 cycles
strlen64LingoB 226 cycles
NWStrLen 227 cycles
_strlen (Agner Fog) 1489 cycles
-- test 0, misaligned 0, 100 bytes
Masm32 lib szLen 169 cycles
crt strlen 112 cycles
strlen32s 25 cycles
strlen64LingoB 40 cycles
NWStrLen 51 cycles
_strlen (Agner Fog) 193 cycles
-- test 1, misaligned 1, 0 bytes
Masm32 lib szLen 170 cycles
crt strlen 122 cycles
strlen32s 29 cycles
strlen64LingoB 40 cycles
NWStrLen 50 cycles
_strlen (Agner Fog) 161 cycles
-- test 5, misaligned 5, 0 bytes
Masm32 lib szLen 27 cycles
crt strlen 29 cycles
strlen32s 29 cycles
strlen64LingoB 32 cycles
NWStrLen 38 cycles
_strlen (Agner Fog) 50 cycles
-- test 15, misaligned 15, 0 bytes
Masm32 lib szLen 26 cycles
crt strlen 25 cycles
strlen32s 29 cycles
strlen64LingoB 33 cycles
NWStrLen 46 cycles
_strlen (Agner Fog) 72 cycles
-- hit any key --
Quote from: Jimg on March 17, 2009, 06:07:48 PM
Sure-
Thanks. Remarkably fast, and remarkably incorrect :green
Viewing the post,I see that he can be a possible problem with the location of the proc.
Proc with SSE code must be in a separate module to work.
I experiment with this,And find it as the only soluce.
That is put all the sse code in the slensse2.inc.
I don't know why it is like that ( i have ml 9.0),but
i am certain of what is the problem.
a useful macro can be also added to the slensse2.inc.
Quote
numeroversion equ < @Version>
IF numeroversion LT 615
%ECHO MASM numeroversion impossible de compiler SSE2
.ERR <Version Masm must be at least 6.15 to compile SSE2>
ENDIF
Quote from: ToutEnMasm on March 17, 2009, 08:00:53 PM
Viewing the post,I see that he can be a possible problem with the location of the proc.
Proc with SSE code must be in a separate module to work.
I experiment with this,And find it as the only soluce.
That is put all the sse code in the slensse2.inc.
I don't know why it is like that ( i have ml 9.0),but i am certain of what is the problem.
strlenSSE2.asc uses SSE2 code in the main module and the slenSSE2.inc. JimG has an old SSE1 CPU - I tried to dig out my oldest 6 year old puter, but it's SSE2 already. Nonetheless I have a suspicion that the algo could work with SSE1 - but I cannot test it...
Quote
a useful macro can be also added to the slensse2.inc.
Quote
numeroversion equ < @Version>
IF numeroversion LT 615
%ECHO MASM numeroversion impossible de compiler SSE2
.ERR <Version Masm must be at least 6.15 to compile SSE2>
ENDIF
From the package (http://www.masm32.com/board/index.php?topic=1807.msg81400#msg81400) (downloaded 10 times right now):
TestMasmVersion MACRO
ifidn @Version, <614>
echo ####################################################
echo
echo You cannot use the SSE2 library with ml.exe version 614, sorry
echo
echo ####################################################
.err
endif
ENDM
...
.code
TestMasmVersion
But thanks anyway, ToutEnMasm :U
Is this better?
Timings for strlen32s:
25 cycles for len=3
29 cycles for len=3
Timings for Masm32lib szLen:
27 cycles for len=15
24882 cycles for len=16384
-- hit any key --
Hi All:
A picture of some slow response on my computer while
debugging with winDbg. It would appear it's hanging on
the CPUID instruction, and taking it's sweet time.
See the pretty picture Note the Very Large Cycle times.
Also note strslensse2 and WinDbg Cpu time Useage.
Regards herge
[attachment deleted by admin]
Quote from: Jimg on March 17, 2009, 08:44:49 PM
Is this better?
I am afraid the string lengths should be the same as for szLen...
Still trying to find a reliable database giving info which SSE version corresponds to which instruction. This file (http://stuff.mit.edu/afs/athena/software/nasm_v2.02/info/nasm.info-9) documents NASM, but it's not that clear. ::)
JJ,
Take a look at my Opcode Database Project. SSE2 instructions are listed as same. SSE1 instructions are listed as SSE. It is not a fancy app but it has the info you need.
hth,
Paul
[attachment deleted by admin]
Quote from: PBrennick on March 17, 2009, 10:35:24 PM
JJ,
Take a look at my Opcode Database Project. SSE2 instructions are listed as same. SSE1 instructions are listed as SSE. It is not a fancy app but it has the info you need.
hth,
Paul
Thanks, Paul, much appreciated. The problem is indeed that pcmpeqb and pmovskb exist as MMX and SSE2 versions. Which means that JimG has no luck - his SSE1 CPU does not throw an exception, but it cannot interpret the 66h prefix... sorry!
Take care with the entry point of your code.
The slenSSE2.inc that repeat .686 .data and so on,is a bad thing.
Quote from: ToutEnMasm on March 18, 2009, 07:32:31 AM
Take care with the entry point of your code.
The slenSSE2.inc that repeat .686 .data and so on,is a bad thing.
Why?
This explain why there is bad results given by the function .
This explain also why the program seems to works slowly.
I have tested that with windbg.Put your include file with code in the section code (The slenSSE2.inc was in the declare section) ,without the repeat of .686...,and you will have a code that run faster and don't give random results.
That only the fact of an undeterminate entry point,that can be solve randomlly on various machines.
Quote from: herge on March 17, 2009, 09:50:13 PM
Hi All:
A picture of some slow response on my computer while
debugging with winDbg. It would appear it's hanging on
the CPUID instruction, and taking it's sweet time.
See the pretty picture Note the Very Large Cycle times.
Also note strslensse2 and WinDbg Cpu time Useage.
Regards herge
Maybe use OllyDbg ? :U
Hi BlackVortex:
I tried to download ollydebug three times and all you get
ia a corrupt Zip file. If you can't download it one piece it's
Not going to be used!
Regards herge
Quote from: herge on March 18, 2009, 08:55:59 AM
Hi BlackVortex:
I tried to download ollydebug three times and all you get
ia a corrupt Zip file. If you can't download it one piece it's
Not going to be used!
Regards herge
http://www.ollydbg.de/odbg110.zip
This link ? It works fine.
Quote from: herge on March 18, 2009, 08:55:59 AM
I tried to download ollydebug three times and all you get is a corrupt Zip file.
Try this link (http://www.ollydbg.de/odbg200i.zip) with Firefox and IE. For me, it always works fine (with both browsers).
Just for fun, I also downloaded WinDbg, >17 MB, and tried it. The user interface is disgusting, but it has no problem with the CPUID opcode. Googling for WinDbg CPUID is not very successful, either, so it might be something specific to your CPU ::)
Re entry points etc:
include \masm32\include\masm32rt.inc
include \masm32\include\slenSSE2.inc
txt50 equ <"Just some stüpid text containing exäctly 50 bytes ">
.data
szTest_1 db "My short string", 0
...
.code
start:
tmp$ CATSTR <chr$("Timings for >, StrLenAlgo$, <:")>
print tmp$, 13, 10, 10
I cannot see what could possibly wrong here, and more specifically I would like to see an example where the entry point is being determined by the
machine rather than the code. What I see in Olly is that the SSE2 code starts at 00401000 (start of code section), while execution starts at 004010AC, called <ModuleEntryPoint>. That works fine, many coders put procedures before
start in order to save the PROTO's.
Reviewing the posts above, it seems that the slenSSE2.inc works fine unless
a) you have a CPU that does not support SSE2 or
b) you use WinDbg and get hung at the CPUID instruction.
Is that correct? Is there any case where the code did not work properly on an SSE2 machine in normal (non-debugged) execution?
Hi jj2007:
Volume 2A:
Instruction Set Reference, A-M
NOTE: The Intel 64 and IA-32 Architectures Software Developer's Manual
consists of five volumes: Basic Architecture, Order Number 253665;
Instruction Set Reference A-M, Order Number 253666; Instruction Set
Reference N-Z, Order Number 253667; System Programming Guide,
Part 1, Order Number 253668; System Programming Guide, Part 2,
Order Number 253669. Refer to all five volumes when evaluating your
design needs.
Order Number: 253666-029US
CPUID—CPU Identification
Description
The ID flag (bit 21) in the EFLAGS register indicates support for the CPUID instruction.
If a software procedure can set and clear this flag, the processor executing the
procedure supports the CPUID instruction. This instruction operates the same in non-
64-bit modes and 64-bit mode.
CPUID returns processor identification and feature information in the EAX, EBX, ECX,
and EDX registers.1 The instruction's output is dependent on the contents of the EAX
register upon execution (in some cases, ECX as well). For example, the following
pseudocode loads EAX with 00H and causes CPUID to return a Maximum Return
Value and the Vendor Identification String in the appropriate registers:
Go to Intel!
You want CPUID page 228 thru 261.
VOL 2A 3-180 > VOL 2A 3-213
It's about thirty-three pages long and this
manual has 812 pages.
You also need adobe Reader to read it.
Regards herge
Herge, there is no problem with the CPUID instruction. It seems you have a very specific problem with your machine. Can you WinDbg the sample posted here (http://www.masm32.com/board/index.php?board=2;topic=11061.1#msg81562) (SSE2 but totally unrelated to szLen), and maybe insert just for fun the CPUID code to see if it makes any difference?
start:
pushad
push 1
pop eax
db 0Fh, 0A2h ; cpuid 1
xor eax, eax
xor esi, esi
bt edx, 25 ; edx bit 25, SSE1
adc eax, esi
bt edx, 26 ; edx bit 26, SSE2
adc eax, esi
bt ecx, esi ; ecx bit 0, SSE3 (esi=0)
adc eax, esi
bt ecx, 9 ; ecx bit 9, SSE4
adc eax, esi
mov Win$, alloc$(1000000)
Hi jj2007:
It's got something to do with windbg and my computer.
It does not crash but iit is slow if you t or p a CPUID
instruction. It will take most of the day to run.
All cycles times are seven digits long.
The g works great!
00401325 6880000000 push 0x80
0040132a 50 push eax
0040132b e82e320000 call strslensse2!SetPriorityClass (0040455e)
strslensse2!start+0x1ab [C:\Program Files\Microsoft Visual Studio 8\VC\bin\strslensse2.asm @ 235]:
00401330 33c0 xor eax,eax
00401332 0fa2 cpuid <<;; Don't t or P a CPUID
00401334 0f31 rdtsc
00401336 52 push edx
00401337 50 push eax
00401338 c705dcb7400020a10700 mov dword ptr [strslensse2!__counter__loop__counter__ (0040b7dc)],7A120h
00401342 33c0 xor eax,eax
00401344 0fa2 cpuid
Regards herge
Hi jj2007:
I had no problems with CountLinesSSE2.exe
in WinDbg. I have two versions of Windbg and
both choke on CPUID.
Regards herge
Hi All:
Well we finally got OllyDEbug with Firefox and my internet radio OFF.
Olly ran Strslensse2.exe with no problem.
So it's either my computer or Windbg, or it's some
software I am running.
Regards herge
To jj2007,
Quote
and more specifically I would like to see an example where the entry point is being determined by the machine rather than the code.
Perhaps did you search a machine wo write the code at your place ?.
The Entry point is always fixed by the code.
i will repeat my upper post about the soluce.
Quote
You could'nt include code in the declare section
the includelib is just read by the linker and code is added at link time.
No need of special debugger or special machine to run SSE2 instructions.
consoles applications are irrelevent because they are finished before start when lauched with windows.Your one don't make that because there is bad writing in it.
I use windbg and he works perfectly with well written code.
Hi All:
And the loser at 04.75 hrs is strslensse2.exe with WinDbg.
See Attachment.
Regards herge
[attachment deleted by admin]
@ herge
Why do you insist on using WinDbg ? It's next to useless (except maybe as a system debugger)
Quote from: ToutEnMasm on March 18, 2009, 12:38:30 PM
To jj2007,
Quote
and more specifically I would like to see an example where the entry point is being determined by the machine rather than the code.
Perhaps did you search a machine wo write the code at your place ?.
The Entry point is always fixed by the code.
YES, that's perfectly correct. I knew that before, but a certain ToutEnMasm insisted that machines might fumble with the entry point:
Quote from: ToutEnMasm on March 18, 2009, 08:06:08 AM
That only the fact of an undeterminate entry point,that can be solve randomlly on various machines.
Quote
No need of special debugger or special machine to run SSE2 instructions.
You need a "special machine" that is capable of SSE
2. If you had read the source, you would have discovered the macro that throws an error if you try to
assemble it on an SSE1 machine. And there is
run-time check in my code that reverts to crt_strlen if SSE1 is detected. That should be fool-proof, right?
Quote
consoles applications are irrelevent because they are finished before start when lauched with windows.Your one don't make that because there is bad writing in it.
Your phrase does not make sense at all, probably a language problem. Please explain, and use a code example.
SlenSSE2.inc works perfectly with console and GUI applications. The only change I had to make to my 9,500 lines RichMasm source was
one line:
include \masm32\Gfa2Masm\Gfa2Masm.inc
include \masm32\include\slenSSE2.inc
Quote
I use windbg and he works perfectly with well written code.
Me too. It works perfectly well with all my code.
Quote from: herge on March 18, 2009, 01:57:55 PM
And the loser at 04.75 hrs is strslensse2.exe with WinDbg.
Impressing :bg
And I am very glad that for the 100 byte strings my algo is over 1000 cycles faster than Lingo's :cheekygreen:
Hi jj2007:
A small sugestion:
To protect against operator stupity.
ENDM
@@:
inkey chr$(9, 9, 9, 9, 9, "-- Hit X Key --")
cmp AL,"X"
jnz @B
exit
But make sure the inkey MACRO in C:\masm32\macros\macros.asm
is updated.
inkey MACRO user_text:VARARG
IFDIF <user_text>,<NULL> ;; if user text not "NULL"
IFNB <user_text> ;; if user text not blank
print user_text ;; print user defined text
ELSE ;; else
print "Press any key to continue ..." ;; print default text
ENDIF
ENDIF
call wait_key
push eax;; < Note push
print chr$(13,10)
pop eax;; < Note pop
ENDM
Note the push and pop, it was
always returning 2 the length
of CRLF which explains why
a CMP AL,? was always
failing after a inkey call.
Regards herge
Quote from: herge on March 18, 2009, 07:46:05 PM
Hi jj2007:
A small sugestion:
Access violation when reading [herge's suggestion] - Shift+Run/Step to pass exception to the owner of the Masm32 macros :wink
Hi jj2007:
I know the inkey MACRO has nothing to do with you.
But my suggestion will not work with the present
inkey MACRO which I suspect is not working
right at present.
I did mention it in another forum.
Regards herge
herge,
The inkey macro works correctly within the boundaries of what it was designed to do. The inkey function should not be expected to function in a polcat sort of way.
Use the getkey macro which calls ret_key if you expect to receive a value.
JJ,
About CPUID, not all parameters of this instruction are supported on all CPUs. CPUID should be called with EAX = 0 first, as this will return the highest calling parameter that the CPU supports. To obtain extended function information CPUID should be called with bit 31 of EAX set. To determine the highest extended function calling parameter, call CPUID with EAX = 80000000h. If the particular parameter you are trying to use is higher than that number, then report this to the user and do not use the instruction.
You reported in an earlier posting that you had trouble with CPUID, now you know why.
Paul
Quote from: PBrennick on March 18, 2009, 09:40:33 PM
JJ,
About CPUID, not all parameters of this instruction are supported on all CPUs.
You reported in an earlier posting that you had trouble with CPUID, now you know why.
Paul
Paul,
1. Open \masm32\include\slenSSE2.inc in GeneSys.exe
2. Search for CPUID
3. All you will find is:
ChkSSE2 proc ; exactly 40 bytes
pushad
push 1
pop eax
db 0Fh, 0A2h ; cpuid 1
xor eax, eax
xor esi, esi
bt edx, 25 ; edx bit 25, SSE1
adc eax, esi
bt edx, 26 ; edx bit 26, SSE2
adc eax, esi
bt ecx, esi ; ecx bit 0, SSE3 (esi=0)
adc eax, esi
bt ecx, 9 ; ecx bit 9, SSE4
adc eax, esi
mov MbSSE2, eax
popad
ret
ChkSSE2 endp
As you can see,
none of the extended functions are being used.
4. Herge's program works perfectly when launched normally. It's WindDbg that has a problem,
not my code.
Hi jj2007:
Your code is great!
But when I try to trace or Proceed a
CPUID
instruction. Windbg take's its sweet time.
The Go in Windbg works great. But on my
computer, Something weird is going on.
It's either my hardware or software
running on my computer.
Your program has always worked from a Dos
Box. [Command Prompt]
Regards herge
JJ,
Your implimentation looks fine to me. Looks liike he should ditch that debugger. There has always been issues with it, anyway. A few years back there was a thread about the pros and cons of it and it seems lots of people have had negative experiences with it.
By the way, I was not bashing your code, I was just wondering if his CPU has some limitations. Option number one is handled by all CPUs as far as I know, however.
Paul
Hi Paul:
What ever the problems with windbg, at least I can see my
code, I have a lot of trouble seeing the code in Ollly and have
not found out how to change it's font size.
Regards herge
One of the secrets to tracing source code in Olly, is do NOT use the /Fl options when assembling. It messes something up.
I usually use - /c /coff /Cp /nologo /Zi /Zd
and link - /SUBSYSTEM:WINDOWS /DEBUG /DEBUGTYPE:CV /INCREMENTAL:NO
To change the font of the source window, right click in the window, select appearance and font
Also you can change the font in the menu Options/Appearance/Fonts
Hi Jimg:
1. Edit ollydbg.ini
2. Replace the last three lines with this:
Font name[7]=Font Herge
Font[7]=20,0,600,0,0,0,1,2,5,0,0
Face name[7]=Arial
3. Save and restart Olly
4. Right-click, choose Appearance/Font/Font Herge
jj2007 told me how to fix it.
Thanks jj2007.
Regards herge
Quote from: hutch-- on March 11, 2009, 10:21:19 AM
Years of reading posts leave you with a reasonably good idea of the value of an "atom cracking" string length algo. Let me think, "As useful as a hip pocket in a singlet", wjhat about the world's fastest "MessageBoxA" algo ? How about a hobbling horse in the Kentucky Derby ? :P
Quote from: hutchNo I don't, I have been watching musical chairs on string length algos for at least the last 10 years, in about 99.9999999999999999999999999% of cases the slow byte scanner is more than fast enough and in the .0 --- 0001% of other cases Agner Fog's algo is even more than fast enough. Speed is greate but it must also be useful gains and string length algos are rarely ever a big deal.
I'm quite surprised by this statement. I have been involved in writing and profiling enterprise software for years, and the str* functions are
repeatedly found as some of the highest CPU users in various bits of code. Sure, it is not going to be an issue for an MPEG encoder, but for applications that handle user input, communication with other components, whatever, I've often seen these functions be the bottleneck. Using better string functions in cases has resulted in massive improvements in some workflows - even some we didn't know would be affected ahead of time.
Sure, you could argue that strlen itself is kind of useless, since at least someone new the length (at creation, for example) and this length could be passed around rather than using strlen, but the realities of software engineering, such as interop with other components, use of existing APIs, legacy code, and so on mean that it is useful in practice. Other functions, such as strcpy are useful both in theory and in practice since they cannot be optimized away (unlike strlen, arguably).
Saying that the str* functions are useless is like arguing that memcpy and friends aren't important either - since for many programs the former are used more than the latter.
Quote from: BeeOnRope on March 31, 2009, 08:32:37 PM
Saying that the str* functions are useless is like arguing that memcpy and friends aren't important either - since for many programs the former are used more than the latter.
?
strlen algos are never used intensively (anway, not like memcopy) in a serious app, so the comparison is totally inappropriate. plus, if you code YOUR functions correctly (and stop using stupid win APIs), YOU DON'T NEED those algos, coz you "should" return the size with your function with a simple sub instruction... just for info, in ALL my sources i've used a strlen algo just ONCE, and only because i'm too lazy to update a counter, and because speed is not essential... i don't know of what your years of writing consist of, but you have things to learn... seriously...
Quote from: BeeOnRope on March 31, 2009, 08:32:37 PM
Sure, it is not going to be an issue for an MPEG encoder, but for applications that handle user input, communication with other components, whatever, I've often seen these functions be the bottleneck. Using better string functions in cases has resulted in massive improvements in some workflows - even some we didn't know would be affected ahead of time.
A bottleneck for communication with other components, possibly, but the bottleneck for user input is obviously the user.
Quote from: NightWare on March 31, 2009, 09:49:15 PM
just for info, in ALL my sources i've used a strlen algo just ONCE
In all my sources, I use GOTO only once, and for a valid reason, but I just checked
len() and found a value of about 6/kLine of code. I wouldn't mind getting rid of some of them, but it is not that easy in a general purpose app. For highly optimised graphics applications, that might be different, though.
Bee,
I agree that string functions generally need to be fast, particularly when you are doing complex parsing but I would hold to my original comment that almost all string length requirements are more than adequately handled by the simplest byte scanner using one register. It is very rare to use long strings (> 1 meg) and where you do have an unusual case that has to repeatedly scan strings for their length, you write a different algo. Agner Fog's 1995 DWORD algo is still a very good performer here but if your task requires it you write a dedicated string length algo that is faster.
This is my favourite type of string length algo.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
slen proc pstr:DWORD
mov ecx, [esp+4]
mov eax, ecx
sub eax, 1
@@:
add eax, 1
cmp BYTE PTR [eax], 0
jne @B
sub eax, ecx
ret 4
slen endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
A techynique I regularly use when tokenising a large string is to make a copy of the string if preserving the original matters, do a one pass in place tokenise on the data overwriting the line terminator with a zero and writing the start offset of each line to an array. Now this leaves me with an array of unaligned members but the tokenising method is faster than any data copy to array method by some considerable amount.
If I then need to get the length of any or all of the tokenised strings, I use the very small one above because in most instances its takeoff time makes it faster than the bigger clunkier ones that are very fast on single long strings but hopeless on variable length unaligned short strings.
hi hutch,
lea eax,[ecx-1]
instead of
mov eax,ecx
sub eax,1
no ?
Quote from: hutch-- on April 01, 2009, 01:37:24 AM
... and writing the start offset of each line to an array....
If I then need to get the length of any or all of the tokenised strings
Can't you just use (offset n+1)-(offset n)-2?
NightWare,
Its a good mod but I tend to avoid LEA on a PIV as it is laggy. I would be interested to see if it has become faster again on a core 2 duo or quad.
JJ,
that suggestion makes sense except that you have to calculate the length reduction of either or both the CR and LF. If the task suited it your mod would be faster as the data is already present but it gets untidy if you pass the address of the tokenised string to another procedure.
Quote from: NightWare on March 31, 2009, 09:49:15 PM
Quote from: BeeOnRope on March 31, 2009, 08:32:37 PM
Saying that the str* functions are useless is like arguing that memcpy and friends aren't important either - since for many programs the former are used more than the latter.
?
strlen algos are never used intensively (anway, not like memcopy) in a serious app, so the comparison is totally inappropriate. plus, if you code YOUR functions correctly (and stop using stupid win APIs), YOU DON'T NEED those algos, coz you "should" return the size with your function with a simple sub instruction... just for info, in ALL my sources i've used a strlen algo just ONCE, and only because i'm too lazy to update a counter, and because speed is not essential... i don't know of what your years of writing consist of, but you have things to learn... seriously...
That's an interesting statement. You just said that string algos are *never* used in a serious app, yet I have been in developing several "serious" apps, and I've seen string functions, including strlen, be the bottleneck for interesting workflows for many of them. It's very though to assert that something never happens when I'm saying plainly and without any particular secret motivation that I have seen exactly this in "serious" apps.
I didn't write the functions in question, rather noted the bottleneck in software developed by teams of hundreds of people - I already mentioned that in some cases it is possible to return a length (or to use a class that remembers it), but if you are interoperating with other code you may not have a choice because (a) you don't have the source (b) cannot legally modify the source (c) do not have the time to modify the source, etc.
Quote from: MichaelW on March 31, 2009, 10:04:04 PM
Quote from: BeeOnRope on March 31, 2009, 08:32:37 PM
Sure, it is not going to be an issue for an MPEG encoder, but for applications that handle user input, communication with other components, whatever, I've often seen these functions be the bottleneck. Using better string functions in cases has resulted in massive improvements in some workflows - even some we didn't know would be affected ahead of time.
A bottleneck for communication with other components, possibly, but the bottleneck for user input is obviously the user.
Agreed - I wasn't totally clear there. I meant dealing with text that
originally came as user textual input, but is now be processed, perhaps repeatedly. For example, string columns in a database often came originally from user input, but that happens (for example) once while the string itself may be queries, returned to clients, sorted, etc millions of times. In such applications string functions may be useful and performance sensitive.
Quote from: hutch-- on April 01, 2009, 01:37:24 AM
Bee,
I agree that string functions generally need to be fast, particularly when you are doing complex parsing but I would hold to my original comment that almost all string length requirements are more than adequately handled by the simplest byte scanner using one register. It is very rare to use long strings (> 1 meg) and where you do have an unusual case that has to repeatedly scan strings for their length, you write a different algo. Agner Fog's 1995 DWORD algo is still a very good performer here but if your task requires it you write a dedicated string length algo that is faster.
This is my favourite type of string length algo.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
slen proc pstr:DWORD
mov ecx, [esp+4]
mov eax, ecx
sub eax, 1
@@:
add eax, 1
cmp BYTE PTR [eax], 0
jne @B
sub eax, ecx
ret 4
slen endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
A techynique I regularly use when tokenising a large string is to make a copy of the string if preserving the original matters, do a one pass in place tokenise on the data overwriting the line terminator with a zero and writing the start offset of each line to an array. Now this leaves me with an array of unaligned members but the tokenising method is faster than any data copy to array method by some considerable amount.
If I then need to get the length of any or all of the tokenised strings, I use the very small one above because in most instances its takeoff time makes it faster than the bigger clunkier ones that are very fast on single long strings but hopeless on variable length unaligned short strings.
I agree more or less - it is rare enough in most applications to take the length of long strings repeatedly, but it definitely does happen. This code could definitely be re-written to make it faster in many cases, but a string algo that is 10x faster in the first place will be an automatic huge win in these places without the need to thread a length through 10s or 100s of functions. Arguably I'm preaching to the wrong crowd here - programs written entirely in assembly perhaps aren't likely to reach the scale (in terms of lines of code) where this becomes a consideration, but in the kind of software I'm interesting in (HLL + selected routines in assembly) it counts.
One thing you are missing from this thread is that the "larger/clunkier" routines developed by Lingo, NW and jj are
blazing fast on short and unaligned strings - much, much faster than than Agner's routine, the CRT routine etc. For example, on some 15 byte
misaligned strings, Lingo's routine takes as little as 1 cycle. That's 19x faster than the existing routine, and 7x faster than Agner's routine.
With such speeds it is arguably faster, for short strings, to
not bother passing around the length, but to call strlen when needed (especially since the length may be need to 4 bytes or more, if you can accommodate larger strings, even if they are usually short in practice).
Quote from: BeeOnRope on April 01, 2009, 06:32:25 PM
One thing you are missing from this thread is that the "larger/clunkier" routines developed by Lingo, NW and jj are blazing fast on short and unaligned strings - much, much faster than than Agner's routine, the CRT routine etc. For example, on some 15 byte misaligned strings, Lingo's routine takes as little as 1 cycle. That's 19x faster than the existing routine, and 7x faster than Agner's routine.
That seems a bit too optimistic, BeeOnRope. Lingo's algo is indeed blazing fast at 4 cycles, but first, the timings may not be so accurate at that scale, and second, it will hardly matter for such short strings - there will be plenty of slower code before and after.
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
codesizes: strlen32s=132strlen64B=84NWStrLen=118, _strlen=66 bytes
-- test 16k, misaligned 0, 16434 bytes
Masm32 lib szLen 20648 cycles
crt strlen 15255 cycles
strlen32s 2894 cycles
strlen64LingoB 2919 cycles
NWStrLen 2935 cycles
_strlen (Agner Fog) 4264 cycles
-- test 15, misaligned 15, 15 bytes
Masm32 lib szLen 25 cycles
crt strlen 25 cycles
strlen32s 6 cycles
strlen64LingoB 4 cycles
NWStrLen 15 cycles
_strlen (Agner Fog) 14 cycles
Quote
With such speeds it is arguably faster, for short strings, to not bother passing around the length, but to call strlen when needed (especially since the length may be need to 4 bytes or more, if you can accommodate larger strings, even if they are usually short in practice).
Laziness is indeed a valid argument - codesize maybe not. A harmless
mov eax, len(offset MyString) costs 12 bytes...
Hutch,
don't use ecx in strlen because jj will improve your code
and preserve his lovely "count" register ecx. :wink
After that he'll post a message that "his" code is faster... :lol
Quote from: lingo on April 01, 2009, 10:01:46 PM
Hutch,
don't use ecx in strlen because jj will improve your code
and preserve his lovely "count" register ecx. :wink
After that he'll post a message that "his" code is faster... :lol
... and you will reply with a 1% faster SSE8 version, hehe :green2
Lingo, I have changed philosophy completely. No more preserving of count registers! No, just the opposite: How can one set all registers to zero (http://www.masm32.com/board/index.php?topic=11138.msg82449#msg82449) with less than 12 bytes in less than 5 cycles. I swear it's more exciting than solving crosswords. All the gurus here have already contributed a solution. We are waiting for
you :thumbu
memory_search proc address:dword,data:dword,limit:dword
;returns position of char in eax.
;returns limit if char was not found.
;the limit can be set to avoid buffer overrun
push ecx
push edi
mov ecx,limit
mov eax,data
mov edi,address
repne scasb
mov eax,limit
sub eax,ecx
pop edi
pop ecx
ret
memory_search endp
the point was to prevent the program from locking up on errand strings.
i know the REP prefix is long depretated...how slow is it actually compared to the other implementations?
would it improve to have a sufficient number of extra zero's, and to use SCASD?
real-world code needs to prevent lock-up/overrun by all means. so to be fair, you'd also have to add test code to the other string functions.
http://www.azillionmonkeys.com/qed/asmexample.html
http://www.visionx.com/markl/optimization_tips.htm
http://forums.appleinsider.com/archive/index.php/t-27572.html
Quote from: jj2007 on April 01, 2009, 07:07:20 PM
Quote from: BeeOnRope on April 01, 2009, 06:32:25 PM
One thing you are missing from this thread is that the "larger/clunkier" routines developed by Lingo, NW and jj are blazing fast on short and unaligned strings - much, much faster than than Agner's routine, the CRT routine etc. For example, on some 15 byte misaligned strings, Lingo's routine takes as little as 1 cycle. That's 19x faster than the existing routine, and 7x faster than Agner's routine.
That seems a bit too optimistic, BeeOnRope. Lingo's algo is indeed blazing fast at 4 cycles, but first, the timings may not be so accurate at that scale, and second, it will hardly matter for such short strings - there will be plenty of slower code before and after.
Agreed - 1 cycle is not realistic and likely reflects overlapping of loops by the CPU which won't like occur in practice in real code. Still - the point remains, the "clunky" routines test as significantly faster for short or misaligned strings than the simpler routines, contradicting Hutch's assertion that this quest is misguided because these routines will only work well for giant, aligned strings.
Quote
Laziness is indeed a valid argument - codesize maybe not. A harmless mov eax, len(offset MyString) costs 12 bytes...
Actually I was referring to data size, not code size in this case. Imagine 1 million length 0 strings - you might use 4 MB with length-prefixed strings, compared to 1 MB with null terminated strings. Don't get me wrong, I'm nearly always in favor of explicit length for stings, but the termination technique can be appealing for many very short strings.
Quote from: hutch-- on April 01, 2009, 07:33:44 AM
I would be interested to see if it has become faster again on a core 2 duo or quad.
concerning lea, i've never seen difference between my p3-500/celeron-700/P4-2Ghz/Core2-2Ghz
(my P4 was a northwood i think..., maybe there is difference with a prescot)
Quote from: BeeOnRope on April 01, 2009, 06:20:35 PM
That's an interesting statement. You just said that string algos are *never* used in a serious app, yet I have been in developing several "serious" apps, and I've seen string functions, including strlen, be the bottleneck for interesting workflows for many of them. It's very though to assert that something never happens when I'm saying plainly and without any particular secret motivation that I have seen exactly this in "serious" apps.
I didn't write the functions in question, rather noted the bottleneck in software developed by teams of hundreds of people - I already mentioned that in some cases it is possible to return a length (or to use a class that remembers it), but if you are interoperating with other code you may not have a choice because (a) you don't have the source (b) cannot legally modify the source (c) do not have the time to modify the source, etc.
here the problem come from the approach, most coders "think" in term of tasks, and in the contrary they should have a global approach. i don't know a case where updating a counter from time to time is slower than process an entire area, especially a large one.
(hundred/thousand/million people doing the same thing does not mean they're right, especially when they've been paid to quickly produce a "serious" app. it's a well known fact, you only obtain the product you have paid for. and asking coders to respect time limits certainly not encourage them to "think their code").
Quote from: jj2007 on April 01, 2009, 07:07:20 PM
A harmless mov eax, len(offset MyString) costs 12 bytes...
correct, the code size isn't for free, it's ok for fast algo, but only if you can be sure it's maintained in the cache (and the larger the code is, the more often the cache will be updated).
Quote from: NightWare on April 01, 2009, 10:35:18 PM
here the problem come from the approach, most coders "think" in term of tasks, and in the contrary they should have a global approach. i don't know a case where updating a counter from time to time is slower than process an entire area, especially a large one.
(hundred/thousand/million people doing the same thing does not mean they're right, especially when they've been paid to quickly produce a "serious" app. it's a well known fact, you only obtain the product you have paid for. and asking coders to respect time limits certainly not encourage them to "think their code").
Sure, that approach is fine for a monolithic application written by one or a few people (most assembly-only programs will fall into this category). In practice, with components being written by teams around the globe with differing delivery schedules, coding styles, etc, it is important to think in terms of self-contained components, tasks, APIs, whatever. It simply isn't possible for any one person to have the whole end-to-end workflow or code-flow in his mind at once. If you don't believe this, you have never worked on a large, distributed software project.
Even if you don't believe it, it doesn't answer the point about interaction with legacy or proprietary APIs that you
cannot change.
The problem as I see it is the "one size fits all" approach. Having written libraries for many years commercially back in the 90s I am as a matter of fact familiar with distributed projects but I am also familiar with their defects, the risk of being a headless monster screwed together with a multitude of compromises to fit the deviations of opinion, technique and disposition of the sum total of its contributors, a situation that is something like the corporate decision making process but with a random factor added.
I don't see any problem at all with keeping a dozen different routines to do similar tasks and simiply dial up the one that best fits your need if in fact any of them will fit the need, the alternative is to write another that in fact does do exactly what you need.
I put together a library of all the fastest procedures i've found on here awhile ago, under stress testing and even general use a lot of them messed up on me a lot(great deal of the procedures being loco's code, since he's won most of the speed contests). I've come to love the slower yet reliable code :) old trusty, heh.
and on to more pressing business, Hutch I don't know if you've noticed but you have the devil sign in your post count and your names in red in the user list... :'(
Quote from: E^cube on April 02, 2009, 06:14:01 AM
I put together a library of all the fastest procedures i've found on here awhile ago, under stress testing and even general use a lot of them messed up on me a lot
Interesting. Post some example code, please.
Cube,
In 3 posts time I will have four (4) sixes as my post count, I wonder if that encapsulates both old Nick AND his sidekick ? :bg
Quote from: hutch-- on April 02, 2009, 10:36:40 AM
AND his sidekick ? :bg
Not sure what George W Bush has to do with posting here :bg
add Bill Gates ASCII and you get 666, as well MS DOS 6.22 and Windows 3.11!
so the name, DOS and Windows version have been adjusted to sum up to 666. it is known for many years.
for the topic, I've made such a test program in 1997 (won't you guess, it is long lost).
different methods for memory copy! it was a 80386 SLC, some obscure variant.
now, alignment was absolutely irrelevant to performance.
and also 16 bytes, then 16K bytes, is no good testing.
you must test 256 bytes, 1K, 4K (common cache size/page size).
and then you must differentiate:
-linear access within a page
-linear access within L1/L2 cache
-always accessing the same location (cheating the cache)
-random access within a page
-random access within L1/L2 chaches
-long range random access
if you don't implement all this, your test algorithm is more than questionable.
and I've read the new AMD manuals, the REP SCAS is explicitely recommended for small strings!
so, would you include it, and show the result for REP SCASB as well?
if you have extra time, implement all data sizes: 8 bit to 64 bit, and alignment as well.
Quote from: tetsu-jp on April 02, 2009, 02:43:04 PM
and I've read the new AMD manuals, the REP SCAS is explicitely recommended for small strings!
so, would you include it, and show the result for REP SCASB as well?
Mr tetsu,
I am afraid my knowledge of assembler is not sufficient to implement a strlen algo based on
rep scasb. But we have all read your posts with great interest, and are eager to see how you would do it. Could you post a snippet, please? We know that all your sources got lost, but maybe out of your head, or with the help of the AMD manual?
Thank you so much.
Thanks, on occasion, i allow people simply to call me "Alex".
I'm really honestly interested in such a banchmark, because i wrote such a program in 1997 myself.
Yes unfortunately due to my life circumstances, i have lost many source codes.
this is what i wrote 5 years ago to get string length: http://www.masm32.com/board/index.php?topic=1807.msg82540#msg82540
And i am thinking to write a benchmark (again), for strlen, memcopy and the like,
including 64bit!
I'm not assembly professional, let say, intermediate, the largest source i've ever produced was about 300K.
the purpose to visit the forum is to improve my skills, among having some fun!
so I could really write a benchmark using MASM, if people ask me to do it.
simply cheating the cache, always accessing the same string, is not serious testing.
there was IBM service program, it has done testing upwards, downwards, in certain steps, backwards, random, and twenty other options!
I don't think they just accessed one fixed location.
so all the feature i've listed above must be implemented!
I can do this...but I am not the pro, so it is uncertain, when this is going to happen.
for instance, i do not use the "pro" string length algorithms introduced here in this thread (some of them would make sense for certain applications).
It would be a research project to documentate the REP SCASB (SCAS) performance for all CPUs, over the years, I've read it degraded a little on Pentium, but recently, there might have been new implementations (on AMD CPUs).
I can't do it, I do not have many different computers. someone here might be able to create such a software,
with 100s of options, and donate it to the community!
what i think is that alignment is not so much relevant anymore (tough it can cause some trade-off).
Quote from: BeeOnRope on April 02, 2009, 12:31:54 AM
Even if you don't believe it, it doesn't answer the point about interaction with legacy or proprietary APIs that you cannot change.
what i'm supposed to answer ? laws are what they are (i haven't defined them). all i can say is : life is made of choices, nothing else. and you must assume the results of those choices..., so IF a work doesn't follow
YOUR SPECIFICATIONS, the work is supposed to be refused, IF NOT the work has been made correctly !
IF, later, you want modifications, then ask to the developpers, and pay for... it's the normal PRICE to pay when you don't code your apps yourself...
Hi,
Quotebut maybe out of your head
; - - - String length routine. - - -
; Use SCASB to find a C style string's length,
; 3 April 2009, SRN
StrLenS:
CLD ; Search forward.
MOV EDI,OFFSET Test_Str ; Point destination index to string buffer.
MOV ECX,Limit ; Maximum string length.
MOV AL,0 ; Character to search for.
REPNE SCASB ; Scan for character.
MOV EAX,Limit
SUB EAX,ECX ; Return length in EAX (includes the zero).
RET
Or some such.
Cheers,
Steve N.
Quote from: FORTRANS on April 03, 2009, 03:53:23 PM
Hi,
Quotebut maybe out of your head
; - - - String length routine. - - -
; Use SCASB to find a C style string's length,
; 3 April 2009, SRN
StrLenS:
CLD ; Search forward.
MOV EDI,OFFSET Test_Str ; Point destination index to string buffer.
MOV ECX,Limit ; Maximum string length.
MOV AL,0 ; Character to search for.
REPNE SCASB ; Scan for character.
MOV EAX,Limit
SUB EAX,ECX ; Return length in EAX (includes the zero).
RET
Or some such.
Cheers,
Steve N.
Thanksalot, Steve :bg
Quote from: tetsu-jp on April 02, 2009, 02:43:04 PM
and I've read the new AMD manuals, the REP SCAS is explicitely recommended for small strings!
so, would you include it, and show the result for REP SCASB as well?
tetsu-san,
following your request, I have added Steve's code to the testbed, see attachment and timings below. Now we are of course curious how your code will perform on your AMD, and how you will optimise it.
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
codesizes: strlen32s=132strlen64B=84NWStrLen=118, _strlen=66 bytes
-- test 16k, misaligned 0, 16434 bytes
StrLenS (FORTRANS) 68312 cycles
strlen32s 3019 cycles
strlen64LingoB 3037 cycles
NWStrLen 3061 cycles
_strlen (Agner Fog) 4444 cycles
-- test 4k, misaligned 11, 4096 bytes
StrLenS (FORTRANS) 17029 cycles
strlen32s 768 cycles
strlen64LingoB 770 cycles
NWStrLen 789 cycles
_strlen (Agner Fog) 1142 cycles
-- test 1k, misaligned 15, 1024 bytes
Masm32 lib szLen 1362 cycles
crt strlen 1012 cycles
StrLenS (FORTRANS) 4302 cycles
strlen32s 206 cycles
strlen64LingoB 199 cycles
NWStrLen 215 cycles
_strlen (Agner Fog) 284 cycles
-- test 0, misaligned 0, 100 bytes
Masm32 lib szLen 136 cycles
crt strlen 114 cycles
StrLenS (FORTRANS) 471 cycles
strlen32s 30 cycles
strlen64LingoB 25 cycles
NWStrLen 34 cycles
_strlen (Agner Fog) 37 cycles
-- test 1, misaligned 1, 100 bytes
Masm32 lib szLen 138 cycles
crt strlen 127 cycles
StrLenS (FORTRANS) 473 cycles
strlen32s 28 cycles
strlen64LingoB 27 cycles
NWStrLen 34 cycles
_strlen (Agner Fog) 35 cycles
-- test 5, misaligned 5, 15 bytes
Masm32 lib szLen 26 cycles
crt strlen 29 cycles
StrLenS (FORTRANS) 125 cycles
strlen32s 6 cycles
strlen64LingoB 5 cycles
NWStrLen 17 cycles
_strlen (Agner Fog) 14 cycles
-- test 15, misaligned 15, 15 bytes
Masm32 lib szLen 27 cycles
crt strlen 26 cycles
StrLenS (FORTRANS) 124 cycles
strlen32s 7 cycles
strlen64LingoB 3 cycles
NWStrLen 15 cycles
_strlen (Agner Fog) 14 cycles
[attachment deleted by admin]
Hi jj2007:
The results from here:
Friday, April 03, 2009 3:32 PM
Intel(R) Core(TM)2 Duo CPU E4600 @ 2.40GHz (SSE4)
codesizes: strlen32s=132strlen64B=84NWStrLen=118, _strlen=66 bytes
-- test 16k, misaligned 0, 16434 bytes
strlen32s 1522 cycles
strlen64LingoB 1231 cycles
NWStrLen 1334 cycles
_strlen (Agner Fog) 2844 cycles
-- test 4k, misaligned 11, 4096 bytes
strlen32s 395 cycles
strlen64LingoB 322 cycles
NWStrLen 348 cycles
_strlen (Agner Fog) 735 cycles
-- test 1k, misaligned 15, 1024 bytes
Masm32 lib szLen 1071 cycles
crt strlen 629 cycles
strlen32s 111 cycles
strlen64LingoB 85 cycles
NWStrLen 111 cycles
_strlen (Agner Fog) 182 cycles
-- test 0, misaligned 0, 100 bytes
Masm32 lib szLen 107 cycles
crt strlen 69 cycles
strlen32s 17 cycles
strlen64LingoB 11 cycles
NWStrLen 18 cycles
_strlen (Agner Fog) 21 cycles
-- test 1, misaligned 1, 100 bytes
Masm32 lib szLen 105 cycles
crt strlen 100 cycles
strlen32s 17 cycles
strlen64LingoB 11 cycles
NWStrLen 18 cycles
_strlen (Agner Fog) 21 cycles
-- test 5, misaligned 5, 15 bytes
Masm32 lib szLen 19 cycles
crt strlen 17 cycles
strlen32s 5 cycles
strlen64LingoB 1 cycles
NWStrLen 8 cycles
_strlen (Agner Fog) 7 cycles
-- test 15, misaligned 15, 15 bytes
Masm32 lib szLen 19 cycles
crt strlen 16 cycles
strlen32s 4 cycles
strlen64LingoB 2 cycles
NWStrLen 9 cycles
_strlen (Agner Fog) 7 cycles
-- Hit X Key --
Regards herge
(http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/strlen.png)
how can the exe file be produced? i tried with the include file from previous attachment,
all i get is a blank line, and then command prompt (and the exe file locked).
i have copied ML.EXE from the VC directory, it is 9.0, it is assembling,
and also linking works.
but the program can not work correctly!
what i am doing wrong???
I've just started with MASM32!
any idea why it can not act?
and yes, REP SCAS is slower...
if i can get the source working, I'll try SCASW, SCASD, and SCASQ (should be faster).
I have tried both linkers, the original MASM32, and from the VC directory: 43520 bytes exe file
> how can the exe file be produced?
Did you choose CONSOLE assembly? I use RichMasm, which autodetects console/windows, but in other IDE's you might need to specify that explicitly.
I can assemble the supplied MASM32 examples, both via IDE, and via CLI:
-using the supplied .BAT file
-typing the command directly ~(ARGHH ..... this can work via copying binaries into the work directory.
so all this works, but the .EXE can not perform anything. something is not set up right.
I've removed the .EXE, and it is freshly generated, so assembler and linker work.
EDIT: I get along now! as i've guessed, the options have not been set up correctly, MASM32 just performs a plain call.
well, i had some fun with AZTEC C in a similar manner (and it requires a small file from a commercial SDK, one disk is defective a little, so people wo don't know, well they can try forever).
2 hours or 3 hours (I did other things as well).
by the way, the thumbnail is 70Kbyte, and the fullsize PNG just 16K
(http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/richmasm.png)
so please, include information about how to build this project. not everyone can read your mind.
therre are many other threads.
so i think your code (strlen) will be gently skipped (by me).
the problem was there is no makefile.
i wanted the timing for the SCAS, and that's the point.
someone already added it.
by the way i can understand most of the strlen sources, thanks.
it's really a waste of time to write you a reply but here you go.
1. You are new in assembly ->"I've just started with MASM32!"
(http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/th_DailyComic_Page.jpg) (http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/DailyComic_Page.jpg)
you don't read carefully. i used MASM32 before, and wrote other assembly programs as well.
i just...had a break of 5 years.
i hope..you will not experience "the beans" in your life.
some people...just experience it, you know.
PS: it works now, see screenshot.
so the correct spelling is: I've just started with MASM32 (again) on a new machine...after a break of 5 years (not using assembly language).
tetsu-jp,
while lingo's personality is strong and he can be very direct, he is one of if not the most gifted assembly programmer on this forum/anywhere. Rarely can anyone write faster code than him, which signify's that he has deep underlying system understanding so keeping that in mind, and what he said to you, i'd listen to him. The Gensis project is aimed at helping people quick start with MASM and i'm sure they help with assembly questions ingeneral.
hmm, the laboratory is certainly not the appropriate place, yes.
however, his comment concerning SCACSB isn't totally wrong, it slower yes, but it use a hack to avoid branch misprediction (similar to movcc), so it WAS faster for small string... unfortunately, later, simd instructions have been introduced, and of course the speed difference has changed... yep, things must always been replaced in their context...
so he's an assembler wiz.
I've run a few tests, and notice differences each time the program runs.
large differences, upto 30 percent (no modification).
also i have modified the code for REP SCASD (within cache), and now the difference is only 4x.
it is OK you are the pro's, for years, if not decades? but who can deal with you?
i am willing to do it, and i can understand all of the source code, no worry.
don't understand your trouble.
i do not have general assembly questions, and just to work with the examples supplied,
there would be no need to deal with the forum.
it is just for fun, i do not use assembly for commercial projects.
so i have added SCASW and SCASD, and figured out, 30% difference each time the program is started.
so the numbers are not very relieable- performance can depend on many factors (usually there are more programs running at the same time, occupying the cache and all that).
but yes, you are the small group of pro's, and know, SCAS is ten times slower, 15 times slower.
i guess, SCASQ is just two times slower, in some contexts.
but saves people from artistic code (which also can be good).
(http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/repnescasd.png)
this is the result for REP SCASD, cache=0
Quote from: E^cube on April 03, 2009, 10:34:36 PM
tetsu-jp,
while lingo's personality is strong and he can be very direct, he is one of if not the most gifted assembly programmer on this forum/anywhere. Rarely can anyone write faster code than him, which signify's that he has deep underlying system understanding so keeping that in mind, and what he said to you, i'd listen to him. The Gensis project is aimed at helping people quick start with MASM and i'm sure they help with assembly questions ingeneral.
I'm not gifted...you can clearly see that, i need two hours to get the source working!
but makefile is no shame, you could call a makefile:
providing "deep underlying system understanding" for people who don't have it for some reason
(for instance, they can not read the brains of a small insider group).
there are people who h&te assembler, or refuse to deal with it completely. well i like it, but I understand why.
because the assembler wiz, who simply does not know that his world is just a special case, and not real world.
someone wrote "
the strlen algorithms are not used in commercial applications"?
i think i've read this 3 pages ago.
so the wiz status is just to show off, in reality, the numbers are different.
yes, i like this SSE2 stuff, and will read all your code.
Quote from: tetsu-jp on April 03, 2009, 11:26:44 PM
someone wrote "the strlen algorithms are not used in commercial applications"?
i think i've read this 3 pages ago.
so the wiz status is just to show off, in reality, the numbers are different.
no, i've said it's never used in SERIOUS apps, and it's not exactly what commercial applications are... by this you "should" have understood : nationnal app/database systems for administrations, army, etc... you've just avoided another occasion to keep some credibility... :(
Quote from: tetsu-jp on April 03, 2009, 11:22:04 PM
this is the result for REP SCASD, cache=0
You might have a look at the second line of your screenshot.
yes i know, error message. i have not modified the macro, which is generating the string sequence.
so i think it is just a buggy message.
align 16
db 11 dup (0) ; misaligned by 11
szTest_4k db txt50
REPEAT 80-1
db txt50
ENDM
db txt50, "4096 bytes************************************", 0,0,0,0,0
align 16
szTest_16k db txt50
still should be 4K?!?
**************
I was thinking to extend the software:
-use random strings, random length, random location, get some more parameters for that.
CPU will behave differently than just one&the same string, same length, again and again.
-link with a VB program, and put the results in a database!
then it can be compared using EXCEL.
-provide a web service, to upload results!
then after a while, numerous CPUs can be compared.
-add more algorithms: memory copy, search for specific pattern
-64 bit support
I can do all this, but...as you write, there are members with superior knowledge.
so, why hijack the project, and steal the show?
I mean, i just made requests, and suggestions.
the question i had was "HOW MUCH FASTER compared to SCAS".
and, yes, why not waste a few bytes, and use longword instructions?
NP if you have Gbytes of memory.
so what do you think about the extensions?
for instance, you could generate a list in C++/C#/VB, and pass it to the assembly program.
this would be "real world data", not just a static string.
I never wrote my solution is superior, or i am the better wizard.
just, there are features missing in this software, i just wrote a few of them.
Quote from: tetsu-jp on April 04, 2009, 12:43:48 PM
yes i know, error message. i have not modified the macro, which is generating the string sequence.
so i think it is just a buggy message.
Njet. The message is correct. It's your code that is buggy.
unlikely if you compare the relation to 16K (which is about 1:4)
anyway, i will investigate later on today. the source is not that difficult, it's about the level i can follow without major problems.
and the CPU detect- I think I'll borrow that for my own projects- and give a copyright reference.
no need to re-invent such a code...
so i have some fun...
Uhm...i can copy strings (in C)
StrLenS proc src:DWORD
push esi
mov esi,[esp+8]
xor ecx,ecx
pxor mm0,mm0
xor ebx,ebx
_reloop:
movq mm1,[esi]
pcmpeqd mm1,mm0
inc ecx
movq mm2,[esi+8]
pcmpeqd mm2,mm0
lea esi,[esi+16]
por mm1, mm2
pmovmskb eax, mm1
test eax,eax
jz _reloop
clc
shl ecx,4
mov eax,ecx
pop esi
; push eax
; push edi
; mov edi,[esp+12]
; add edi,eax
; mov ecx,32
; xor eax,eax
; std
; repe scasb
; mov eax,32
; sub eax,ecx
; mov ecx,eax
; pop edi
;
; cld
; pop eax
; sub eax,ecx
ret 4
StrLenS endp
i wrote this (using 64bit MMX).
it's a little faster than Agner Fog's stuff.
but i can not fix the string length stuff correctly!
at least, not today.
so you see, i have examined your codes a little.
I've just downloaded the manuals with 128bit instructions a few days ago.
they must be aligned, or exception will happen.
my idea is to use 64bit, do not care about alignment at all (maybe enfore it in software anyway),
and fix the length via SCAS.
short strings can be copied to aligned space.
long strings- unaligned, and determine their length? i can not think of such a case.
i understand your efforts are to align the data, and also to test byte by byte.
is such code really required? i try to think of a real-world software, which has large unaligned strings.
(http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/th_tetsu-1.gif) (http://i25.photobucket.com/albums/c69/nikemoto2511/web200902/tetsu-1.gif)
now, i have made modifications...
can't get the correct string length. what's wrong with the code?
it works using 64bit MMX, not 128bits.
so it's hard to be the 128bit MMX!
also i think the Genesys is not active at all- and there won't be an explanation what's wrong with the string length.
the code at MyTest is strange (patching bytes). can someone explain? i tried an hour to determine the extra bytes.
[attachment deleted by admin]