News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Bit-array access using the bit test instructions

Started by MichaelW, June 04, 2011, 09:08:23 AM

Previous topic - Next topic

MichaelW

I recently tested, in another language, bit-array access using the bit test instructions and found that these instructions are surprisingly fast, considering what they can do. This code is a MASM port of the essential parts of the test.

;==============================================================================
    include \masm32\include\masm32rt.inc
    .686
    include \masm32\macros\timers.asm
;==============================================================================
    .data
    .code
;==============================================================================
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
;==============================================================================

align 16
dummy proc parray:DWORD, index:DWORD
    mov edx, [esp+4]
    mov eax, [esp+8]
    ret 8
dummy endp

;==============================================================================

align 16
bit_set proc parray:DWORD, index:DWORD
    mov edx, [esp+4]
    mov eax, [esp+8]
    bts DWORD PTR [edx], eax
    ret 8
bit_set endp

;==============================================================================

align 16
bit_reset proc parray:DWORD, index:DWORD
    mov edx, [esp+4]
    mov eax, [esp+8]
    btr DWORD PTR [edx], eax
    ret 8
bit_reset endp

;==============================================================================

align 16
bit_get proc parray:DWORD, index:DWORD
    mov edx, [esp+4]
    mov ecx, [esp+8]
    xor eax, eax
    bt  DWORD PTR [edx], ecx
    adc eax, 0
    ret 8
bit_get endp

;==============================================================================
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
;==============================================================================
start:
;==============================================================================
    ARRAY_SIZE equ 20000000

    mov esi, alloc(ARRAY_SIZE)

    xor edi, edi                  ; byte index
    xor ebx, ebx                  ; bit index

    .WHILE edi < ARRAY_SIZE

        FOR idx,<0,2,4,6>
            mov eax, ebx
            add eax, idx
            invoke bit_set, esi, eax
        ENDM
        movzx eax, BYTE PTR [esi+edi]
        .IF eax != 01010101b
            print "error"
        .ENDIF
        FOR idx,<0,2,4,6>
            mov eax, ebx
            add eax, idx
            invoke bit_get, esi, eax
            .IF eax != 1
                print "error"
            .ENDIF
        ENDM
        FOR idx,<0,2,4,6>
            mov eax, ebx
            add eax, idx
            invoke bit_reset, esi, eax
        ENDM
        movzx eax, BYTE PTR [esi+edi]
        .IF eax != 0
            print "error"
        .ENDIF
        FOR idx,<0,2,4,6>
            mov eax, ebx
            add eax, idx
            invoke bit_get, esi, eax
            .IF eax != 0
                print "error"
            .ENDIF
        ENDM

        FOR idx,<1,3,5,7>
            mov eax, ebx
            add eax, idx
            invoke bit_set, esi, eax
        ENDM
        movzx eax, BYTE PTR [esi+edi]
        .IF eax != 10101010b
            print "error"
        .ENDIF
        FOR idx,<1,3,5,7>
            mov eax, ebx
            add eax, idx
            invoke bit_get, esi, eax
            .IF eax != 1
                print "error"
            .ENDIF
        ENDM
        FOR idx,<1,3,5,7>
            mov eax, ebx
            add eax, idx
            invoke bit_reset, esi, eax
        ENDM
        movzx eax, BYTE PTR [esi+edi]
        .IF eax != 0
            print "error"
        .ENDIF
        FOR idx,<1,3,5,7>
            mov eax, ebx
            add eax, idx
            invoke bit_get, esi, eax
            .IF eax != 0
                print "error"
            .ENDIF
        ENDM

        inc edi
        add ebx, 8

    .ENDW

    invoke Sleep, 3000

    I=0

    REPEAT 3

        counter_begin 1000, HIGH_PRIORITY_CLASS
            invoke dummy, esi, I
        counter_end
        print str$(eax)," cycles, dummy",13,10

        counter_begin 1000, HIGH_PRIORITY_CLASS
            invoke bit_set, esi, I
        counter_end
        print str$(eax)," cycles, bit_set",13,10

        counter_begin 1000, HIGH_PRIORITY_CLASS
            invoke bit_reset, esi, I
        counter_end
        print str$(eax)," cycles, bit_reset",13,10

        counter_begin 1000, HIGH_PRIORITY_CLASS
            invoke bit_get, esi, I
        counter_end
        print str$(eax)," cycles, bit_get",13,10

        print chr$(13,10)

        I=I+1

    ENDM

    I=ARRAY_SIZE

    REPEAT 3

        counter_begin 1000, HIGH_PRIORITY_CLASS
            invoke dummy, esi, I
        counter_end
        print str$(eax)," cycles, dummy",13,10

        counter_begin 1000, HIGH_PRIORITY_CLASS
            invoke bit_set, esi, I
        counter_end
        print str$(eax)," cycles, bit_set",13,10

        counter_begin 1000, HIGH_PRIORITY_CLASS
            invoke bit_reset, esi, I
        counter_end
        print str$(eax)," cycles, bit_reset",13,10

        counter_begin 1000, HIGH_PRIORITY_CLASS
            invoke bit_get, esi, I
        counter_end
        print str$(eax)," cycles, bit_get",13,10

        print chr$(13,10)

        I=I+12345

    ENDM

    free esi

    inkey "Press any key to exit..."
    exit
;==============================================================================
end start

Running on a P3:

3 cycles, dummy
8 cycles, bit_set
8 cycles, bit_reset
8 cycles, bit_get

3 cycles, dummy
8 cycles, bit_set
8 cycles, bit_reset
8 cycles, bit_get

3 cycles, dummy
8 cycles, bit_set
8 cycles, bit_reset
8 cycles, bit_get

3 cycles, dummy
8 cycles, bit_set
8 cycles, bit_reset
8 cycles, bit_get

3 cycles, dummy
7 cycles, bit_set
7 cycles, bit_reset
8 cycles, bit_get

3 cycles, dummy
7 cycles, bit_set
7 cycles, bit_reset
8 cycles, bit_get


On my P3 a setc al version of the bit_get procedure was no faster.
eschew obfuscation

jj2007

Celeron M:
2 cycles, dummy
7 cycles, bit_set
7 cycles, bit_reset
6 cycles, bit_get

2 cycles, dummy
7 cycles, bit_set
6 cycles, bit_reset
5 cycles, bit_get

2 cycles, dummy
6 cycles, bit_set
7 cycles, bit_reset
5 cycles, bit_get


I wonder, though, if test is not a better choice in many cases... see attachment.
Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (SSE3)
297     cycles for bt
147     cycles for test


MichaelW

QuoteI wonder, though, if test is not a better choice in many cases

Sorry, I made a mistake in my function-test code that obscured what the code is supposed to do, and prevented it from doing with it is supposed to do, but didn't make it misbehave in any obvious way. The error is now corrected.

In the function-test code, for the bit test instructions the first operand is the base of the allocated memory and the second is a small offset from the bit index in EBX, which varies over the range 0 to 19,999,999*8. And the byte index in EDI, which varies over the range 0 to 19,999,999, is used to verify that the bit test instruction acted on the intended byte. This is what I meant by "considering what they can do".
eschew obfuscation

Rockoon

Optimizing this sort of stuff is pointless and here is why:

If the dataset is large then
..If you are doing random access, then cache misses will completely dominate your runtime.
..If you are doing sequential access, then there are much better ways.
else
..a byte array is faster
When C++ compilers can be coerced to emit rcl and rcr, I *might* consider using one.

fearless

Id often thought about doing something like that for memory bitmaps of database sets. For filters or other conditions, even sql like queries. I think Visual Foxpro works this way with simple queries so that two or more bitmap arrays can be ANDed or ORed to produce a final working resultset/dataset to work with.
ƒearless

MichaelW

Quote from: Rockoon on June 04, 2011, 01:28:42 PM
Optimizing this sort of stuff is pointless...

I found something that I thought was interesting so I coded an app and posted it. I made no claims regarding the speed of the bit test instructions relative to other instructions, just that they are IMO surprisingly fast, considering what they can do. This may be pointless for you, but it's not for me.

Quote
If the dataset is large then
..If you are doing random access, then cache misses will completely dominate your runtime.

Yes.

Quote
..If you are doing sequential access, then there are much better ways.

Yes, depending on how you define "much better".

Quote
else
..a byte array is faster

Show me. If the byte array is storing one bit per byte then sequential access to the bits will be fast, but for random access I expect a more compact bit array, accessed with the bit test instructions, to be faster.

The following is an attempt to compare access times for a one-bit-per-byte byte array accessed conventionally, and a bit array accessed with the bit test instructions, for sequential access and random access.

;==============================================================================
    include \masm32\include\masm32rt.inc
;==============================================================================

printf MACRO format:REQ, args:VARARG
    IFNB <args>
        invoke crt_printf, cfm$(format), args
    ELSE
        invoke crt_printf, cfm$(format)
    ENDIF
    EXITM <>
ENDM

;==============================================================================

    ; -------------------------------------------------------
    ; This is an assembly-time random number generator based
    ; on code by George Marsaglia:
    ;   #define znew  ((z=36969*(z&65535)+(z>>16))<<16)
    ;   #define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
    ;   #define MWC   (znew+wnew)
    ; -------------------------------------------------------

    @znew_seed@ = 362436069
    @wnew_seed@ = 521288629

    @rnd MACRO base:REQ
      LOCAL znew, wnew

      @znew_seed@ = 36969 * (@znew_seed@ AND 65535) + (@znew_seed@ SHR 16)
      znew = @znew_seed@ SHL 16

      @wnew_seed@ = 18000 * (@wnew_seed@ AND 65535) + (@wnew_seed@ SHR 16)
      wnew = @wnew_seed@ AND 65535

      EXITM <(znew + wnew) MOD base>
    ENDM

;==============================================================================
    .data
    .code
;==============================================================================
start:
;==============================================================================

    ARRAY_SIZE    equ 100000000   ; size in bits
    REPEAT_COUNT  equ 500

    invoke GetCurrentProcess
    invoke SetPriorityClass, eax, HIGH_PRIORITY_CLASS

    REPEAT 3

        mov esi, alloc(ARRAY_SIZE/8)
        invoke Sleep, 3000

        invoke GetTickCount
        push eax
        xor ebx, ebx
        .WHILE ebx < ARRAY_SIZE
            REPEAT REPEAT_COUNT
                bts DWORD PTR [esi], ebx
                inc ebx
            ENDM
        .ENDW
        invoke GetTickCount
        pop edx
        sub eax, edx
        printf("bit set   seq  %d\n", eax)

        invoke GetTickCount
        push eax
        xor ebx, ebx
        .WHILE ebx < ARRAY_SIZE
            REPEAT REPEAT_COUNT
                xor eax, eax
                bt  DWORD PTR [esi], ebx
                adc eax, 0
                inc ebx
            ENDM
        .ENDW
        invoke GetTickCount
        pop edx
        sub eax, edx
        printf("bit get   seq  %d\n", eax)
        free esi

        mov esi, alloc(ARRAY_SIZE)
        invoke Sleep, 3000

        invoke GetTickCount
        push eax
        xor ebx, ebx
        .WHILE ebx < ARRAY_SIZE
            REPEAT REPEAT_COUNT
                mov BYTE PTR [esi+ebx], 1
                inc ebx
            ENDM
        .ENDW
        invoke GetTickCount
        pop edx
        sub eax, edx
        printf("byte set  seq  %d\n", eax)

        invoke GetTickCount
        push eax
        xor ebx, ebx
        .WHILE ebx < ARRAY_SIZE
            REPEAT REPEAT_COUNT
                movzx eax, BYTE PTR [esi+ebx]
                inc ebx
            ENDM
        .ENDW
        invoke GetTickCount
        pop edx
        sub eax, edx
        printf("byte get  seq  %d\n\n", eax)
        free esi

    ENDM
    printf("\n")

;==============================================================================

    REPEAT 3

        mov esi, alloc(ARRAY_SIZE/8)
        invoke Sleep, 3000

        invoke GetTickCount
        push eax
        xor ebx, ebx
        .WHILE ebx < ARRAY_SIZE
            REPEAT REPEAT_COUNT
                mov ecx, @rnd(ARRAY_SIZE)
                bts DWORD PTR [esi], ecx
                inc ebx
            ENDM
        .ENDW
        invoke GetTickCount
        pop edx
        sub eax, edx
        printf("bit set   rnd  %d\n", eax)

        invoke GetTickCount
        push eax
        xor ebx, ebx
        .WHILE ebx < ARRAY_SIZE
            REPEAT REPEAT_COUNT
                xor eax, eax
                mov ecx, @rnd(ARRAY_SIZE)
                bt  DWORD PTR [esi], ecx
                adc eax, 0
                inc ebx
            ENDM
        .ENDW
        invoke GetTickCount
        pop edx
        sub eax, edx
        printf("bit get   rnd  %d\n", eax)
        free esi

        mov esi, alloc(ARRAY_SIZE)
        invoke Sleep, 3000

        invoke GetTickCount
        push eax
        xor ebx, ebx
        .WHILE ebx < ARRAY_SIZE
            REPEAT REPEAT_COUNT
                mov ecx, @rnd(ARRAY_SIZE)
                mov BYTE PTR [esi+ecx], 1
                inc ebx
            ENDM
        .ENDW
        invoke GetTickCount
        pop edx
        sub eax, edx
        printf("byte set  rnd  %d\n", eax)

        invoke GetTickCount
        push eax
        xor ebx, ebx
        .WHILE ebx < ARRAY_SIZE
            REPEAT REPEAT_COUNT
                mov ecx, @rnd(ARRAY_SIZE)
                movzx eax, BYTE PTR [esi+ecx]
                inc ebx
            ENDM
        .ENDW
        invoke GetTickCount
        pop edx
        sub eax, edx
        printf("byte get  rnd  %d\n\n", eax)
        free esi

    ENDM
    printf("\n")

;==============================================================================

    REPEAT 3

        mov esi, alloc(ARRAY_SIZE/8)
        invoke Sleep, 3000

        invoke GetTickCount
        push eax
        xor ebx, ebx
        .WHILE ebx < ARRAY_SIZE
            REPEAT REPEAT_COUNT
                mov ecx, @rnd(ARRAY_SIZE)
                bts DWORD PTR [esi], ecx
                inc ebx
            ENDM
        .ENDW
        invoke GetTickCount
        pop edx
        sub eax, edx
        printf("bit set   rnd  %d\n", eax)

        invoke GetTickCount
        push eax
        xor ebx, ebx
        .WHILE ebx < ARRAY_SIZE
            REPEAT REPEAT_COUNT
                xor eax, eax
                mov ecx, @rnd(ARRAY_SIZE)
                bt  DWORD PTR [esi], ecx
                adc eax, 0
                inc ebx
            ENDM
        .ENDW
        invoke GetTickCount
        pop edx
        sub eax, edx
        printf("bit get   rnd  %d\n", eax)
        free esi

        mov esi, alloc(ARRAY_SIZE)
        invoke Sleep, 3000

        invoke GetTickCount
        push eax
        xor ebx, ebx
        .WHILE ebx < ARRAY_SIZE
            REPEAT REPEAT_COUNT
                mov BYTE PTR [esi+@rnd(ARRAY_SIZE)], 1
                inc ebx
            ENDM
        .ENDW
        invoke GetTickCount
        pop edx
        sub eax, edx
        printf("byte set  rnd  %d\n", eax)

        invoke GetTickCount
        push eax
        xor ebx, ebx
        .WHILE ebx < ARRAY_SIZE
            REPEAT REPEAT_COUNT
                movzx eax, BYTE PTR [esi+@rnd(ARRAY_SIZE)]
                inc ebx
            ENDM
        .ENDW
        invoke GetTickCount
        pop edx
        sub eax, edx
        printf("byte get  rnd  %d\n\n", eax)
        free esi

    ENDM

    inkey "Press any key to exit..."
    exit
;==============================================================================
end start


bit set   seq  1182
bit get   seq  1302
byte set  seq  591
byte get  seq  721

bit set   seq  1182
bit get   seq  1302
byte set  seq  591
byte get  seq  711

bit set   seq  1181
bit get   seq  1302
byte set  seq  591
byte get  seq  711


bit set   rnd  7591
bit get   rnd  3855
byte set  rnd  9965
byte get  rnd  5888

bit set   rnd  7400
bit get   rnd  3916
byte set  rnd  9754
byte get  rnd  5799

bit set   rnd  7641
bit get   rnd  3685
byte set  rnd  9734
byte get  rnd  5819


bit set   rnd  7391
bit get   rnd  3825
byte set  rnd  10264
byte get  rnd  5879

bit set   rnd  7481
bit get   rnd  3785
byte set  rnd  9814
byte get  rnd  5808

bit set   rnd  7651
bit get   rnd  3866
byte set  rnd  9895
byte get  rnd  5788

eschew obfuscation

FORTRANS

Hi,

   FWIW.

Cheers,


A:\WORK>BITTEST                 G:\WORK>bittest               bit set   seq  531
bit set   seq  11854            bit set   seq  781            bit get   seq  360
bit get   seq  34976            bit get   seq  831            byte set  seq  310
byte set  seq  40769            byte set  seq  611            byte get  seq  80
byte get  seq  355036           byte get  seq  521
                                                              bit set   seq  480
bit set   seq  10205            bit set   seq  781            bit get   seq  361
bit get   seq  16749            bit get   seq  842            byte set  seq  250
byte set  seq  40984            byte set  seq  601            byte get  seq  141
byte get  seq  355629           byte get  seq  521
                                                              bit set   seq  561
bit set   seq  11124            bit set   seq  781            bit get   seq  360
bit get   seq  34172            bit get   seq  831            byte set  seq  260
byte set  seq  41288            byte set  seq  600            byte get  seq  80
byte get  seq  390531           byte get  seq  521

                                                               bit set   rnd  992
bit set   rnd  54929            bit set   rnd  2052            bit get   rnd  611
bit get   rnd  43308            bit get   rnd  1753            byte set  rnd  1753
byte set  rnd  59710            byte set  rnd  2995            byte get  rnd  1031
byte get  rnd  52234            byte get  rnd  2554
                                                               bit set   rnd  992
bit set   rnd  62500            bit set   rnd  2023            bit get   rnd  611
bit get   rnd  41249            bit get   rnd  1783            byte set  rnd  1753
byte set  rnd  57929            byte set  rnd  2954            byte get  rnd  1041
byte get  rnd  50904            byte get  rnd  2534
                                                               bit set   rnd  982
bit set   rnd  48824            bit set   rnd  2043            bit get   rnd  591
bit get   rnd  42260            bit get   rnd  1693            byte set  rnd  1773
byte set  rnd  60984            byte set  rnd  2994            byte get  rnd  1061
byte get  rnd  51626            byte get  rnd  2504

                                                               bit set   rnd  982
bit set   rnd  49159            bit set   rnd  2003            bit get   rnd  611
bit get   rnd  41565            bit get   rnd  1743            byte set  rnd  1742
byte set  rnd  59019            byte set  rnd  3055            byte get  rnd  1052
byte get  rnd  51135            byte get  rnd  2493
                                                               bit set   rnd  992
bit set   rnd  49659            bit set   rnd  1983            bit get   rnd  591
bit get   rnd  42075            bit get   rnd  1722            byte set  rnd  1662
byte set  rnd  60724            byte set  rnd  2985            byte get  rnd  1062
byte get  rnd  46155            byte get  rnd  2463
                                                               bit set   rnd  982
bit set   rnd  49645            bit set   rnd  2013            bit get   rnd  600
bit get   rnd  42179            bit get   rnd  1753            byte set  rnd  1712
byte set  rnd  64354            byte set  rnd  2984            byte get  rnd  1052
byte get  rnd  49721            byte get  rnd  2444
                                                               Press any key to exit...
Press any key to exit...        Press any key to exit...

Rockoon

Quote from: MichaelW on June 05, 2011, 07:47:03 AM
I found something that I thought was interesting so I coded an app and posted it. I made no claims regarding the speed of the bit test instructions relative to other instructions, just that they are IMO surprisingly fast, considering what they can do. This may be pointless for you, but it's not for me.

Why take things so personal? Seriously... It wasnt an attack on you.

Quote from: MichaelW on June 05, 2011, 07:47:03 AM
Show me. If the byte array is storing one bit per byte then sequential access to the bits will be fast, but for random access I expect a more compact bit array, accessed with the bit test instructions, to be faster.

You can show yourself. Just reduce that array size to something that follows the logic I laid out (else -> its a small data set)

The biggest killer for BTR/BTS/BTC is that they generate 10 uops on the latest intels and 8 uops on the latest AMD's for the BTx [mem], reg form
When C++ compilers can be coerced to emit rcl and rcr, I *might* consider using one.

MichaelW

Quote from: Rockoon on June 06, 2011, 07:32:34 PM
Why take things so personal? Seriously... It wasnt an attack on you.

How am I supposed to take it otherwise when you insult my intelligence with your "pointless and here is why" statement?
eschew obfuscation

Rockoon

Quote from: MichaelW on June 08, 2011, 04:51:31 AM
Quote from: Rockoon on June 06, 2011, 07:32:34 PM
Why take things so personal? Seriously... It wasnt an attack on you.

How am I supposed to take it otherwise when you insult my intelligence with your "pointless and here is why" statement?


I see. So any time someone talks about anything that you might be doing then its personal, and if its at all contrary to what you are doing then its insulting your intelligence.

What are you... 12 years old?
When C++ compilers can be coerced to emit rcl and rcr, I *might* consider using one.

jj2007

Rockoon,

Michael is one of the most polite gentlemen here on the board, and your words were clearly insulting. Being a good programmer, and having a good knowledge on Windows internals and processors, is no excuse for bad manners. Go have a beer with lingo.

Rockoon

Quote from: jj2007 on June 08, 2011, 03:52:48 PM
Rockoon,

Michael is one of the most polite gentlemen here on the board, and your words were clearly insulting.

I understand that you want to defend your friend, but dont be dishonest with yourself (and others) in the process.

Lets repeat this "insulting" statement..

"Optimizing this stuff is pointless and here is why"

No, thats not insulting. Yes, you are being dishonest.
When C++ compilers can be coerced to emit rcl and rcr, I *might* consider using one.

brethren

your whole attitude is insulting, with your pointless this and 12 years old that. the fact that you can't see it leads me to believe i've just met yet another programmer without social skills :U

oex

We are all of us insane, just to varying degrees and intelligently balanced through networking

http://www.hereford.tv