I wrote this addition code to add the high half of a four-gigabit integer to its low half fairly quickly. I'm a little new to assembly. Can anyone help me spot speed optimizations?
What's going on:
I have a contiguous four gigabits (512 MB) allocated in RAM. I need to add the upper half (two gigabits) to the lower half, with the carry out of the high end wrapping back around to the least significant bit.
I'm using an eight-core CPU, so each core adds a 256-megabit chunk of the number and carries into the next chunk. The highest chunk carries into the lowest.
Pardon the mysterious numbers, but hard-coding the addition length (0x400000 quadwords for each core) and offset (0x20000000) runs faster than using registers and passing in parameters from C++.
Interleaving the integers (rather than adding integers that are physically two gigabits apart) actually slows the code down, which ends up being convenient for me.
(When I say faster here, I mean I've profiled it.)
rcx contains a pointer to the beginning of the chunk. rdx contains a pointer to the quadword that will be carried to.
(Do I need to lock the initial add and the final carry when multiple cores are running? Locking is much slower, and it would only be a problem if one block is finishing as another starts. They will all be told to start simultaneously.)
Unrolling the loop 32 times seems to hit a sweet spot with my CPU.
Doing two reads then two adds seems to hit a sweet spot, and significantly more speed is gained from the pattern mov rax / mov rbx / adc rbx / adc rax than from alternating mov rax / adc rax / mov rbx / adc rbx. I tested configurations of three and four as well.
I wish there were a way to decrement the counter by 32 without touching the carry flag, short of repeating dec r8 32 times. (Can I use rep here?)
I looked into SIMD and could not find a solution faster than the normal quadword add with carry.
It's currently running at ~92 adds per second being called from a C++ loop. I will be moving things into assembler as I get more pieces in place.
But for me, this means just under nine months of continuous execution tacked on to the wait time for the result I'm working on. I would like to see that amount of time get as small as possible.
Without further ado, here it is:
.code
public HugeAdd
HugeAdd proc
mov r8, 400000h
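; the first 32 quadwords are peeled off so the carry chain can start with ADD instead of ADC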
mov rbx, qword ptr [rcx+r8*8-010h]
mov rax, qword ptr [rcx+r8*8-008h]
add qword ptr [rcx+r8*8-008h+2000000h], rax ; lock?
adc qword ptr [rcx+r8*8-010h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-020h]
mov rax, qword ptr [rcx+r8*8-018h]
adc qword ptr [rcx+r8*8-018h+2000000h], rax
adc qword ptr [rcx+r8*8-020h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-030h]
mov rax, qword ptr [rcx+r8*8-028h]
adc qword ptr [rcx+r8*8-028h+2000000h], rax
adc qword ptr [rcx+r8*8-030h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-040h]
mov rax, qword ptr [rcx+r8*8-038h]
adc qword ptr [rcx+r8*8-038h+2000000h], rax
adc qword ptr [rcx+r8*8-040h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-050h]
mov rax, qword ptr [rcx+r8*8-048h]
adc qword ptr [rcx+r8*8-048h+2000000h], rax
adc qword ptr [rcx+r8*8-050h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-060h]
mov rax, qword ptr [rcx+r8*8-058h]
adc qword ptr [rcx+r8*8-058h+2000000h], rax
adc qword ptr [rcx+r8*8-060h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-070h]
mov rax, qword ptr [rcx+r8*8-068h]
adc qword ptr [rcx+r8*8-068h+2000000h], rax
adc qword ptr [rcx+r8*8-070h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-080h]
mov rax, qword ptr [rcx+r8*8-078h]
adc qword ptr [rcx+r8*8-078h+2000000h], rax
adc qword ptr [rcx+r8*8-080h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-090h]
mov rax, qword ptr [rcx+r8*8-088h]
adc qword ptr [rcx+r8*8-088h+2000000h], rax
adc qword ptr [rcx+r8*8-090h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0A0h]
mov rax, qword ptr [rcx+r8*8-098h]
adc qword ptr [rcx+r8*8-098h+2000000h], rax
adc qword ptr [rcx+r8*8-0A0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0B0h]
mov rax, qword ptr [rcx+r8*8-0A8h]
adc qword ptr [rcx+r8*8-0A8h+2000000h], rax
adc qword ptr [rcx+r8*8-0B0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0C0h]
mov rax, qword ptr [rcx+r8*8-0B8h]
adc qword ptr [rcx+r8*8-0B8h+2000000h], rax
adc qword ptr [rcx+r8*8-0C0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0D0h]
mov rax, qword ptr [rcx+r8*8-0C8h]
adc qword ptr [rcx+r8*8-0C8h+2000000h], rax
adc qword ptr [rcx+r8*8-0D0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0E0h]
mov rax, qword ptr [rcx+r8*8-0D8h]
adc qword ptr [rcx+r8*8-0D8h+2000000h], rax
adc qword ptr [rcx+r8*8-0E0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0F0h]
mov rax, qword ptr [rcx+r8*8-0E8h]
adc qword ptr [rcx+r8*8-0E8h+2000000h], rax
adc qword ptr [rcx+r8*8-0F0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-100h]
mov rax, qword ptr [rcx+r8*8-0F8h]
adc qword ptr [rcx+r8*8-0F8h+2000000h], rax
adc qword ptr [rcx+r8*8-100h+2000000h], rbx
pushf ; save CF - SUB would clobber it
sub r8, 32
AddLoop:
popf ; restore the saved CF for the ADC chain
mov rbx, qword ptr [rcx+r8*8-010h]
mov rax, qword ptr [rcx+r8*8-008h]
adc qword ptr [rcx+r8*8-008h+2000000h], rax
adc qword ptr [rcx+r8*8-010h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-020h]
mov rax, qword ptr [rcx+r8*8-018h]
adc qword ptr [rcx+r8*8-018h+2000000h], rax
adc qword ptr [rcx+r8*8-020h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-030h]
mov rax, qword ptr [rcx+r8*8-028h]
adc qword ptr [rcx+r8*8-028h+2000000h], rax
adc qword ptr [rcx+r8*8-030h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-040h]
mov rax, qword ptr [rcx+r8*8-038h]
adc qword ptr [rcx+r8*8-038h+2000000h], rax
adc qword ptr [rcx+r8*8-040h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-050h]
mov rax, qword ptr [rcx+r8*8-048h]
adc qword ptr [rcx+r8*8-048h+2000000h], rax
adc qword ptr [rcx+r8*8-050h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-060h]
mov rax, qword ptr [rcx+r8*8-058h]
adc qword ptr [rcx+r8*8-058h+2000000h], rax
adc qword ptr [rcx+r8*8-060h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-070h]
mov rax, qword ptr [rcx+r8*8-068h]
adc qword ptr [rcx+r8*8-068h+2000000h], rax
adc qword ptr [rcx+r8*8-070h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-080h]
mov rax, qword ptr [rcx+r8*8-078h]
adc qword ptr [rcx+r8*8-078h+2000000h], rax
adc qword ptr [rcx+r8*8-080h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-090h]
mov rax, qword ptr [rcx+r8*8-088h]
adc qword ptr [rcx+r8*8-088h+2000000h], rax
adc qword ptr [rcx+r8*8-090h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0A0h]
mov rax, qword ptr [rcx+r8*8-098h]
adc qword ptr [rcx+r8*8-098h+2000000h], rax
adc qword ptr [rcx+r8*8-0A0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0B0h]
mov rax, qword ptr [rcx+r8*8-0A8h]
adc qword ptr [rcx+r8*8-0A8h+2000000h], rax
adc qword ptr [rcx+r8*8-0B0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0C0h]
mov rax, qword ptr [rcx+r8*8-0B8h]
adc qword ptr [rcx+r8*8-0B8h+2000000h], rax
adc qword ptr [rcx+r8*8-0C0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0D0h]
mov rax, qword ptr [rcx+r8*8-0C8h]
adc qword ptr [rcx+r8*8-0C8h+2000000h], rax
adc qword ptr [rcx+r8*8-0D0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0E0h]
mov rax, qword ptr [rcx+r8*8-0D8h]
adc qword ptr [rcx+r8*8-0D8h+2000000h], rax
adc qword ptr [rcx+r8*8-0E0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0F0h]
mov rax, qword ptr [rcx+r8*8-0E8h]
adc qword ptr [rcx+r8*8-0E8h+2000000h], rax
adc qword ptr [rcx+r8*8-0F0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-100h]
mov rax, qword ptr [rcx+r8*8-0F8h]
adc qword ptr [rcx+r8*8-0F8h+2000000h], rax
adc qword ptr [rcx+r8*8-100h+2000000h], rbx
pushf
sub r8, 32
jnz AddLoop
popf
adc qword ptr [rdx], 0 ; lock?
ret
HugeAdd endp
end
welcome to the forum :U
first, "qword ptr" is not required when a register is used as one of the operands - that specifies the size
i am sure there are many ways to optimize this code
but the first one that pops out at me (pun) is the use of PUSHF and POPF
instead of saving and restoring flags across the SUB, use LEA - it does not alter the flags
lea r8,[r8-32]
i am not a 64-bit guy, so i don't know about PUSHF and POPF in the 64-bit world
but PUSHF is slow, POPF is VERY slow
for the case of the last one, where you need to test for ZF...
use rotate and shift instructions to store and restore the carry flag
rcl rax,1 ;CF -> RAX bit 0
sub r8,32
jnz AddLoop
then, to restore the carry flag...
shr rax,1 ;RAX bit 0 -> CF
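putting the pieces together, the frame of the loop might look something like this (untested - i'm sketching)...
rcl rax,1 ;CF from the unrolled block -> RAX bit 0
sub r8,32
AddLoop:
shr rax,1 ;RAX bit 0 -> CF
;...the 32 unrolled MOV/ADC pairs...
rcl rax,1 ;CF -> RAX bit 0
sub r8,32 ;CF is parked in RAX, so SUB is free to set ZF
jnz AddLoop
shr rax,1 ;RAX bit 0 -> CF
adc qword ptr [rdx],0
the MOVs don't alter flags, so the carry restored at the top survives to the first ADC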
Assuming you have them available, I'd make use of more registers (this is 64-bit, so there's also r8-r15), which should allow you to reduce stalls between inter-dependent sequences.
The other thing is that pushf/popf is likely to be quite slow, as dave mentioned. 'Copy' it to a register with adc (or sbb) and either copy it back or use it when you need to. (This should be faster than shift/rotate.)
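For example (a sketch, untested - the shape mirrors your existing loop):
sbb rax,rax ; after the prologue: RAX = 0 or -1, a register copy of CF
sub r8,32
AddLoop:
add rax,rax ; restore: -1 + -1 sets CF, 0 + 0 clears it
; ...the 32 unrolled MOV/ADC pairs...
sbb rax,rax ; save CF again (RAX = -CF)
sub r8,32
jnz AddLoop
add rax,rax ; final carry back into CF
adc qword ptr [rdx],0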
hi,
my tip: do not use scaling (r8*8) for addressing; instead, calculate the base addresses of the blocks at procedure entry and then increment them using ADD or LEA:
mov r8,[rsi+0*8]
mov r9,[rsi+1*8]
...
mov r15,[rsi+7*8]
lea rsi,[rsi+8*8]
adc [rdi+0*8],r8
adc [rdi+1*8],r9
...
adc [rdi+7*8],r15
lea rdi,[rdi+8*8]
I'm wondering how you transmit the carry flag between the blocks/threads - AFAICS this is a sequential problem, which can't be sped up by using several threads.
i can see how it may be done using as many threads as there are cores
for example:
you total "block A" on core 1 - store the total and the carry
you total "block B" on core 2 - store the total and the carry
you total "block C" on core 3 - store the total and the carry
you total "block D" on core 4 - store the total and the carry
when all threads have completed, add the stored totals/carries
of course, you have to affinitize each thread to individual cores :P
the block size will be dependent on the number of cores available
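here is the idea in code - the names and layout are invented for illustration, and the carry out of the top block wraps around to the bottom qword, so it needs separate handling...
BLOCK_BYTES equ 2000000h ;one core's chunk
;rsi -> 8-entry table of stored carries (one qword each, 0 or 1)
;rdi -> low qword of block 0 in the destination number
mov ecx,7 ;carries 0-6 get injected into blocks 1-7
InjectCarry:
mov rax,[rsi] ;carry out of the current block
add rsi,8
add rdi,BLOCK_BYTES ;low qword of the next block
add [rdi],rax ;inject it
jnc NoRipple ;cascading past one qword is rare
mov r10,rdi
Ripple:
add r10,8
add qword ptr [r10],1 ;ripple the 1 upward
jc Ripple
NoRipple:
dec ecx
jnz InjectCarry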
...i might add....
the practical number of usable cores for this does not include intel hyper-threaded "cores"
if you are doing this on your machine, alone, it will simplify things, as you know how many physical cores you have
but, if you want to run on different machines, a little more work may be involved
there is no well-documented method of mapping the affinity mask bits to cores
however, from experimentation, i can tell you that hyper-threaded cores have adjacent juxtapositioned mask bits :P
(you like that word ? - i have more - lol)
Quote from: dedndave on February 15, 2012, 04:44:47 PM
when all threads have completed, add the stored totals/carries
Did I miss something? - in this case he needs to make two ADCs for blocks B, C and D.
A sequential solution only requires one ADC for each Block.
i am afraid i don't understand what your line of thinking is, qWord :(
Quote from: dedndave on February 15, 2012, 05:18:45 PM
i am afraid i don't understand what your line of thinking is, qWord :(
ok - I missed what jnm2 is trying to achieve ... I just thought that he simply wanted to add two large integers.
I guess because you can look at the high-order end of the sum and be pretty sure whether it will carry into the next block or not. The only uncertain case is where the summed qword is 0xFFFFFFFFFFFFFFFF, and then you're dependent on an earlier carry to push you over or not.
You tried PADDQ, I suppose?
PADDQ doesn't do carries.
This is an AMD FX-8150. Eight cores, no hyperthreading.
I don't need to do this sequentially. Think of it this way: the largest possible add is 0xFFFFFFFFFFFFFFFF + 0xFFFFFFFFFFFFFFFF = 0xFFFFFFFFFFFFFFFE carry 1. So the previous block can carry into this one, and it will never cause a second carry out of this block.
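Written out, the worst case with an incoming carry:
  0FFFFFFFFFFFFFFFFh
+ 0FFFFFFFFFFFFFFFFh
+                 1  ; carry in from the previous block
= 1FFFFFFFFFFFFFFFFh ; i.e. 0FFFFFFFFFFFFFFFFh with still only a single carry out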
Lots of great tips for me to look at! I was able to get rid of popf and pushf, but I'm not familiar with lea. Until I get through your suggestions, this is my current code.
mov rbx, qword ptr [rcx-010h+2000000h]
mov rax, qword ptr [rcx-008h+2000000h]
add qword ptr [rcx-008h+4000000h], rax
adc qword ptr [rcx-010h+4000000h], rbx
mov rbx, qword ptr [rcx-020h+2000000h]
mov rax, qword ptr [rcx-018h+2000000h]
adc qword ptr [rcx-018h+4000000h], rax
adc qword ptr [rcx-020h+4000000h], rbx
[...]
mov rbx, qword ptr [rcx-100h+2000000h]
mov rax, qword ptr [rcx-0F8h+2000000h]
adc qword ptr [rcx-0F8h+4000000h], rax
adc qword ptr [rcx-100h+4000000h], rbx
mov r9, 1FFFFh
AddLoop:
lea r8, [r9*8]
lea r8, [r8*4] ; r8 = r9 * 32; LEA leaves CF alone, where SHL would break the carry chain
mov rbx, qword ptr [rcx+r8*8-010h]
mov rax, qword ptr [rcx+r8*8-008h]
adc qword ptr [rcx+r8*8-008h+2000000h], rax
adc qword ptr [rcx+r8*8-010h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-020h]
mov rax, qword ptr [rcx+r8*8-018h]
adc qword ptr [rcx+r8*8-018h+2000000h], rax
adc qword ptr [rcx+r8*8-020h+2000000h], rbx
[...]
mov rbx, qword ptr [rcx+r8*8-100h]
mov rax, qword ptr [rcx+r8*8-0F8h]
adc qword ptr [rcx+r8*8-0F8h+2000000h], rax
adc qword ptr [rcx+r8*8-100h+2000000h], rbx
dec r9
jnz AddLoop
adc qword ptr [rdx], 0
ret
Just for interest: what kind of data is it - where does it come from?
qWord
Basically - http://en.wikipedia.org/wiki/Lucas%E2%80%93Lehmer_primality_test
By the way, if the constants above look confusing, it's because I'm confusing myself. Let me sort this out.
Here is the current code with the correct distances:
mov rbx, [rcx-010h+2000000h]
mov rax, [rcx-008h+2000000h]
add [rcx-008h+12000000h], rax
adc [rcx-010h+12000000h], rbx
mov rbx, [rcx-020h+2000000h]
mov rax, [rcx-018h+2000000h]
adc [rcx-018h+12000000h], rax
adc [rcx-020h+12000000h], rbx
[...]
mov rbx, [rcx-100h+2000000h]
mov rax, [rcx-0F8h+2000000h]
adc [rcx-0F8h+12000000h], rax
adc [rcx-100h+12000000h], rbx
lea r8, [rcx+1FFFF00h]
mov r9, 1FFFFh ; loop counter - DEC preserves CF, where comparing r8 to rcx with CMP would clobber it
AddLoop:
mov rbx, [r8-010h]
mov rax, [r8-008h]
adc [r8-008h+10000000h], rax
adc [r8-010h+10000000h], rbx
mov rbx, [r8-020h]
mov rax, [r8-018h]
adc [r8-018h+10000000h], rax
adc [r8-020h+10000000h], rbx
[...]
mov rbx, [r8-100h]
mov rax, [r8-0F8h]
adc [r8-0F8h+10000000h], rax
adc [r8-100h+10000000h], rbx
lea r8, [r8-256]
dec r9
jnz AddLoop
adc qword ptr [rdx], 0
ret
Somewhere along the way I divided 10000000h by eight. Now, with the correct offsets, this takes twice as long.
my suggestion would be:
CHUNK_INFO struct
pChunkA PVOID ? ; in , destination
pChunkB PVOID ? ; in , source
n DWORD ? ; in , number of QWORDs
carry DWORD ? ; in/out , 0 or 1
CHUNK_INFO ends
; thread procedure
BlockAdd proc pChunkInfo:ptr CHUNK_INFO
mov rbx,pChunkInfo
mov ecx,[rbx].CHUNK_INFO.n
mov rdi,[rbx].CHUNK_INFO.pChunkA
mov rsi,[rbx].CHUNK_INFO.pChunkB
shr ecx,3
mov edx,[rbx].CHUNK_INFO.carry
align 16
@@:
mov r8 ,[rsi+0*8]
mov r9 ,[rsi+1*8]
mov r10,[rsi+2*8]
mov r11,[rsi+3*8]
mov r12,[rsi+4*8]
mov r13,[rsi+5*8]
mov r14,[rsi+6*8]
mov r15,[rsi+7*8]
lea rsi,[rsi+8*8]
add r8,rdx ; fold in the incoming carry (0 or 1)
mov rdx,0 ; MOV, unlike XOR, leaves the flags intact
adc [rdi+0*8],r8
adc [rdi+1*8],r9
adc [rdi+2*8],r10
adc [rdi+3*8],r11
adc [rdi+4*8],r12
adc [rdi+5*8],r13
adc [rdi+6*8],r14
adc [rdi+7*8],r15
adc rdx,0 ; capture this group's carry-out for the next pass
lea rdi,[rdi+8*8]
; this loop could be unrolled several times ...
dec ecx
jnz @B
mov [rbx].CHUNK_INFO.carry,edx
mov rax,1
ret
BlockAdd endp
The structure is filled in with the needed addresses before calling CreateThread(). A possible carry is also returned through it.
EDIT: code changed
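For illustration, launching one worker might look roughly like this (a sketch - `info` here is assumed to be a CHUNK_INFO variable you have already filled in, and handle/error bookkeeping is omitted):
sub rsp,38h ; shadow space + two stack args; adjust so RSP is 16-byte aligned at the CALL
xor ecx,ecx ; lpThreadAttributes = NULL
xor edx,edx ; dwStackSize = 0 (default)
lea r8,BlockAdd ; lpStartAddress
lea r9,info ; lpParameter -> the CHUNK_INFO
mov qword ptr [rsp+20h],0 ; dwCreationFlags = 0, start immediately
mov qword ptr [rsp+28h],0 ; lpThreadId = NULL
call CreateThread ; thread handle comes back in RAX
add rsp,38h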
Quote from: jnm2 on February 16, 2012, 01:29:28 AM
PADDQ doesn't do carries.
Valid argument. Now, if your data are predictable, you could branch into SSE code when no carry is expected (test for bit 63?). Usually the speed gain with SSE2 is considerable...