News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Gigabit integer add optimization

Started by jnm2, February 15, 2012, 03:50:31 PM

Previous topic - Next topic

jnm2

I wrote this addition code to add the high half to the low half of a four-gigabit integer fairly quickly.  I'm a little new to assembly.  Can anyone help me spot speed optimizations?

What's going on:
I have a contiguous four gigabits (512 MB) allocated in the RAM.  I need to add the first half (two gigabits) to the lower half with the high end carrying back around to the least significant bit.
I'm using an eight core CPU, so each core is adding 256 megabits of the number and carrying to the next chunk.  The highest chunk carries to the lowest.
Pardon the mysterious numbers, but hard-coding the addition length (0x400000 quadwords for each core) and offset (0x20000000) runs faster than using registers and passing in parameters from C++.
Interleaving the integers (rather than adding integers that are physically two gigabits apart) actually slows the code down, which ends up being convenient for me.
(When I say faster here, I mean I've profiled it.)

rcx contains a pointer to the beginning of the chunk.  rdx contains a pointer to the quadword that will be carried to.
(Do I need to lock the initial add and final carry for multiple cores?  It's much slower and will only be a problem if one block is finishing as another starts.  They will all be told to start simultaneously.)

Unrolling the loop 32 times seems to hit a sweet spot with my CPU.
Doing two reads then two adds seems to hit a sweet spot, and significantly more speed is gained by repeating rax, rbx, adc rbx, adc rax rather than rax, rbx, rax, rbx.  I tested configurations of three and four.
I wish there was a way to decrement the counter by 32 without touching the carry flag or repeating dec r8 32 times.  (Can I use rep here?)
I looked into SIMD and could not find a solution faster than the normal quadword add with carry.

It's currently running at ~92 adds per second being called from a C++ loop.  I will be moving things into assembler as I get more pieces in place.
But for me, this means just under nine months of continuous execution tacked on to the wait time for the result I'm working on.  I would like to see that amount of time get as small as possible.

Without further ado, here it is:
; ---------------------------------------------------------------------------
; HugeAdd - folds the upper half of this core's chunk of a huge integer back
; into the lower half, one qword at a time, rippling the carry through the
; whole chunk and handing the final carry to the next chunk via [rdx].
;
; In (register usage suggests the Microsoft x64 convention - TODO confirm):
;   rcx = pointer to the base of this core's chunk
;   rdx = pointer to the qword that receives this chunk's carry-out
; Clobbers: rax, rbx, r8, flags.
;
; NOTE(review): rbx is callee-saved under the Win64 ABI but is overwritten
; here without being preserved - confirm the C++ caller tolerates that.
; NOTE(review): the accompanying post describes a half-to-half distance of
; 0x10000000 bytes, but the displacement below is 2000000h; a later post
; ("Somewhere I divided 10000000h by eight") indicates this constant is off
; by a factor of 8 - verify against the intended layout.
; NOTE(review): PUSHF/POPF exist only to keep CF alive across SUB; both are
; slow, and a flag-preserving LEA (lea r8,[r8-32]) would avoid them.
; ---------------------------------------------------------------------------
.code
public HugeAdd
HugeAdd proc
mov r8, 400000h                         ; r8 = qwords remaining; [rcx+r8*8] is one past the chunk end
; First 32 qwords: the very first addition is a plain ADD (no incoming
; carry); everything after it is ADC so the carry ripples downward.
; Pattern throughout: load two source qwords, then two read-modify-write
; adds into the destination half at displacement +2000000h.
mov rbx, qword ptr [rcx+r8*8-010h]
mov rax, qword ptr [rcx+r8*8-008h]
add qword ptr [rcx+r8*8-008h+2000000h], rax ; lock?
adc qword ptr [rcx+r8*8-010h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-020h]
mov rax, qword ptr [rcx+r8*8-018h]
adc qword ptr [rcx+r8*8-018h+2000000h], rax
adc qword ptr [rcx+r8*8-020h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-030h]
mov rax, qword ptr [rcx+r8*8-028h]
adc qword ptr [rcx+r8*8-028h+2000000h], rax
adc qword ptr [rcx+r8*8-030h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-040h]
mov rax, qword ptr [rcx+r8*8-038h]
adc qword ptr [rcx+r8*8-038h+2000000h], rax
adc qword ptr [rcx+r8*8-040h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-050h]
mov rax, qword ptr [rcx+r8*8-048h]
adc qword ptr [rcx+r8*8-048h+2000000h], rax
adc qword ptr [rcx+r8*8-050h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-060h]
mov rax, qword ptr [rcx+r8*8-058h]
adc qword ptr [rcx+r8*8-058h+2000000h], rax
adc qword ptr [rcx+r8*8-060h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-070h]
mov rax, qword ptr [rcx+r8*8-068h]
adc qword ptr [rcx+r8*8-068h+2000000h], rax
adc qword ptr [rcx+r8*8-070h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-080h]
mov rax, qword ptr [rcx+r8*8-078h]
adc qword ptr [rcx+r8*8-078h+2000000h], rax
adc qword ptr [rcx+r8*8-080h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-090h]
mov rax, qword ptr [rcx+r8*8-088h]
adc qword ptr [rcx+r8*8-088h+2000000h], rax
adc qword ptr [rcx+r8*8-090h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0A0h]
mov rax, qword ptr [rcx+r8*8-098h]
adc qword ptr [rcx+r8*8-098h+2000000h], rax
adc qword ptr [rcx+r8*8-0A0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0B0h]
mov rax, qword ptr [rcx+r8*8-0A8h]
adc qword ptr [rcx+r8*8-0A8h+2000000h], rax
adc qword ptr [rcx+r8*8-0B0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0C0h]
mov rax, qword ptr [rcx+r8*8-0B8h]
adc qword ptr [rcx+r8*8-0B8h+2000000h], rax
adc qword ptr [rcx+r8*8-0C0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0D0h]
mov rax, qword ptr [rcx+r8*8-0C8h]
adc qword ptr [rcx+r8*8-0C8h+2000000h], rax
adc qword ptr [rcx+r8*8-0D0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0E0h]
mov rax, qword ptr [rcx+r8*8-0D8h]
adc qword ptr [rcx+r8*8-0D8h+2000000h], rax
adc qword ptr [rcx+r8*8-0E0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0F0h]
mov rax, qword ptr [rcx+r8*8-0E8h]
adc qword ptr [rcx+r8*8-0E8h+2000000h], rax
adc qword ptr [rcx+r8*8-0F0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-100h]
mov rax, qword ptr [rcx+r8*8-0F8h]
adc qword ptr [rcx+r8*8-0F8h+2000000h], rax
adc qword ptr [rcx+r8*8-100h+2000000h], rbx
pushf                                   ; preserve CF - SUB below would overwrite it
sub r8, 32                              ; 32 qwords (256 bytes) consumed above
AddLoop:
popf                                    ; restore the carry saved just before the jump
; 16 more load/ADC pairs - identical pattern, next 256 bytes down.
mov rbx, qword ptr [rcx+r8*8-010h]
mov rax, qword ptr [rcx+r8*8-008h]
adc qword ptr [rcx+r8*8-008h+2000000h], rax
adc qword ptr [rcx+r8*8-010h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-020h]
mov rax, qword ptr [rcx+r8*8-018h]
adc qword ptr [rcx+r8*8-018h+2000000h], rax
adc qword ptr [rcx+r8*8-020h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-030h]
mov rax, qword ptr [rcx+r8*8-028h]
adc qword ptr [rcx+r8*8-028h+2000000h], rax
adc qword ptr [rcx+r8*8-030h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-040h]
mov rax, qword ptr [rcx+r8*8-038h]
adc qword ptr [rcx+r8*8-038h+2000000h], rax
adc qword ptr [rcx+r8*8-040h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-050h]
mov rax, qword ptr [rcx+r8*8-048h]
adc qword ptr [rcx+r8*8-048h+2000000h], rax
adc qword ptr [rcx+r8*8-050h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-060h]
mov rax, qword ptr [rcx+r8*8-058h]
adc qword ptr [rcx+r8*8-058h+2000000h], rax
adc qword ptr [rcx+r8*8-060h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-070h]
mov rax, qword ptr [rcx+r8*8-068h]
adc qword ptr [rcx+r8*8-068h+2000000h], rax
adc qword ptr [rcx+r8*8-070h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-080h]
mov rax, qword ptr [rcx+r8*8-078h]
adc qword ptr [rcx+r8*8-078h+2000000h], rax
adc qword ptr [rcx+r8*8-080h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-090h]
mov rax, qword ptr [rcx+r8*8-088h]
adc qword ptr [rcx+r8*8-088h+2000000h], rax
adc qword ptr [rcx+r8*8-090h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0A0h]
mov rax, qword ptr [rcx+r8*8-098h]
adc qword ptr [rcx+r8*8-098h+2000000h], rax
adc qword ptr [rcx+r8*8-0A0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0B0h]
mov rax, qword ptr [rcx+r8*8-0A8h]
adc qword ptr [rcx+r8*8-0A8h+2000000h], rax
adc qword ptr [rcx+r8*8-0B0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0C0h]
mov rax, qword ptr [rcx+r8*8-0B8h]
adc qword ptr [rcx+r8*8-0B8h+2000000h], rax
adc qword ptr [rcx+r8*8-0C0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0D0h]
mov rax, qword ptr [rcx+r8*8-0C8h]
adc qword ptr [rcx+r8*8-0C8h+2000000h], rax
adc qword ptr [rcx+r8*8-0D0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0E0h]
mov rax, qword ptr [rcx+r8*8-0D8h]
adc qword ptr [rcx+r8*8-0D8h+2000000h], rax
adc qword ptr [rcx+r8*8-0E0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0F0h]
mov rax, qword ptr [rcx+r8*8-0E8h]
adc qword ptr [rcx+r8*8-0E8h+2000000h], rax
adc qword ptr [rcx+r8*8-0F0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-100h]
mov rax, qword ptr [rcx+r8*8-0F8h]
adc qword ptr [rcx+r8*8-0F8h+2000000h], rax
adc qword ptr [rcx+r8*8-100h+2000000h], rbx
pushf                                   ; again keep CF alive across the counter update
sub r8, 32                              ; sets ZF when the whole chunk is done
jnz AddLoop
popf                                    ; bring back the final carry out of the last ADC
adc qword ptr [rdx], 0 ; lock?          ; hand the chunk's carry-out to the next chunk's low qword
ret
HugeAdd endp
end

dedndave

welcome to the forum   :U

first, "qword ptr" is not required when a register is used as one of the operands - that specifies the size

i am sure there are many ways to optimize this code
but the first one that pops out at me (pun) is the use of PUSHF and POPF
instead of saving and restoring flags across the SUB, use LEA - it does not alter the flags
        lea     r8,[r8-32]

i am not a 64-bit guy, so i don't know about PUSHF and POPF in 64-bit world
but PUSHF is slow, POPF is VERY slow

for the case of the last one, where you need to test for ZF...
use rotate and shift instructions to store and restore the carry flag
        rcl     rax,1   ;CF -> RAX bit 0
        sub     r8,32
        jnz


then, to restore the carry flag...
        shr     rax,1   ;RAX bit 0 -> CF

Tedd

Assuming you have them available, I'd make use of more registers (this is 64-bit, so there's also r8-r15), which should allow you to reduce stalls between inter-dependent sequences.

The other thing is that pushf/popf is likely to be quite slow, as dave mentioned. 'Copy' it to a register with adc (or sbb) and either copy it back or use it when you need to. (This should be faster than shift/rotate.)
No snowflake in an avalanche feels responsible.

qWord

hi,
my tip: do not use scaling (r8*8) for addressing: instead calculate the base address of the blocks at procedure entry and then increment them using ADD or LEA

mov r8,[rsi+0*8]
mov r9,[rsi+1*8]
...
mov r15,[rsi+7*8]
lea rsi,[rsi+8*8]
adc [rdi+0*8],r8
adc [rdi+1*8],r9
...
adc [rdi+7*8],r15
lea rdi,[rdi+8*8]


I'm wondering how do you transmit the carry-flag between the blocks/threads - AFAIKS this is a sequential problem, which can't be sped up by using several threads.
FPU in a trice: SmplMath
It's that simple!

dedndave

i can see how it may be done using as many threads as there are cores
for example:
you total "block A" on core 1 - store the total and the carry
you total "block B" on core 2 - store the total and the carry
you total "block C" on core 3 - store the total and the carry
you total "block D" on core 4 - store the total and the carry
when all threads have completed, add the stored totals/carries
of course, you have to affinitize each thread to individual cores   :P
the block size will be dependent on the number of cores available

dedndave

...i might add....
the practical number of usable cores for this does not include intel hyper-threaded "cores"
if you are doing this on your machine, alone, it will simplify things, as you know how many physical cores you have
but, if you want to run on different machines, a little more work may be involved

there is no well-documented method of mapping the affinity mask bits to cores
however, from experimentation, i can tell you that hyper-threaded cores have adjacent juxtapositioned mask bits   :P
(you like that word ? - i have more - lol)

qWord

Quote from: dedndave on February 15, 2012, 04:44:47 PMwhen all threads have completed, add the stored totals/carries
Did I miss something? - in this case he needs to make two ADCs for the Blocks B,C and D.
A sequential solution only requires one ADC for each Block.
FPU in a trice: SmplMath
It's that simple!

dedndave

i am afraid i don't understand what your line of thinking is, qWord   :(

qWord

Quote from: dedndave on February 15, 2012, 05:18:45 PM
i am afraid i don't understand what your line of thinking is, qWord   :(
ok - I've missed what jnm2 is trying to achieve ... Just thought, that he simply wants to add two large integers.
FPU in a trice: SmplMath
It's that simple!

clive

I guess because you can look at the high order end of the sum and be pretty sure when it won't carry to the next block, or will. The only uncertain case is where the summed QWORD is 0xFFFFFFFF,FFFFFFFF and then you're dependent on an earlier carry to push you over or not.
It could be a random act of randomness. Those happen a lot as well.

jj2007


jnm2

PADDQ doesn't do carries.

This is an AMD FX-8150.  Eight cores, no hyperthreading.

I don't need to do this sequentially.  Think of it this way.  The largest possible add is 0xFFFFFFFFFFFFFFFF + 0xFFFFFFFFFFFFFFFF = 0xFFFFFFFFFFFFFFFE carry 1.  So the previous block will be able to carry here and it will never cause a second carry in this block.

Lots of great tips for me to look at!  I was able to get rid of popf and pushf, but I'm not familiar with lea.  Until I get through your suggestions, this is my current code. 

; Revision 2 (PUSHF/POPF removed): the top 256 bytes are handled straight-line
; with ADD starting the chain, then r9 counts the remaining 256-byte groups
; down from 1FFFFh and r8 = r9*32 rebuilds the qword index each pass.
mov rbx, qword ptr [rcx-010h+2000000h]
mov rax, qword ptr [rcx-008h+2000000h]
add qword ptr [rcx-008h+4000000h], rax
adc qword ptr [rcx-010h+4000000h], rbx
mov rbx, qword ptr [rcx-020h+2000000h]
mov rax, qword ptr [rcx-018h+2000000h]
adc qword ptr [rcx-018h+4000000h], rax
adc qword ptr [rcx-020h+4000000h], rbx
[...]
mov rbx, qword ptr [rcx-100h+2000000h]
mov rax, qword ptr [rcx-0F8h+2000000h]
adc qword ptr [rcx-0F8h+4000000h], rax
adc qword ptr [rcx-100h+4000000h], rbx
mov r9, 1FFFFh                          ; remaining 32-qword groups (MOV leaves flags intact)
AddLoop:
mov r8, r9                              ; MOV preserves CF - fine
shl r8, 5                               ; BUG(review): SHL rewrites CF (last bit shifted out),
                                        ; so the carry from the previous ADC group is destroyed
                                        ; at the top of EVERY pass, including the first; a
                                        ; flag-preserving LEA sequence would keep CF alive
mov rbx, qword ptr [rcx+r8*8-010h]
mov rax, qword ptr [rcx+r8*8-008h]
adc qword ptr [rcx+r8*8-008h+2000000h], rax
adc qword ptr [rcx+r8*8-010h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-020h]
mov rax, qword ptr [rcx+r8*8-018h]
adc qword ptr [rcx+r8*8-018h+2000000h], rax
adc qword ptr [rcx+r8*8-020h+2000000h], rbx
[...]
mov rbx, qword ptr [rcx+r8*8-100h]
mov rax, qword ptr [rcx+r8*8-0F8h]
adc qword ptr [rcx+r8*8-0F8h+2000000h], rax
adc qword ptr [rcx+r8*8-100h+2000000h], rbx
dec r9                                  ; DEC does not touch CF - only ZF for the branch
jnz AddLoop
adc qword ptr [rdx], 0                  ; final wrap-around carry to the next chunk
ret

qWord

Just for interest: what kind of data is it - where does it come from?

qWord
FPU in a trice: SmplMath
It's that simple!

jnm2

Basically - http://en.wikipedia.org/wiki/Lucas%E2%80%93Lehmer_primality_test

By the way, if the constants above look confusing, it's because I'm confusing myself.  Let me sort this out.

jnm2

Here is the current code with the correct distances:

; Revision 3 (corrected half-to-half distance 10000000h): r8 now walks the
; source pointer down by 256 bytes per pass via flag-preserving LEA.
mov rbx, [rcx-010h+2000000h]
mov rax, [rcx-008h+2000000h]
add [rcx-008h+12000000h], rax
adc [rcx-010h+12000000h], rbx
mov rbx, [rcx-020h+2000000h]
mov rax, [rcx-018h+2000000h]
adc [rcx-018h+12000000h], rax
adc [rcx-020h+12000000h], rbx
[...]
mov rbx, [rcx-100h+2000000h]
mov rax, [rcx-0F8h+2000000h]
adc [rcx-0F8h+12000000h], rax
adc [rcx-100h+12000000h], rbx
lea r8, [rcx+1FFFF00h]                  ; 100h below the top of the 2000000h-byte half
                                        ; (top 256 bytes already done above); LEA keeps CF
AddLoop:
mov rbx, [r8-010h]
mov rax, [r8-008h]
adc [r8-008h+10000000h], rax
adc [r8-010h+10000000h], rbx
mov rbx, [r8-020h]
mov rax, [r8-018h]
adc [r8-018h+10000000h], rax
adc [r8-020h+10000000h], rbx
[...]
mov rbx, [r8-100h]
mov rax, [r8-0F8h]
adc [r8-0F8h+10000000h], rax
adc [r8-100h+10000000h], rbx
lea r8, [r8-256]                        ; flag-safe decrement - CF survives this
cmp r8, rcx                             ; BUG(review): CMP rewrites CF (it is a SUB that
                                        ; discards its result), so the carry chain is broken
                                        ; at the bottom of every pass
jne AddLoop
adc qword ptr [rdx], 0                  ; BUG(review): CF here comes from the CMP above
                                        ; (always 0 when r8==rcx), not from the last ADC -
                                        ; the final carry is never propagated
ret


Somewhere I divided 10000000h by eight.  Now with the correct offsets, this takes twice as long.