I wrote this addition code to add the high half of a four-gigabit integer to its low half fairly quickly. I'm a little new to assembly. Can anyone help me spot speed optimizations?
What's going on:
I have a contiguous four gigabits (512 MB) allocated in RAM. I need to add the upper half (two gigabits) to the lower half, with the carry out of the high end wrapping back around to the least significant bit.
I'm using an eight-core CPU, so each core adds a 256-megabit chunk of the number and carries into the next chunk. The highest chunk carries into the lowest.
Pardon the mysterious numbers, but hard-coding the addition length (0x400000 quadwords for each core) and offset (0x20000000) runs faster than using registers and passing in parameters from C++.
Interleaving the integers (rather than adding integers that are physically two gigabits apart) actually slows the code down, which ends up being convenient for me.
(When I say faster here, I mean I've profiled it.)
rcx contains a pointer to the beginning of the chunk. rdx contains a pointer to the quadword that will be carried to.
(Do I need to lock the initial add and the final carry when multiple cores are running? Locking is much slower, and it would only be a problem if one block is finishing as another starts. They will all be told to start simultaneously.)
Unrolling the loop 32 times seems to hit a sweet spot with my CPU.
Doing two reads then two adds seems to hit a sweet spot, and significantly more speed is gained from the pattern mov rax / mov rbx / adc rbx / adc rax than from alternating mov rax / adc rax / mov rbx / adc rbx. I tested configurations of three and four as well.
I wish there were a way to decrement the counter by 32 without touching the carry flag, short of repeating dec r8 32 times. (Can I use rep here?)
I looked into SIMD and could not find a solution faster than the normal quadword add with carry.
It's currently running at ~92 adds per second being called from a C++ loop. I will be moving things into assembler as I get more pieces in place.
But for me, this means just under nine months of continuous execution tacked on to the wait time for the result I'm working on. I would like to see that amount of time get as small as possible.
Without further ado, here it is:
.code
public HugeAdd
HugeAdd proc
mov r8, 400000h
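; the first 32 quadwords are peeled off so the carry chain can start with ADD instead of ADC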
mov rbx, qword ptr [rcx+r8*8-010h]
mov rax, qword ptr [rcx+r8*8-008h]
add qword ptr [rcx+r8*8-008h+2000000h], rax ; lock?
adc qword ptr [rcx+r8*8-010h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-020h]
mov rax, qword ptr [rcx+r8*8-018h]
adc qword ptr [rcx+r8*8-018h+2000000h], rax
adc qword ptr [rcx+r8*8-020h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-030h]
mov rax, qword ptr [rcx+r8*8-028h]
adc qword ptr [rcx+r8*8-028h+2000000h], rax
adc qword ptr [rcx+r8*8-030h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-040h]
mov rax, qword ptr [rcx+r8*8-038h]
adc qword ptr [rcx+r8*8-038h+2000000h], rax
adc qword ptr [rcx+r8*8-040h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-050h]
mov rax, qword ptr [rcx+r8*8-048h]
adc qword ptr [rcx+r8*8-048h+2000000h], rax
adc qword ptr [rcx+r8*8-050h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-060h]
mov rax, qword ptr [rcx+r8*8-058h]
adc qword ptr [rcx+r8*8-058h+2000000h], rax
adc qword ptr [rcx+r8*8-060h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-070h]
mov rax, qword ptr [rcx+r8*8-068h]
adc qword ptr [rcx+r8*8-068h+2000000h], rax
adc qword ptr [rcx+r8*8-070h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-080h]
mov rax, qword ptr [rcx+r8*8-078h]
adc qword ptr [rcx+r8*8-078h+2000000h], rax
adc qword ptr [rcx+r8*8-080h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-090h]
mov rax, qword ptr [rcx+r8*8-088h]
adc qword ptr [rcx+r8*8-088h+2000000h], rax
adc qword ptr [rcx+r8*8-090h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0A0h]
mov rax, qword ptr [rcx+r8*8-098h]
adc qword ptr [rcx+r8*8-098h+2000000h], rax
adc qword ptr [rcx+r8*8-0A0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0B0h]
mov rax, qword ptr [rcx+r8*8-0A8h]
adc qword ptr [rcx+r8*8-0A8h+2000000h], rax
adc qword ptr [rcx+r8*8-0B0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0C0h]
mov rax, qword ptr [rcx+r8*8-0B8h]
adc qword ptr [rcx+r8*8-0B8h+2000000h], rax
adc qword ptr [rcx+r8*8-0C0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0D0h]
mov rax, qword ptr [rcx+r8*8-0C8h]
adc qword ptr [rcx+r8*8-0C8h+2000000h], rax
adc qword ptr [rcx+r8*8-0D0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0E0h]
mov rax, qword ptr [rcx+r8*8-0D8h]
adc qword ptr [rcx+r8*8-0D8h+2000000h], rax
adc qword ptr [rcx+r8*8-0E0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0F0h]
mov rax, qword ptr [rcx+r8*8-0E8h]
adc qword ptr [rcx+r8*8-0E8h+2000000h], rax
adc qword ptr [rcx+r8*8-0F0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-100h]
mov rax, qword ptr [rcx+r8*8-0F8h]
adc qword ptr [rcx+r8*8-0F8h+2000000h], rax
adc qword ptr [rcx+r8*8-100h+2000000h], rbx
pushf ; save CF - SUB would clobber it
sub r8, 32
AddLoop:
popf ; restore the saved CF for the ADC chain
mov rbx, qword ptr [rcx+r8*8-010h]
mov rax, qword ptr [rcx+r8*8-008h]
adc qword ptr [rcx+r8*8-008h+2000000h], rax
adc qword ptr [rcx+r8*8-010h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-020h]
mov rax, qword ptr [rcx+r8*8-018h]
adc qword ptr [rcx+r8*8-018h+2000000h], rax
adc qword ptr [rcx+r8*8-020h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-030h]
mov rax, qword ptr [rcx+r8*8-028h]
adc qword ptr [rcx+r8*8-028h+2000000h], rax
adc qword ptr [rcx+r8*8-030h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-040h]
mov rax, qword ptr [rcx+r8*8-038h]
adc qword ptr [rcx+r8*8-038h+2000000h], rax
adc qword ptr [rcx+r8*8-040h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-050h]
mov rax, qword ptr [rcx+r8*8-048h]
adc qword ptr [rcx+r8*8-048h+2000000h], rax
adc qword ptr [rcx+r8*8-050h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-060h]
mov rax, qword ptr [rcx+r8*8-058h]
adc qword ptr [rcx+r8*8-058h+2000000h], rax
adc qword ptr [rcx+r8*8-060h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-070h]
mov rax, qword ptr [rcx+r8*8-068h]
adc qword ptr [rcx+r8*8-068h+2000000h], rax
adc qword ptr [rcx+r8*8-070h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-080h]
mov rax, qword ptr [rcx+r8*8-078h]
adc qword ptr [rcx+r8*8-078h+2000000h], rax
adc qword ptr [rcx+r8*8-080h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-090h]
mov rax, qword ptr [rcx+r8*8-088h]
adc qword ptr [rcx+r8*8-088h+2000000h], rax
adc qword ptr [rcx+r8*8-090h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0A0h]
mov rax, qword ptr [rcx+r8*8-098h]
adc qword ptr [rcx+r8*8-098h+2000000h], rax
adc qword ptr [rcx+r8*8-0A0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0B0h]
mov rax, qword ptr [rcx+r8*8-0A8h]
adc qword ptr [rcx+r8*8-0A8h+2000000h], rax
adc qword ptr [rcx+r8*8-0B0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0C0h]
mov rax, qword ptr [rcx+r8*8-0B8h]
adc qword ptr [rcx+r8*8-0B8h+2000000h], rax
adc qword ptr [rcx+r8*8-0C0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0D0h]
mov rax, qword ptr [rcx+r8*8-0C8h]
adc qword ptr [rcx+r8*8-0C8h+2000000h], rax
adc qword ptr [rcx+r8*8-0D0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0E0h]
mov rax, qword ptr [rcx+r8*8-0D8h]
adc qword ptr [rcx+r8*8-0D8h+2000000h], rax
adc qword ptr [rcx+r8*8-0E0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-0F0h]
mov rax, qword ptr [rcx+r8*8-0E8h]
adc qword ptr [rcx+r8*8-0E8h+2000000h], rax
adc qword ptr [rcx+r8*8-0F0h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-100h]
mov rax, qword ptr [rcx+r8*8-0F8h]
adc qword ptr [rcx+r8*8-0F8h+2000000h], rax
adc qword ptr [rcx+r8*8-100h+2000000h], rbx
pushf
sub r8, 32
jnz AddLoop
popf
adc qword ptr [rdx], 0 ; lock?
ret
HugeAdd endp
end
welcome to the forum :U
first, "qword ptr" is not required when a register is used as one of the operands - that specifies the size
i am sure there are many ways to optimize this code
but the first one that pops out at me (pun) is the use of PUSHF and POPF
instead of saving and restoring flags across the SUB, use LEA - it does not alter the flags
lea r8,[r8-32]
i am not a 64-bit guy, so i don't know about PUSHF and POPF in the 64-bit world
but PUSHF is slow, POPF is VERY slow
for the case of the last one, where you need to test for ZF...
use rotate and shift instructions to store and restore the carry flag
rcl rax,1 ;CF -> RAX bit 0
sub r8,32
jnz AddLoop
then, to restore the carry flag...
shr rax,1 ;RAX bit 0 -> CF
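putting the pieces together, the frame of the loop might look something like this (untested - i'm sketching)...
rcl rax,1 ;CF from the unrolled block -> RAX bit 0
sub r8,32
AddLoop:
shr rax,1 ;RAX bit 0 -> CF
;...the 32 unrolled MOV/ADC pairs...
rcl rax,1 ;CF -> RAX bit 0
sub r8,32 ;CF is parked in RAX, so SUB is free to set ZF
jnz AddLoop
shr rax,1 ;RAX bit 0 -> CF
adc qword ptr [rdx],0
the MOVs don't alter flags, so the carry restored at the top survives to the first ADC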
Assuming you have them available, I'd make use of more registers (this is 64-bit, so there's also r8-r15), which should allow you to reduce stalls between inter-dependent sequences.
The other thing is that pushf/popf is likely to be quite slow, as dave mentioned. 'Copy' it to a register with adc (or sbb) and either copy it back or use it when you need to. (This should be faster than shift/rotate.)
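For example (a sketch, untested - the shape mirrors your existing loop):
sbb rax,rax ; after the prologue: RAX = 0 or -1, a register copy of CF
sub r8,32
AddLoop:
add rax,rax ; restore: -1 + -1 sets CF, 0 + 0 clears it
; ...the 32 unrolled MOV/ADC pairs...
sbb rax,rax ; save CF again (RAX = -CF)
sub r8,32
jnz AddLoop
add rax,rax ; final carry back into CF
adc qword ptr [rdx],0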
hi,
my tip: do not use scaling (r8*8) for addressing; instead, calculate the base addresses of the blocks at procedure entry and then increment them using ADD or LEA:
mov r8,[rsi+0*8]
mov r9,[rsi+1*8]
...
mov r15,[rsi+7*8]
lea rsi,[rsi+8*8]
adc [rdi+0*8],r8
adc [rdi+1*8],r9
...
adc [rdi+7*8],r15
lea rdi,[rdi+8*8]
I'm wondering how you transmit the carry flag between the blocks/threads - AFAICS this is a sequential problem, which can't be sped up by using several threads.
i can see how it may be done using as many threads as there are cores
for example:
you total "block A" on core 1 - store the total and the carry
you total "block B" on core 2 - store the total and the carry
you total "block C" on core 3 - store the total and the carry
you total "block D" on core 4 - store the total and the carry
when all threads have completed, add the stored totals/carries
of course, you have to affinitize each thread to individual cores :P
the block size will be dependent on the number of cores available
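here is the idea in code - the names and layout are invented for illustration, and the carry out of the top block wraps around to the bottom qword, so it needs separate handling...
BLOCK_BYTES equ 2000000h ;one core's chunk
;rsi -> 8-entry table of stored carries (one qword each, 0 or 1)
;rdi -> low qword of block 0 in the destination number
mov ecx,7 ;carries 0-6 get injected into blocks 1-7
InjectCarry:
mov rax,[rsi] ;carry out of the current block
add rsi,8
add rdi,BLOCK_BYTES ;low qword of the next block
add [rdi],rax ;inject it
jnc NoRipple ;cascading past one qword is rare
mov r10,rdi
Ripple:
add r10,8
add qword ptr [r10],1 ;ripple the 1 upward
jc Ripple
NoRipple:
dec ecx
jnz InjectCarry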
...i might add....
the practical number of usable cores for this does not include intel hyper-threaded "cores"
if you are doing this on your machine, alone, it will simplify things, as you know how many physical cores you have
but, if you want to run on different machines, a little more work may be involved
there is no well-documented method of mapping the affinity mask bits to cores
however, from experimentation, i can tell you that hyper-threaded cores have adjacent juxtapositioned mask bits :P
(you like that word ? - i have more - lol)
Quote from: dedndave on February 15, 2012, 04:44:47 PM
when all threads have completed, add the stored totals/carries
Did I miss something? - in this case he needs to make two ADCs for blocks B, C and D.
A sequential solution only requires one ADC for each Block.
i am afraid i don't understand what your line of thinking is, qWord :(
Quote from: dedndave on February 15, 2012, 05:18:45 PM
i am afraid i don't understand what your line of thinking is, qWord :(
ok - I missed what jnm2 is trying to achieve ... I just thought that he simply wanted to add two large integers.
I guess because you can look at the high-order end of the sum and be pretty sure whether it will carry into the next block or not. The only uncertain case is where the summed qword is 0xFFFFFFFFFFFFFFFF, and then you're dependent on an earlier carry to push you over or not.
You tried PADDQ, I suppose?
PADDQ doesn't do carries.
This is an AMD FX-8150. Eight cores, no hyperthreading.
I don't need to do this sequentially. Think of it this way: the largest possible add is 0xFFFFFFFFFFFFFFFF + 0xFFFFFFFFFFFFFFFF = 0xFFFFFFFFFFFFFFFE carry 1. So the previous block can carry into this one, and it will never cause a second carry out of this block.
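Written out, the worst case with an incoming carry:
  0FFFFFFFFFFFFFFFFh
+ 0FFFFFFFFFFFFFFFFh
+                 1  ; carry in from the previous block
= 1FFFFFFFFFFFFFFFFh ; i.e. 0FFFFFFFFFFFFFFFFh with still only a single carry out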
Lots of great tips for me to look at! I was able to get rid of popf and pushf, but I'm not familiar with lea. Until I get through your suggestions, this is my current code.
mov rbx, qword ptr [rcx-010h+2000000h]
mov rax, qword ptr [rcx-008h+2000000h]
add qword ptr [rcx-008h+4000000h], rax
adc qword ptr [rcx-010h+4000000h], rbx
mov rbx, qword ptr [rcx-020h+2000000h]
mov rax, qword ptr [rcx-018h+2000000h]
adc qword ptr [rcx-018h+4000000h], rax
adc qword ptr [rcx-020h+4000000h], rbx
[...]
mov rbx, qword ptr [rcx-100h+2000000h]
mov rax, qword ptr [rcx-0F8h+2000000h]
adc qword ptr [rcx-0F8h+4000000h], rax
adc qword ptr [rcx-100h+4000000h], rbx
mov r9, 1FFFFh
AddLoop:
lea r8, [r9*8]
lea r8, [r8*4] ; r8 = r9 * 32; LEA leaves CF alone, where SHL would break the carry chain
mov rbx, qword ptr [rcx+r8*8-010h]
mov rax, qword ptr [rcx+r8*8-008h]
adc qword ptr [rcx+r8*8-008h+2000000h], rax
adc qword ptr [rcx+r8*8-010h+2000000h], rbx
mov rbx, qword ptr [rcx+r8*8-020h]
mov rax, qword ptr [rcx+r8*8-018h]
adc qword ptr [rcx+r8*8-018h+2000000h], rax
adc qword ptr [rcx+r8*8-020h+2000000h], rbx
[...]
mov rbx, qword ptr [rcx+r8*8-100h]
mov rax, qword ptr [rcx+r8*8-0F8h]
adc qword ptr [rcx+r8*8-0F8h+2000000h], rax
adc qword ptr [rcx+r8*8-100h+2000000h], rbx
dec r9
jnz AddLoop
adc qword ptr [rdx], 0
ret
Just for interest: what kind of data is it - where does it come from?
qWord
Basically - http://en.wikipedia.org/wiki/Lucas%E2%80%93Lehmer_primality_test
By the way, if the constants above look confusing, it's because I'm confusing myself. Let me sort this out.
Here is the current code with the correct distances:
mov rbx, [rcx-010h+2000000h]
mov rax, [rcx-008h+2000000h]
add [rcx-008h+12000000h], rax
adc [rcx-010h+12000000h], rbx
mov rbx, [rcx-020h+2000000h]
mov rax, [rcx-018h+2000000h]
adc [rcx-018h+12000000h], rax
adc [rcx-020h+12000000h], rbx
[...]
mov rbx, [rcx-100h+2000000h]
mov rax, [rcx-0F8h+2000000h]
adc [rcx-0F8h+12000000h], rax
adc [rcx-100h+12000000h], rbx
lea r8, [rcx+1FFFF00h]
mov r9, 1FFFFh ; loop counter - DEC preserves CF, where comparing r8 to rcx with CMP would clobber it
AddLoop:
mov rbx, [r8-010h]
mov rax, [r8-008h]
adc [r8-008h+10000000h], rax
adc [r8-010h+10000000h], rbx
mov rbx, [r8-020h]
mov rax, [r8-018h]
adc [r8-018h+10000000h], rax
adc [r8-020h+10000000h], rbx
[...]
mov rbx, [r8-100h]
mov rax, [r8-0F8h]
adc [r8-0F8h+10000000h], rax
adc [r8-100h+10000000h], rbx
lea r8, [r8-256]
dec r9
jnz AddLoop
adc qword ptr [rdx], 0
ret
Somewhere along the way I divided 10000000h by eight. Now, with the correct offsets, this takes twice as long.
my suggestion would be:
CHUNK_INFO struct
pChunkA PVOID ? ; in , destination
pChunkB PVOID ? ; in , source
n DWORD ? ; in , number of QWORDs
carry DWORD ? ; in/out , 0 or 1
CHUNK_INFO ends
; thread procedure
BlockAdd proc pChunkInfo:ptr CHUNK_INFO
mov rbx,pChunkInfo
mov ecx,[rbx].CHUNK_INFO.n
mov rdi,[rbx].CHUNK_INFO.pChunkA
mov rsi,[rbx].CHUNK_INFO.pChunkB
shr ecx,3
mov edx,[rbx].CHUNK_INFO.carry
align 16
@@:
mov r8 ,[rsi+0*8]
mov r9 ,[rsi+1*8]
mov r10,[rsi+2*8]
mov r11,[rsi+3*8]
mov r12,[rsi+4*8]
mov r13,[rsi+5*8]
mov r14,[rsi+6*8]
mov r15,[rsi+7*8]
lea rsi,[rsi+8*8]
add r8,rdx ; fold in the incoming carry (0 or 1)
mov rdx,0 ; MOV, unlike XOR, leaves the flags intact
adc [rdi+0*8],r8
adc [rdi+1*8],r9
adc [rdi+2*8],r10
adc [rdi+3*8],r11
adc [rdi+4*8],r12
adc [rdi+5*8],r13
adc [rdi+6*8],r14
adc [rdi+7*8],r15
adc rdx,0 ; capture this group's carry-out for the next pass
lea rdi,[rdi+8*8]
; this loop could be unrolled several times ...
dec ecx
jnz @B
mov [rbx].CHUNK_INFO.carry,edx
mov rax,1
ret
BlockAdd endp
The structure is filled in with the needed addresses before calling CreateThread(). A possible carry is also returned through it.
EDIT: code changed
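For illustration, launching one worker might look roughly like this (a sketch - `info` here is assumed to be a CHUNK_INFO variable you have already filled in, and handle/error bookkeeping is omitted):
sub rsp,38h ; shadow space + two stack args; adjust so RSP is 16-byte aligned at the CALL
xor ecx,ecx ; lpThreadAttributes = NULL
xor edx,edx ; dwStackSize = 0 (default)
lea r8,BlockAdd ; lpStartAddress
lea r9,info ; lpParameter -> the CHUNK_INFO
mov qword ptr [rsp+20h],0 ; dwCreationFlags = 0, start immediately
mov qword ptr [rsp+28h],0 ; lpThreadId = NULL
call CreateThread ; thread handle comes back in RAX
add rsp,38h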
Quote from: jnm2 on February 16, 2012, 01:29:28 AM
PADDQ doesn't do carries.
Valid argument. Now, if your data are predictable, you could branch into SSE code when no carry is expected (test for bit 63?). Usually the speed gain with SSE2 is considerable...