[use macros to allow switching between 4-limb and 8-limb unroll
peter@cordes.ca**20080324212029
 also make it easy to move the add/lea up/down in the loop
] {
hunk ./rshift.asm 316
-C K8:
-C size 497 4.524
hunk ./rshift.asm 317
-C Conroe: timings _with_ cmp/jge sse2, since that's how it will be for real
-C size 1 9.024 (8.000 w/o cmp jge sse2) (not unrolled: 9.024 c/l)
-C size 2 7 c/l (not unrolled: 6.5 c/l)
-C size 3 (not unrolled: 4.0 c/l)
-C size 4 3.780 (not unrolled: 3.76 c/l). computed jmp: 4.260
-C size 5 2.995
-C size 496 1.571 (not unrolled: 1.892 c/l (addq $2 with offset addressing: 1.796c/l)
-C size 497 1.555
-C size 10000 2.448 (not unrolled: 2.48 c/l)
-C size 16000 2.448
-C size 17000 13.025 beyond here, look at timings for sse2 routing which we call instead.
-C size 100000 13.042
-C size 1000000 13.943
-C size 10000000 13.961 (not unrolled: 13.977 c/l)
+C fastest ever was 1.166 c/l, with an 8-limb unroll achieving 3.886 uops/clock (av. over the whole function) on SIZE=1001
+C see email sent to gmp-devel for order of instructions in the loop.
+C It had the loads before the stores in the bottom half of the loop, so wouldn't work with 4 regs.
hunk ./rshift.asm 321
-C probably if we care about small n performance, we should have a separate version of the function.
-C conroe: intro loop computed goto ; triple jcc version (pollutes the branch predictor)
-C SIZE=1; 11.008 10.016 cycles/limb 8.992
-C SIZE=2; 6.504 7.008 7.464
-C SIZE=3; 5.355 5.333 cycles/limb 5.013
-C SIZE=4; 4.760 4.260 cycles/limb 5.760
-C SIZE=5; 3.014 3.206 cycles/limb 3.014
-C SIZE=6; 3.005 3.509 cycles/limb 3.509
-C SIZE=7; 2.999 3.163 cycles/limb 2.999
-C SIZE=8; 3.006 2.934 cycles/limb 2.718
-C SIZE=9; 2.222 2.596 cycles/limb 2.347
-C SIZE=10; 2.306 2.693 cycles/limb 2.710
-C SIZE=496; 1.331 1.571 cycles/limb 1.571
-
-
-C Harpertown:
-C size 497 1.562
+C 4-limb unroll fits in the loop-stream buffer, and does at best 1.281 c/l. 3.922 uops/clock.
hunk ./rshift.asm 333
+
+C TODO: the loop counters/pointer increments and the ALIGN(16) still need to be commented/uncommented by hand.
+ define(`C2_UNROLL8') C defined = 8-limb unroll, undefined = 4-limb unroll
+C define(`C2_8REG')
+
+define(`unroll', ifdef(`C2_UNROLL8',8,4)) C unroll factor in limbs: 8 or 4
+
hunk ./rshift.asm 341
-define(n,%rdx)
-define(reg1,%r9)
-define(reg2,%rax)
-define(reg3,%r8) C referenced the fewest times
-define(reg4,%r11)
-define(reg5,%r12)
-define(reg6,%r13)
-define(reg7,%r14)
-define(reg8,%r15)
-C define(reg8,reg4)
+define(`n',%rdx)
+define(`reg1',%r9)
+define(`reg2',%rax)
+define(`reg3',%r8) C referenced the fewest times
+define(`reg4',%r11)
+C pipeline depth is only 4, so we can avoid overlap with only 4 regs: without C2_8REG, reg5-reg8 are aliases of reg1-reg4.
+ifdef(`C2_8REG', `
+ define(`reg5',%r12)
+ define(`reg6',%r13)
+ define(`reg7',%r14)
+ define(`reg8',%r15)
+ ', `
+ define(`reg5',`reg1')
+ define(`reg6',`reg2')
+ define(`reg7',`reg3')
+ define(`reg8',`reg4')
+')
+
+ifdef(`C2_8REG', `
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+', ) C r12-r15 are saved only in the configuration that uses them as reg5-reg8
hunk ./rshift.asm 366
-C push reg2
-C push reg4
- push %r12
- push %r13
- push %r14
- push %r15
hunk ./rshift.asm 385
- test $7, %dl
+ test $(unroll-1), %dl C are the low bits of n zero, i.e. is n a multiple of the unroll factor?
hunk ./rshift.asm 410
+define(`srcoff',0) C bytes already added to %rsi in this pass of the loop body; subtracted from load displacements
+define(`dstoff',0) C bytes already added to %rdi in this pass of the loop body; subtracted from store displacements
hunk ./rshift.asm 413
-L(c2_loop): shrd %cl, reg1, reg8
+L(c2_loop):
+ shrd %cl, reg1, reg8
hunk ./rshift.asm 418
-C add $32, %rsi
+ifdef(`C2_UNROLL8',, `add $32, %rsi; define(`srcoff',32)')
hunk ./rshift.asm 420
- mov reg1, 8(%rdi)
- mov (8-64)(%rsi), reg5
- lea 64(%rsi),%rsi
+ mov reg1, (8-dstoff)(%rdi)
+ mov (8-srcoff)(%rsi), reg5
+ifdef(`C2_UNROLL8', `lea 64(%rsi),%rsi; define(`srcoff',64)',)
hunk ./rshift.asm 425
-C add $32, %rdi
- mov reg2, (16-0)(%rdi)
- mov (16-64)(%rsi), reg6
+ifdef(`C2_UNROLL8',, `add $32, %rdi; define(`dstoff',32)')
+ mov reg2, (16-dstoff)(%rdi)
+ mov (16-srcoff)(%rsi), reg6
hunk ./rshift.asm 430
-C sub $4, n
- mov reg3, (24-0)(%rdi)
- mov (24-64)(%rsi), reg7
+ifdef(`C2_UNROLL8',, `sub $4, n')
+ mov reg3, (24-dstoff)(%rdi)
+ mov (24-srcoff)(%rsi), reg7
hunk ./rshift.asm 434
- shrd %cl, reg5, reg4
- mov (32-64)(%rsi), reg8
- mov reg4, (32-0)(%rdi)
- lea 64(%rdi),%rdi
- C L(c2_10):
- shrd %cl, reg6, reg5
- mov (40-64)(%rsi), reg1
- mov reg5, (40-64)(%rdi)
- C L(c2_01):
- shrd %cl, reg7, reg6
- mov (48-64)(%rsi), reg2
- mov reg6, (48-64)(%rdi)
- C L(c2_00):
- shrd %cl, reg8, reg7
- sub $8, n
- mov (56-64)(%rsi), reg3
- mov reg7, (56-64)(%rdi)
hunk ./rshift.asm 435
+ifdef(`C2_UNROLL8', `
+ shrd %cl, reg5, reg4
+ mov reg4, (32-dstoff)(%rdi)
+ifdef(`C2_UNROLL8', `lea 64(%rdi),%rdi; define(`dstoff',64)',)
+ mov (32-srcoff)(%rsi), reg8
+C L(c2_10):
+ shrd %cl, reg6, reg5
+ mov reg5, (40-dstoff)(%rdi)
+ mov (40-srcoff)(%rsi), reg1
+C L(c2_01):
+ shrd %cl, reg7, reg6
+ mov reg6, (48-dstoff)(%rdi)
+ mov (48-srcoff)(%rsi), reg2
+C L(c2_00):
+ shrd %cl, reg8, reg7
+ifdef(`C2_UNROLL8', `sub $8, n',)
+ mov reg7, (56-dstoff)(%rdi)
+ mov (56-srcoff)(%rsi), reg3
+') C endif
hunk ./rshift.asm 461
-C pop reg4
-C pop reg2
- pop %r15
- pop %r14
- pop %r13
- pop %r12
+ifdef(`C2_8REG', `
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+',) C restores in the reverse order of the pushes; emitted only when C2_8REG is defined
}