[working computed-jmp version
peter@cordes.ca**20080320084938
 Not as fast as the triple-branch version, except when the triple-branch
 dispatch fell through to the last branch.  But hopefully less polluting
 of the branch predictor.
] {
hunk ./rshift.asm 245
+ALIGN(64)
hunk ./rshift.asm 287
+	C todo: use shrd to shorten the outro, and put the start of this fn within -128 bytes of the jge in the non-sse2 version
hunk ./rshift.asm 313
-C non-SSE rshift for Core 2, copied from 32bit mpn/x86/rshift.asm
+C non-SSE rshift for Core 2, basic structure copied from 32bit mpn/x86/rshift.asm
hunk ./rshift.asm 317
-C Conroe:
-C size 1	8.000 (9.024 w/cmp jge sse2)	(not unrolled: 9.024 c/l)
-C size 2	(not unrolled: 6.5 c/l)
+C Conroe: timings _with_ cmp/jge sse2, since that's how it will be for real
+C size 1	9.024 (8.000 w/o cmp jge sse2)	(not unrolled: 9.024 c/l)
+C size 2	7 c/l	(not unrolled: 6.5 c/l)
hunk ./rshift.asm 321
-C size 4	3.780	(not unrolled: 3.76 c/l)
+C size 4	3.780	(not unrolled: 3.76 c/l).  computed jmp: 4.260
hunk ./rshift.asm 327
-C size 17000	13.025	beyond here, take timings
+C size 17000	13.025	beyond here, look at timings for the sse2 routine which we call instead.
hunk ./rshift.asm 332
+C probably if we care about small-n performance, we should have a separate version of the function.
+C Conroe:	computed goto		; triple-jcc version (downside: pollutes the branch predictor)
+C SIZE=1;	10.016 cycles/limb	8.992
+C SIZE=2;	 7.008			7.464
+C SIZE=3;	 5.333 cycles/limb	5.013
+C SIZE=4;	 4.260 cycles/limb	5.760
+C SIZE=5;	 3.206 cycles/limb	3.014
+C SIZE=6;	 3.509 cycles/limb	3.509
+C SIZE=7;	 3.163 cycles/limb	2.999
+C SIZE=8;	 2.934 cycles/limb	2.718
+C SIZE=9;	 2.596 cycles/limb	2.347
+C SIZE=10;	 2.693 cycles/limb	2.710
+C SIZE=496;	 1.571 cycles/limb	1.571
+
+
hunk ./rshift.asm 352
-ALIGN(1024)
+C ALIGN(1024)
+L(sse2_thresh):	.word	16000
+	.word	0
+	.word	0
+	.word	0
+C ALIGN(16)
hunk ./rshift.asm 364
+	C would like to get lots of instructions into the OOO execution engine early, so it has plenty to work on...
hunk ./rshift.asm 366
+C	cmp	L(sse2_thresh)(%rip), %rdx	C no addressing mode can make this take less space, so use imm32
hunk ./rshift.asm 370
-C	mov	%rbx, %r9	C regs >= 8 need an extra prefix to access, so just use for saving
+C	mov	%rbx, %r9	C regs >= 8 need an extra prefix to access, so just use for saving.  push takes fewer bytes
hunk ./rshift.asm 377
+	C do some work that's wasted for (n-1)%4==0.  This hopefully shortens the critical path to the computed jump
+	mov	%rdx, %rbp
+	sub	$1, %rbp
+	and	$3, %rbp
+	imul	$-12, %rbp, %r8		C do this early to hide the latency
+	lea	(L(c2_1)+3*12)(%r8), %r8
+
hunk ./rshift.asm 393
-C	mov	%rax, %r8	C save ret val
hunk ./rshift.asm 396
-	mov	%rdx, %rbp
-	and	$3, %rbp	C %rbp = (n-1)%4
+C	mov	%rdx, %rbp
+C	and	$3, %rbp	C %rbp = (n-1)%4
+	test	%rbp, %rbp
hunk ./rshift.asm 401
+C moved earlier
+C	mov	%rbp, %r8
+C	sub	$3, %r8		C %rbp: 0->c2_1.  -1->c2_2.  -2->c2_3.
+C	imul	$-12, %rbp, %r8	C each mov and shrd takes 4 bytes, so the loop uses 12 bytes per block
+C	lea	(L(c2_1)+3*12)(%r8), %r8
+
hunk ./rshift.asm 412
-	cmp	$2, %rbp
+
+C	cmp	$2, %rbp
hunk ./rshift.asm 415
-	jg	L(c2_1)		C %rbp=3, (n-1)%4=3, n=4m
-	je	L(c2_2)		C %rbp=2, (n-1)%4=2, n=4m-1
-	jl	L(c2_3)		C %rbp=1, (n-1)%4=1, n=4m-2
+C	jg	L(c2_1)		C %rbp=3, (n-1)%4=3, n=4m
+C	je	L(c2_2)		C %rbp=2, (n-1)%4=2, n=4m-1
+C	jl	L(c2_3)		C %rbp=1, (n-1)%4=1, n=4m-2
hunk ./rshift.asm 419
-C	jmp	*(L(c2_loop)+%rbp)
+C	jmp	*L(c2_1)(%r8)
+	jmp	*%r8
hunk ./rshift.asm 427
-C further unrolling will push it beyond the size of the loop stream detector. (already close in bytes)
+C further unrolling will push it beyond the size of the loop stream detector (already close in bytes).  8 limbs/iter runs at ~1.67 c/l
hunk ./rshift.asm 430
-	mov	0(%rsi), %rbx	C load next higher limb
+	mov	(%rsi), %rbx	C load next higher limb
hunk ./rshift.asm 432
-	mov	%rax, 0(%rdi)	C store it
+	mov	%rax, (%rdi)	C store it
+C	xchg	%eax, %eax	C 2-byte nop.  might be better to use zero displacements in the above addressing modes,
+	C but we'd still have to treat loop entry as special, because of the lea needed before falling in.
hunk ./rshift.asm 451
+L(c2_4):	C jumping in here could be better than coming in at the top
hunk ./rshift.asm 462
-C	mov	%r8, %rax
}
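
Editor's sketch of the dispatch idea: the patch replaces the jg/je/jl triple
branch on (n-1)%4 with a single indirect jmp whose target is computed
arithmetically (the imul $-12 plus lea above), i.e. a Duff's-device-style
entry into the 4-way-unrolled loop, exploiting the fact that each unrolled
block is 12 bytes of code.  The C analogue below expresses the same dispatch
with GCC's labels-as-values extension; the function name rshift_sketch and
its layout are hypothetical illustrations under the usual mpn_rshift contract
(1 <= cnt <= 63, 64-bit limbs), not code taken from rshift.asm.

#include <stddef.h>
#include <stdint.h>

uint64_t
rshift_sketch (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
  /* entry[r]: enter the unrolled loop so the first (partial) pass does
     r combines; r == 0 enters at the top for a full pass of 4.  The asm
     computes this address instead of loading it from a table:
     target = L(c2_1) + 3*12 - 12*r.  */
  static void *const entry[4] = { &&do4, &&do1, &&do2, &&do3 };
  uint64_t retval = up[0] << (64 - cnt); /* bits shifted out the bottom */
  size_t count = n - 1;                  /* limbs combining two sources */
  size_t iters = (count + 3) / 4;        /* total passes through the loop */
  size_t i = 0;

  if (count == 0)
    goto tail;
  goto *entry[count & 3];   /* one indirect jmp replaces jg/je/jl */

do4:  rp[i] = (up[i] >> cnt) | (up[i+1] << (64 - cnt)); i++;
do3:  rp[i] = (up[i] >> cnt) | (up[i+1] << (64 - cnt)); i++;
do2:  rp[i] = (up[i] >> cnt) | (up[i+1] << (64 - cnt)); i++;
do1:  rp[i] = (up[i] >> cnt) | (up[i+1] << (64 - cnt)); i++;
  if (--iters)
    goto do4;

tail:
  rp[count] = up[count] >> cnt;          /* top limb shifts in zeros */
  return retval;
}

Unlike the branch tree, the computed jump occupies a single branch-predictor
entry (the indirect jmp), which is the "less polluting" hope in the patch
header; the price is the imul/lea dependency chain in front of the jump, and
the SIZE table above shows the two versions trading wins at small n.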