[better-pipelined main loop based on Torbjorn's work. With good short-loop intro
peter@cordes.ca**20080322105200
 runs 1.331c/l
] {
hunk ./rshift.asm 335
-C conroe: computed goto ; triple jcc version (downside: pollutes the branch predictor)
-C SIZE=1; 10.016 cycles/limb 8.992
-C SIZE=2; 7.008 7.464
-C SIZE=3; 5.333 cycles/limb 5.013
-C SIZE=4; 4.260 cycles/limb 5.760
-C SIZE=5; 3.206 cycles/limb 3.014
-C SIZE=6; 3.509 cycles/limb 3.509
-C SIZE=7; 3.163 cycles/limb 2.999
-C SIZE=8; 2.934 cycles/limb 2.718
-C SIZE=9; 2.596 cycles/limb 2.347
-C SIZE=10; 2.693 cycles/limb 2.710
-C SIZE=496; 1.571 cycles/limb 1.571
+C conroe: intro loop ; computed goto ; triple jcc versions (triple jcc pollutes the branch predictor)
+C SIZE=1; 11.008 10.016 cycles/limb 8.992
+C SIZE=2; 6.504 7.008 7.464
+C SIZE=3; 5.355 5.333 cycles/limb 5.013
+C SIZE=4; 4.760 4.260 cycles/limb 5.760
+C SIZE=5; 3.014 3.206 cycles/limb 3.014
+C SIZE=6; 3.005 3.509 cycles/limb 3.509
+C SIZE=7; 2.999 3.163 cycles/limb 2.999
+C SIZE=8; 3.006 2.934 cycles/limb 2.718
+C SIZE=9; 2.222 2.596 cycles/limb 2.347
+C SIZE=10; 2.306 2.693 cycles/limb 2.710
+C SIZE=496; 1.331 1.571 cycles/limb 1.571
hunk ./rshift.asm 355
-C L(sse2_thresh): .word 16000
-C .word 0
-C .word 0
-C .word 0
hunk ./rshift.asm 357
- C If we're working from L2 cache, because args don't fit in L1, sse2 goes ~9c/l, this fn goes ~13c/l
- C From main memory, sse2 goes ~11.5c/l, while we go ~14c/l
- C TODO: find a good threshold. With shift.c running in a loop, the jump is between n=16000 and 17000.
- C If src isn't perfectly cached, the threshold would probably be lower.
+C C would like to get lots of instructions into the OOO execution engine early so it has plenty to work on...
+C cmp $16000, %rdx
+C jge mpn_rshift_sse2 C TODO: and not penryn/harpertown unless we can use the aligned version
hunk ./rshift.asm 361
- C would like to get lots of instructions into the OOO execution engine early so it has plenty to work on...
- cmp $16000, %rdx
-C cmp L(sse2_thresh)(%rip), %rdx C no addressing mode can make this take less space, so use imm32
- jge mpn_rshift_sse2 C TODO: and not penryn/harpertown unless we can use the aligned version
+
+C regs for the loop. use macros to make register reallocation easy.
+define(n,%rdx)
+define(reg1,%r9)
+define(reg2,%rax)
+define(reg3,%r8) C referenced the fewest times
+define(reg4,%r11)
hunk ./rshift.asm 370
-C mov %rbx, %r9 C regs >= 8 need an extra prefix to access, so just use for saving. push takes fewer bytes
- push %rbx
- mov (%rsi), %rbx C %rbx = limb0
- push %rbp
+C push reg2
+C push reg4
+ mov (%rsi), reg4 C reg4 = limb0
hunk ./rshift.asm 374
- shrd %cl, %rbx, %rax C %rax = ret val limb. %rbx still = limb0
-
- sub $1, %rdx C %rdx = n-1
- jle L(c2_end_short) C if(n<=1), no loop. %rdi on entry points to top (only) limb
- add $8, %rsi
+ shrd %cl, reg4, %rax C %rax = ret val limb. reg4 still = limb0
hunk ./rshift.asm 377
+C mov %rsi, %r9
+ jmp L(c2_entry)
hunk ./rshift.asm 380
- mov (%rsi), %rax
- shrd %cl, %rax, %rbx
- mov %rbx, (%rdi)
- mov %rax, %rbx
+ shrd %cl, reg1, reg4
+ mov reg4, (%rdi)
+ mov reg1, reg4
hunk ./rshift.asm 384
- sub $1, %rdx
- jz L(c2_end)
+C add $8, %r9
+L(c2_entry):
+ dec n C sub looks like it makes things align better, but dec has the same timings
+C sub $1, n
+ jle L(c2_end)
hunk ./rshift.asm 390
+ mov (%rsi), reg1 C reg4=limb0 reg1=limb1
hunk ./rshift.asm 395
-C %rax=%rbx=limb(i). %rdx=n-i-1 %rsi -> next limb to load
-C mov %rbx, %rax C get limb0 in both regs, so we can jump into the loop anywhere.
-
- C do some work that's wasted for (n-1)%4==0. This hopefully shortens the critical path for the computed jump
-C mov %rdx, %rbp
-C sub $1, %rbp
-C and $3, %rbp
-C imul $-12, %rbp, %r8 C do this early to hide the latency
-C lea (L(c2_1)+3*12)(%r8), %r8
-
-C lea -8(%rdi,%rdx,8), %rdi C rdi = last limb
-C lea (%rsi,%rdx,8), %rsi C rsi = last limb + 1.
-
-C neg %rdx
-C add $1, %rdx C n= -n + 1
-C %rax = ret val; %rbx=limb0; %rcx=shift count; %rdx=size
-
-C mov %rdx, %rbp
-C and $3, %rbp C %rbp = (n-1)%4
-C test %rbp, %rbp
-C jz L(c2_plus1) C (n-1)%4==0, n==4m+1. special case: don't need to adjust pointers (and the code below would fail because (n-1)%4 = 0, not 4.)
-
-C moved earlier
-C mov %rbp, %r8
-C sub $3, %r8 C %rbp: 0->c2_1. -1->c2_2. -2->c2_3.
-C imul $-12, %rbp, %r8 C each mov and shrd takes 4 bytes, so loop uses 12 bytes per block
-C lea (L(c2_1)+3*12)(%r8), %r8 C requires a 4-byte displacement :(
+C reg4=limb(i) reg1=limb(i+1). %rdx=n-i-1, %rdx%4=0 %rsi -> limb(i+1)
hunk ./rshift.asm 397
-C adjust pointers for jump into loop: %rsi, %rdi -= (4 - (n-1)%4)*8 == add -4*8 + ((n-1%4))*8
-C lea (-4*8)(%rdi,%rbp,8), %rdi
-C lea (8-4*8)(%rsi,%rbp,8), %rsi
-C lea 8(%rsi), %rsi C already loaded the first limb
+C mov (%rsi), reg1
+ mov 8(%rsi), reg2
+ lea 16(%rsi),%rsi
hunk ./rshift.asm 401
+C debug: %r9 = %rsi
+C mov (%r9), reg4
+C mov 8(%r9), reg1
+C mov 16(%r9),reg2
hunk ./rshift.asm 406
-C cmp $2, %rbp
-C FIXME: definitely use a computed jump; this is slow on small n. when it's the last branch that's taken
-C jg L(c2_1) C %rbp=3, (n-1)%4=3, n=4m
-C je L(c2_2) C %rbp=2, (n-1)%4=2, n=4m-1
-C jl L(c2_3) C %rbp=1, (n-1)%4=1, n=4m-2
+C require: reg1=limb1; reg2=limb2; reg3=xxx; reg4=limb0
hunk ./rshift.asm 408
-C jmp *L(c2_1)(%r8)
-C jmp *%r8
-
-L(c2_plus1):
-C lea 8(%rsi), %rsi
-C add $4, %rdx
-
-C ALIGN(16) C loop is <= 18 insn and <= 64 aligned bytes, so fits into Core 2's loop stream buffer, so alignment doesn't matter
+C loop is <= 18 insn and <= 4 16-byte aligned blocks, so it fits into Core 2's loop stream buffer and alignment doesn't matter
+C If the loop is misaligned so that it doesn't fit in the loop buffer, it runs ~1.57 c/l instead of ~1.33
hunk ./rshift.asm 411
-C use simple addressing modes, not (%rdi,%rdx,8). That generates too many ALU uops that compete with the shifts.
-L(c2_loop): C start here on n = 4m+1. with rdi,rsi -= 0. (n-1)%4 = 0
- mov (%rsi), %rbx C load next higher limb
- shrd %cl, %rbx, %rax C compute result limb
- mov %rax, (%rdi) C store it
-C xchg %eax, %eax C 2 byte nop. might be better to use zero displacements in above addressing modes.
- C but still have to treat loop entry as special because of the lea needed before falling in.
-C stosq
-L(c2_1): mov 8(%rsi), %rax C start here on n = 4m. with rdi,rsi -= 1*8. (n-1)%4=3
- shrd %cl, %rax, %rbx
- mov %rbx, 8(%rdi)
-C mov %rbx, (%rdi)
-C add $8, %rdi
-L(c2_2): mov 16(%rsi), %rbx C start here on n = 4m-1. with rdi,rsi -= 2*8 (n-1)%4=2
- shrd %cl, %rbx, %rax C compute result limb
- mov %rax, 16(%rdi) C store it
-C stosq
-L(c2_3): mov 24(%rsi), %rax C start here on n = 4m-2. with rdi,rsi -= 3*8 (n-1)%4=1
- shrd %cl, %rax, %rbx
- mov %rbx, 24(%rdi)
-C mov %rbx, (%rdi)
-C add $8, %rdi
-
-L(c2_4): C jump in here could be better than coming in the top
- add $32, %rsi
- add $32, %rdi
- sub $4, %rdx
-C inc %rdx
+C use simple addressing modes, not (%rdi,%rdx,8). That generates too many reads of not-in-flight register values.
+C ALIGN(16)
+L(c2_loop): shrd %cl, reg1, reg4
+ mov (%rsi), reg3
+ mov reg4, (%rdi)
+C L(c2_10):
+ shrd %cl, reg2, reg1
+ mov 8(%rsi), reg4
+ mov reg1, 8(%rdi)
+C L(c2_01):
+ shrd %cl, reg3, reg2
+ mov 16(%rsi),reg1
+ mov reg2, 16(%rdi)
+C L(c2_00):
+ shrd %cl, reg4, reg3
+ mov 24(%rsi),reg2
+ lea 32(%rsi),%rsi
+ mov reg3, 24(%rdi)
+ lea 32(%rdi),%rdi
+ sub $4, n
hunk ./rshift.asm 432
-
+C ALIGN(16) would align the branch target, but it doesn't seem to matter.
+C L(c2_endshort):
hunk ./rshift.asm 435
- shr %cl, %rax C compute most significant limb
- mov %rax, (%rdi) C store it
hunk ./rshift.asm 436
- pop %rbp
- pop %rbx
- ret
-
-C probably get rid of this and put an extra insn or two on the n=1 path, allowing a jump to c2_end
-L(c2_end_short):
- shrq %cl, %rbx C compute most significant limb
- movq %rbx, (%rdi) C store it
-C mov %r9, %rbx
- pop %rbp
- pop %rbx
+ shr %cl, reg4 C compute most significant limb
+ mov reg4, (%rdi) C store it
+C pop reg4
+C pop reg2
}
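For readers following the patch, here is a minimal C sketch of the structure the new code sets up: a short intro loop that handles limbs one at a time until the remaining count is a multiple of 4 (the role of the path reached via jmp L(c2_entry)), then a 4-way unrolled main loop in which each statement corresponds to one shrd/load/store group of L(c2_loop), and a final plain shift for the most significant limb. The names ref_rshift and LIMB_BITS are invented for illustration; this models only the limb arithmetic of an mpn_rshift-style routine, not the register allocation, the SSE2 threshold, or the actual GMP entry code.

/*
 * Minimal C sketch of the intro-loop + 4-way unrolled structure used in the
 * patch above.  ref_rshift and LIMB_BITS are illustrative names, not GMP API.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define LIMB_BITS 64

/* Shift the n-limb number up[0..n-1] right by cnt bits (1 <= cnt <= 63),
 * write the n result limbs to rp[], and return the bits shifted out of the
 * low limb, left-justified (what the asm comments call the ret val limb). */
static uint64_t ref_rshift(uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
    uint64_t retval = up[0] << (LIMB_BITS - cnt);
    size_t i = 0;

    /* Intro loop: one limb at a time until the remaining count is a
     * multiple of 4, the job of the short loop entered via jmp L(c2_entry). */
    while ((n - 1 - i) % 4 != 0) {
        rp[i] = (up[i] >> cnt) | (up[i + 1] << (LIMB_BITS - cnt));
        i++;
    }

    /* Main loop: four result limbs per iteration, one per shrd/load/store
     * group in L(c2_loop). */
    for (; i + 4 <= n - 1; i += 4) {
        rp[i]     = (up[i]     >> cnt) | (up[i + 1] << (LIMB_BITS - cnt));
        rp[i + 1] = (up[i + 1] >> cnt) | (up[i + 2] << (LIMB_BITS - cnt));
        rp[i + 2] = (up[i + 2] >> cnt) | (up[i + 3] << (LIMB_BITS - cnt));
        rp[i + 3] = (up[i + 3] >> cnt) | (up[i + 4] << (LIMB_BITS - cnt));
    }

    /* Most significant limb: a plain shift, as in the L(c2_end) epilogue. */
    rp[n - 1] = up[n - 1] >> cnt;
    return retval;
}

int main(void)
{
    uint64_t src[6] = { 0x123456789abcdef0ULL, 1, 2, 3, 4, 0x8000000000000000ULL };
    uint64_t dst[6];
    uint64_t out = ref_rshift(dst, src, 6, 4);
    printf("out=%016llx low=%016llx high=%016llx\n",
           (unsigned long long) out,
           (unsigned long long) dst[0],
           (unsigned long long) dst[5]);
    return 0;
}

The design point this mirrors: peel the leftover (n-1) mod 4 limbs in a cheap scalar loop so the unrolled body never needs a computed jump into its middle, and keep that body small enough to run from Core 2's loop stream buffer, as the patch's own comments note.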