[working unrolled version. 1.55c/l
peter@cordes.ca**20080320044409
 slow when n is small and not = 4*m+1.  Use a computed jump
] {
hunk ./rshift.asm 275
+C movdqa %xmm6, (%rdi,%rdx,8)
hunk ./rshift.asm 307
-C non-SSE rshift for Core 2, copied from 32bit mpn/x86/rshift.asm
hunk ./rshift.asm 308
-C %r11: used for linking
-C %r12: unused.
+C %r11: used for linking.  Looks like it is also used as a tmp in gcc-compiled functions.
+C %r12: supposedly unused in C code.  Wrong, it is used: gcc-compiled code saves and restores it.
+
+C non-SSE rshift for Core 2, copied from 32bit mpn/x86/rshift.asm
+C K8:
+C size 497       4.524
+
+C Conroe:
+C size 1         8.000  (9.024 w/cmp jge sse2)  (not unrolled: 9.024 c/l)
+C size 2                (not unrolled: 6.5 c/l)
+C size 3                (not unrolled: 4.0 c/l)
+C size 4         3.780  (not unrolled: 3.76 c/l)
+C size 5         2.995
+C size 496       1.571  (not unrolled: 1.892 c/l; addq $2 with offset addressing: 1.796 c/l)
+C size 497       1.555
+C size 10000     2.448  (not unrolled: 2.48 c/l)
+C size 16000     2.448
+C size 17000    13.025  beyond here, take timings
+C size 100000   13.042
+C size 1000000  13.943
+C size 10000000 13.961  (not unrolled: 13.977 c/l)
+
+C Harpertown:
+C size 497       1.562
hunk ./rshift.asm 333
- C Conroe:
- C size 1 9.024 c/l
- C size 2 6.5 c/l
- C size 3 4.0 c/l
- C size 4 3.76 c/l
- C size 496 1.892 c/l (addq $2, with offset addressing: 1.796c/l)
- C size 10000 2.48 c/l
- C size 10000000 13.977 c/l
hunk ./rshift.asm 334
-ALIGN(64)
+C TODO: remove this, or make it 16
+ALIGN(1024)
hunk ./rshift.asm 337
+ C If we're working from L2 cache because the args don't fit in L1, sse2 goes ~9 c/l and this fn goes ~13 c/l.
+ C From main memory, sse2 goes ~11.5 c/l, while we go ~14 c/l.
+ C TODO: find a good threshold.  With shift.c running in a loop, the jump is between n=16000 and 17000.
+ C If src isn't perfectly cached, the threshold would probably be lower.
+
+ cmp $16000, %rdx
+ jge mpn_rshift_sse2 C TODO: and not penryn/harpertown unless we can use the aligned version
+
hunk ./rshift.asm 346
-C movq %rbx, %xmm2 C save regs in xmm, not stack
+C mov %rbx, %r9 C regs >= 8 need an extra prefix to access, so only use them for saving
hunk ./rshift.asm 349
+ push %rbp
hunk ./rshift.asm 353
- lea -8(%rdi,%rdx,8), %rdi C rdi = last limb
- lea (%rsi,%rdx,8), %rsi C rsi = last limb + 1.
- neg %rdx
-C rdx=-n; %0=64-cnt; %1=cnt;
-C %rbx=limb0;
- add $1, %rdx C n= -n + 1
- jz L(c2_end_short)
- push %rax C save ret val
- testb $1,%dl
- jnz L(c2_1) C enter loop in the middle
- mov %rbx,%rax
+C lea -8(%rdi,%rdx,8), %rdi C rdi = last limb
+C lea (%rsi,%rdx,8), %rsi C rsi = last limb + 1.
+
+C neg %rdx
+C add $1, %rdx C n= -n + 1
+C %rax = ret val; %rbx = limb0; %rcx = shift count; %rdx = size
+
+ sub $1, %rdx C %rdx = n-1
+ jle L(c2_end_short) C if (n<=1), no loop.  %rdi on entry points to the top (only) limb
+C mov %rax, %r8 C save ret val
+ push %rax
+ mov %rbx, %rax C get limb0 in both regs, so we can jump into the loop anywhere
hunk ./rshift.asm 366
- ALIGN(8)
-L(c2_loop):
+ mov %rdx, %rbp
+ and $3, %rbp C %rbp = (n-1)%4
+ jz L(c2_plus1) C (n-1)%4==0, n==4m+1.  Special case: no need to adjust pointers (and the code below would break, since (n-1)%4 is 0, not 4).
hunk ./rshift.asm 370
- mov (%rsi,%rdx,8), %rbx C load next higher limb
+C adjust pointers for the jump into the loop: %rsi, %rdi -= (4 - (n-1)%4)*8, i.e. add -4*8 + ((n-1)%4)*8
+ lea (-4*8)(%rdi,%rbp,8), %rdi
+ lea (8-4*8)(%rsi,%rbp,8), %rsi
+C lea 8(%rsi), %rsi C already loaded the first limb
+
+ cmp $2, %rbp
+C FIXME: definitely use a computed jump; this is slow on small n when it's the last branch that's taken.
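+C Untested sketch of that computed jump, assuming non-PIC code and a hypothetical L(c2_entry)
+C table of entry points (PIC would need a RIP-relative lea plus a register-indirect jmp):
+C jmp *L(c2_entry)(,%rbp,8) C one indirect jump instead of up to three compare-and-branches
+C L(c2_entry): .quad L(c2_plus1), L(c2_3), L(c2_2), L(c2_1) C entry 0 is never reached; the jz above already handled (n-1)%4==0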
+ jg L(c2_1) C %rbp=3, (n-1)%4=3, n=4m
+ je L(c2_2) C %rbp=2, (n-1)%4=2, n=4m-1
+ jl L(c2_3) C %rbp=1, (n-1)%4=1, n=4m-2
+
+C jmp *(L(c2_loop)+%rbp)
+
+L(c2_plus1):
+ lea 8(%rsi), %rsi
+C add $4, %rdx
+
+C ALIGN(16) C loop is <= 18 insn and <= 64 aligned bytes, so it fits in Core 2's loop stream buffer and alignment doesn't matter
+C further unrolling would push it beyond the size of the loop stream detector (it's already close in bytes)
+C use simple addressing modes, not (%rdi,%rdx,8).  That generates too many ALU uops, which compete with the shifts.
+L(c2_loop): C start here for n = 4m+1, with rdi,rsi -= 0.  (n-1)%4 = 0
+ mov 0(%rsi), %rbx C load next higher limb
hunk ./rshift.asm 393
- mov %rax,(%rdi,%rdx,8) C store it
- add $1, %rdx
-C inc %rdx
-L(c2_1): mov (%rsi,%rdx,8),%rax
+ mov %rax, 0(%rdi) C store it
+C stosq
+L(c2_1): mov 8(%rsi), %rax C start here for n = 4m, with rdi,rsi -= 1*8.  (n-1)%4 = 3
hunk ./rshift.asm 397
- mov %rbx, (%rdi,%rdx,8)
- add $1, %rdx
+ mov %rbx, 8(%rdi)
+C mov %rbx, (%rdi)
+C add $8, %rdi
+L(c2_2): mov 16(%rsi), %rbx C start here for n = 4m-1, with rdi,rsi -= 2*8.  (n-1)%4 = 2
+ shrd %cl, %rbx, %rax C compute result limb
+ mov %rax, 16(%rdi) C store it
+C stosq
+L(c2_3): mov 24(%rsi), %rax C start here for n = 4m-2, with rdi,rsi -= 3*8.  (n-1)%4 = 1
+ shrd %cl, %rax, %rbx
+ mov %rbx, 24(%rdi)
+C mov %rbx, (%rdi)
+C add $8, %rdi
+
+ add $32, %rsi
+ add $32, %rdi
+ sub $4, %rdx
hunk ./rshift.asm 414
- jnz L(c2_loop)
+ jg L(c2_loop)
hunk ./rshift.asm 420
+C mov %r8, %rax
+ pop %rbp
hunk ./rshift.asm 423
-C movq %xmm2, %rbx
hunk ./rshift.asm 425
+C probably get rid of this and put an extra insn or two on the n=1 path, allowing a jump to c2_end
hunk ./rshift.asm 429
-C popq %rax C return val
- movq %xmm2, %rbx
+C mov %r9, %rbx
+ pop %rbp
+ pop %rbx
}
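For reference, a small C model (a sketch, not GMP's code; model_rshift is a made-up name) of
what the unrolled loop computes: peel (n-1)%4 limbs first, then handle 4 limbs per iteration,
which is the same remainder handling that the c2_1/c2_2/c2_3 entry points above implement by
jumping into the middle of the loop.  mpn_rshift semantics assumed: shift {up,n} right by cnt
(1 <= cnt <= 63) into {rp,n}, returning the bits shifted out of the low limb in the top cnt
bits of the return value.

#include <stdint.h>

typedef uint64_t limb;

limb model_rshift(limb *rp, const limb *up, long n, unsigned cnt)
{
    unsigned tnc = 64 - cnt;
    limb retval = up[0] << tnc;             /* bits shifted out at the low end */
    long i = 0;

    switch ((n - 1) & 3) {                  /* the asm jumps into the unrolled loop instead */
    case 3: rp[i] = (up[i] >> cnt) | (up[i+1] << tnc); i++; /* fall through */
    case 2: rp[i] = (up[i] >> cnt) | (up[i+1] << tnc); i++; /* fall through */
    case 1: rp[i] = (up[i] >> cnt) | (up[i+1] << tnc); i++; /* fall through */
    case 0: break;
    }
    for (; i + 4 < n; i += 4) {             /* remaining full limbs are a multiple of 4 */
        rp[i]   = (up[i]   >> cnt) | (up[i+1] << tnc);
        rp[i+1] = (up[i+1] >> cnt) | (up[i+2] << tnc);
        rp[i+2] = (up[i+2] >> cnt) | (up[i+3] << tnc);
        rp[i+3] = (up[i+3] >> cnt) | (up[i+4] << tnc);
    }
    rp[n-1] = up[n-1] >> cnt;               /* top limb: zeros shift in */
    return retval;
}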