[add a non-SSE version copied from mpn/x86/rshift.asm. uses shrd for ~1.89 c/l on Core 2
peter@cordes.ca**20080319184937] {
hunk ./rshift.asm 305
+
+C non-SSE rshift for Core 2, copied from the 32-bit mpn/x86/rshift.asm
+C regs: args: %rdi, %rsi, %rdx, %rcx, %r8, %r9
+C %r11: used for linking
+C %r12: unused.
+
+	C Conroe:
+	C size 1         9.024 c/l
+	C size 2         6.5   c/l
+	C size 3         4.0   c/l
+	C size 4         3.76  c/l
+	C size 496       1.892 c/l  (addq $2, with offset addressing: 1.796 c/l)
+	C size 10000     2.48  c/l
+	C size 10000000  13.977 c/l
+ASM_START()
+ALIGN(64)
+PROLOGUE(mpn_rshift_core2)
+	C shift count can stay where it is in %rcx
+C	movq	%rbx, %xmm2		C save regs in xmm, not stack
+	push	%rbx
+	mov	(%rsi), %rbx		C %rbx = limb0
+	xor	%eax, %eax
+	shrd	%cl, %rbx, %rax		C %rax = ret val limb (low cnt bits of limb0, at the top).  %rbx still = limb0
+
+	lea	-8(%rdi,%rdx,8), %rdi	C rdi = &rp[n-1], the last limb
+	lea	(%rsi,%rdx,8), %rsi	C rsi = &sp[n], one past the last limb
+	neg	%rdx
+C	here: %rdx = -n, %cl = cnt, %rbx = limb0
+	add	$1, %rdx		C %rdx = 1-n; zero iff n == 1
+	jz	L(c2_end_short)
+	push	%rax			C save ret val
+	testb	$1, %dl
+	jnz	L(c2_1)			C n even: odd number of loop limbs, so enter the 2x-unrolled loop in the middle
+	mov	%rbx, %rax		C entering at the top: current limb is expected in %rax
+
+	ALIGN(8)
+L(c2_loop):
+	mov	(%rsi,%rdx,8), %rbx	C load next higher limb
+	shrd	%cl, %rbx, %rax		C compute result limb
+	mov	%rax, (%rdi,%rdx,8)	C store it
+	add	$1, %rdx
+C	inc	%rdx
+L(c2_1):
+	mov	(%rsi,%rdx,8), %rax	C same again, with %rax and %rbx swapped
+	shrd	%cl, %rax, %rbx
+	mov	%rbx, (%rdi,%rdx,8)
+	add	$1, %rdx
+C	inc	%rdx
+	jnz	L(c2_loop)
+
+C L(c2_end):
+	shr	%cl, %rax		C compute most significant limb
+	mov	%rax, (%rdi)		C store it
+	pop	%rax			C return val
+	pop	%rbx
+C	movq	%xmm2, %rbx
+	ret
+
+L(c2_end_short):			C n == 1
+	shr	%cl, %rbx		C compute most significant limb
+	mov	%rbx, (%rdi)		C store it
+	C return val is already in %rax from the shrd above
+	pop	%rbx			C matches the push in the prologue (the xmm2 save is disabled)
+C	movq	%xmm2, %rbx
+	ret
+EPILOGUE()
hunk ./shift.c 38
-#define func mpn_rshift_sse2
+// #define func mpn_rshift_sse2_aligned
+// #define func mpn_rshift_sse2
+#define func mpn_rshift_core2
hunk ./shift.c 50
-
+mp_limb_t mpn_rshift_core2(mp_limb_t *rp, const mp_limb_t *sp, mp_size_t n, unsigned int count);
}
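
For reference (not part of the patch itself): a minimal C sketch of the operation the shrd
loop above implements, assuming 64-bit limbs (GMP_NUMB_BITS == 64) and a shift count in
1..63, as the asm assumes. The name ref_rshift is illustrative; only mpn_rshift_core2 and
the GMP types come from the patch.

    /* Reference sketch only: each result limb combines the high bits of the current
       source limb with the low bits of the next higher one; the bits shifted out of
       sp[0] come back in the top of the return value. */
    #include <gmp.h>

    mp_limb_t ref_rshift(mp_limb_t *rp, const mp_limb_t *sp, mp_size_t n, unsigned int count)
    {
        mp_limb_t retval = sp[0] << (64 - count);   /* bits shifted out of limb 0 */
        mp_size_t i;

        for (i = 0; i < n - 1; i++)                 /* what the 2x-unrolled shrd loop computes */
            rp[i] = (sp[i] >> count) | (sp[i + 1] << (64 - count));
        rp[n - 1] = sp[n - 1] >> count;             /* most significant limb, stored after the loop */
        return retval;
    }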
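
The shift.c hunks only swap the function-under-test #define and add the prototype; the rest
of that harness is not shown here. As a rough standalone sketch of how the new routine could
be checked against GMP's own mpn_rshift (the size, the main() shape, and the use of mpn_cmp
are assumptions, not taken from shift.c; also assumes 64-bit limbs):

    #include <stdio.h>
    #include <stdlib.h>
    #include <gmp.h>

    /* prototype matching the one added to shift.c */
    mp_limb_t mpn_rshift_core2(mp_limb_t *rp, const mp_limb_t *sp, mp_size_t n, unsigned int count);

    int main(void)
    {
        enum { N = 496 };                       /* one of the sizes timed above */
        mp_limb_t src[N], got[N], want[N];
        mp_size_t i;

        for (i = 0; i < N; i++)
            src[i] = ((mp_limb_t) rand() << 32) ^ (mp_limb_t) rand();

        for (unsigned int cnt = 1; cnt < 64; cnt++) {
            mp_limb_t r1 = mpn_rshift_core2(got, src, N, cnt);
            mp_limb_t r2 = mpn_rshift(want, src, N, cnt);
            if (r1 != r2 || mpn_cmp(got, want, N) != 0) {
                printf("mismatch at count %u\n", cnt);
                return 1;
            }
        }
        printf("all counts match\n");
        return 0;
    }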