[add a non-SSE version copied from mpn/x86/rshift.asm. uses shrd for ~1.89 c/l on Core 2
peter@cordes.ca**20080319184937] {
hunk ./rshift.asm 305
+
+C non-SSE rshift for Core 2, copied from the 32-bit mpn/x86/rshift.asm
+C regs: args: %rdi, %rsi, %rdx, %rcx, %r8, %r9
+C %r11: used for linking
+C %r12: unused.
+
+	C Conroe:
+	C size 1         9.024 c/l
+	C size 2         6.5   c/l
+	C size 3         4.0   c/l
+	C size 4         3.76  c/l
+	C size 496       1.892 c/l  (addq $2, with offset addressing: 1.796 c/l)
+	C size 10000     2.48  c/l
+	C size 10000000  13.977 c/l
+ASM_START()
+ALIGN(64)
+PROLOGUE(mpn_rshift_core2)
+	C shift count can stay where it is in %rcx
+C	movq	%rbx, %xmm2		C save regs in xmm, not stack
+	push	%rbx
+	mov	(%rsi), %rbx		C %rbx = limb0
+	xor	%eax, %eax
+	shrd	%cl, %rbx, %rax		C %rax = ret val limb (low cnt bits of limb0, at the top).  %rbx still = limb0
+
+	lea	-8(%rdi,%rdx,8), %rdi	C rdi = &rp[n-1], the last limb
+	lea	(%rsi,%rdx,8), %rsi	C rsi = &sp[n], one past the last limb
+	neg	%rdx
+C	here: %rdx = -n, %cl = cnt, %rbx = limb0
+	add	$1, %rdx		C %rdx = 1-n; zero iff n == 1
+	jz	L(c2_end_short)
+	push	%rax			C save ret val
+	testb	$1, %dl
+	jnz	L(c2_1)			C n even: odd number of loop limbs, so enter the 2x-unrolled loop in the middle
+	mov	%rbx, %rax		C entering at the top: current limb is expected in %rax
+
+	ALIGN(8)
+L(c2_loop):
+	mov	(%rsi,%rdx,8), %rbx	C load next higher limb
+	shrd	%cl, %rbx, %rax		C compute result limb
+	mov	%rax, (%rdi,%rdx,8)	C store it
+	add	$1, %rdx
+C	inc	%rdx
+L(c2_1):
+	mov	(%rsi,%rdx,8), %rax	C same again, with %rax and %rbx swapped
+	shrd	%cl, %rax, %rbx
+	mov	%rbx, (%rdi,%rdx,8)
+	add	$1, %rdx
+C	inc	%rdx
+	jnz	L(c2_loop)
+
+C L(c2_end):
+	shr	%cl, %rax		C compute most significant limb
+	mov	%rax, (%rdi)		C store it
+	pop	%rax			C return val
+	pop	%rbx
+C	movq	%xmm2, %rbx
+	ret
+
+L(c2_end_short):			C n == 1
+	shr	%cl, %rbx		C compute most significant limb
+	mov	%rbx, (%rdi)		C store it
+	C return val is already in %rax from the shrd above
+	pop	%rbx			C matches the push in the prologue (the xmm2 save is disabled)
+C	movq	%xmm2, %rbx
+	ret
+EPILOGUE()
hunk ./shift.c 38
-#define func mpn_rshift_sse2
+// #define func mpn_rshift_sse2_aligned
+// #define func mpn_rshift_sse2
+#define func mpn_rshift_core2
hunk ./shift.c 50
-
+mp_limb_t mpn_rshift_core2(mp_limb_t *rp, const mp_limb_t *sp, mp_size_t n, unsigned int count);
}
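
For reference (not part of the patch itself): a minimal C sketch of the operation the shrd
loop above implements, assuming 64-bit limbs (GMP_NUMB_BITS == 64) and a shift count in
1..63, as the asm assumes. The name ref_rshift is illustrative; only mpn_rshift_core2 and
the GMP types come from the patch.

    /* Reference sketch only: each result limb combines the high bits of the current
       source limb with the low bits of the next higher one; the bits shifted out of
       sp[0] come back in the top of the return value. */
    #include <gmp.h>

    mp_limb_t ref_rshift(mp_limb_t *rp, const mp_limb_t *sp, mp_size_t n, unsigned int count)
    {
        mp_limb_t retval = sp[0] << (64 - count);   /* bits shifted out of limb 0 */
        mp_size_t i;

        for (i = 0; i < n - 1; i++)                 /* what the 2x-unrolled shrd loop computes */
            rp[i] = (sp[i] >> count) | (sp[i + 1] << (64 - count));
        rp[n - 1] = sp[n - 1] >> count;             /* most significant limb, stored after the loop */
        return retval;
    }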
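
The shift.c hunks only swap the function-under-test #define and add the prototype; the rest
of that harness is not shown here. As a rough standalone sketch of how the new routine could
be checked against GMP's own mpn_rshift (the size, the main() shape, and the use of mpn_cmp
are assumptions, not taken from shift.c; also assumes 64-bit limbs):

    #include <stdio.h>
    #include <stdlib.h>
    #include <gmp.h>

    /* prototype matching the one added to shift.c */
    mp_limb_t mpn_rshift_core2(mp_limb_t *rp, const mp_limb_t *sp, mp_size_t n, unsigned int count);

    int main(void)
    {
        enum { N = 496 };                       /* one of the sizes timed above */
        mp_limb_t src[N], got[N], want[N];
        mp_size_t i;

        for (i = 0; i < N; i++)
            src[i] = ((mp_limb_t) rand() << 32) ^ (mp_limb_t) rand();

        for (unsigned int cnt = 1; cnt < 64; cnt++) {
            mp_limb_t r1 = mpn_rshift_core2(got, src, N, cnt);
            mp_limb_t r2 = mpn_rshift(want, src, N, cnt);
            if (r1 != r2 || mpn_cmp(got, want, N) != 0) {
                printf("mismatch at count %u\n", cnt);
                return 1;
            }
        }
        printf("all counts match\n");
        return 0;
    }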