[working unrolled version. 1.55c/l
peter@cordes.ca**20080320044409
 slow when n is small and not = 4*m+1.  Use a computed jump
] {
hunk ./rshift.asm 275
+C movdqa %xmm6, (%rdi,%rdx,8)
hunk ./rshift.asm 307
-C non-SSE rshift for Core 2, copied from 32bit mpn/x86/rshift.asm
hunk ./rshift.asm 308
-C %r11: used for linking
-C %r12: unused.
+C %r11: used for linking.  Looks like it is also used as a tmp in gcc-compiled functions.
+C %r12: supposedly unused in C code.  Wrong, it is used: gcc-compiled code saves and restores it.
+
+C non-SSE rshift for Core 2, copied from 32bit mpn/x86/rshift.asm
+C K8:
+C size 497       4.524
+
+C Conroe:
+C size 1         8.000  (9.024 w/cmp jge sse2)  (not unrolled: 9.024 c/l)
+C size 2                (not unrolled: 6.5 c/l)
+C size 3                (not unrolled: 4.0 c/l)
+C size 4         3.780  (not unrolled: 3.76 c/l)
+C size 5         2.995
+C size 496       1.571  (not unrolled: 1.892 c/l; addq $2 with offset addressing: 1.796 c/l)
+C size 497       1.555
+C size 10000     2.448  (not unrolled: 2.48 c/l)
+C size 16000     2.448
+C size 17000    13.025  beyond here, take timings
+C size 100000   13.042
+C size 1000000  13.943
+C size 10000000 13.961  (not unrolled: 13.977 c/l)
+
+C Harpertown:
+C size 497       1.562
hunk ./rshift.asm 333
- C Conroe:
- C size 1 9.024 c/l
- C size 2 6.5 c/l
- C size 3 4.0 c/l
- C size 4 3.76 c/l
- C size 496 1.892 c/l (addq $2, with offset addressing: 1.796c/l)
- C size 10000 2.48 c/l
- C size 10000000 13.977 c/l
hunk ./rshift.asm 334
-ALIGN(64)
+C TODO: remove this, or make it 16
+ALIGN(1024)
hunk ./rshift.asm 337
+ C If we're working from L2 cache because the args don't fit in L1, sse2 goes ~9 c/l and this fn goes ~13 c/l.
+ C From main memory, sse2 goes ~11.5 c/l, while we go ~14 c/l.
+ C TODO: find a good threshold.  With shift.c running in a loop, the jump is between n=16000 and 17000.
+ C If src isn't perfectly cached, the threshold would probably be lower.
+
+ cmp $16000, %rdx
+ jge mpn_rshift_sse2 C TODO: and not penryn/harpertown unless we can use the aligned version
+
hunk ./rshift.asm 346
-C movq %rbx, %xmm2 C save regs in xmm, not stack
+C mov %rbx, %r9 C regs >= 8 need an extra prefix to access, so only use them for saving
hunk ./rshift.asm 349
+ push %rbp
hunk ./rshift.asm 353
- lea -8(%rdi,%rdx,8), %rdi C rdi = last limb
- lea (%rsi,%rdx,8), %rsi C rsi = last limb + 1.
- neg %rdx
-C rdx=-n; %0=64-cnt; %1=cnt;
-C %rbx=limb0;
- add $1, %rdx C n= -n + 1
- jz L(c2_end_short)
- push %rax C save ret val
- testb $1,%dl
- jnz L(c2_1) C enter loop in the middle
- mov %rbx,%rax
+C lea -8(%rdi,%rdx,8), %rdi C rdi = last limb
+C lea (%rsi,%rdx,8), %rsi C rsi = last limb + 1.
+
+C neg %rdx
+C add $1, %rdx C n= -n + 1
+C %rax = ret val; %rbx = limb0; %rcx = shift count; %rdx = size
+
+ sub $1, %rdx C %rdx = n-1
+ jle L(c2_end_short) C if (n<=1), no loop.  %rdi on entry points to the top (only) limb
+C mov %rax, %r8 C save ret val
+ push %rax
+ mov %rbx, %rax C get limb0 in both regs, so we can jump into the loop anywhere
hunk ./rshift.asm 366
- ALIGN(8)
-L(c2_loop):
+ mov %rdx, %rbp
+ and $3, %rbp C %rbp = (n-1)%4
+ jz L(c2_plus1) C (n-1)%4==0, n==4m+1.  Special case: no need to adjust pointers (and the code below would break, since (n-1)%4 is 0, not 4).
hunk ./rshift.asm 370
- mov (%rsi,%rdx,8), %rbx C load next higher limb
+C adjust pointers for the jump into the loop: %rsi, %rdi -= (4 - (n-1)%4)*8, i.e. add -4*8 + ((n-1)%4)*8
+ lea (-4*8)(%rdi,%rbp,8), %rdi
+ lea (8-4*8)(%rsi,%rbp,8), %rsi
+C lea 8(%rsi), %rsi C already loaded the first limb
+
+ cmp $2, %rbp
+C FIXME: definitely use a computed jump; this is slow on small n when it's the last branch that's taken.
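+C Untested sketch of that computed jump, assuming non-PIC code and a hypothetical L(c2_entry)
+C table of entry points (PIC would need a RIP-relative lea plus a register-indirect jmp):
+C jmp *L(c2_entry)(,%rbp,8) C one indirect jump instead of up to three compare-and-branches
+C L(c2_entry): .quad L(c2_plus1), L(c2_3), L(c2_2), L(c2_1) C entry 0 is never reached; the jz above already handled (n-1)%4==0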
+ jg L(c2_1) C %rbp=3, (n-1)%4=3, n=4m
+ je L(c2_2) C %rbp=2, (n-1)%4=2, n=4m-1
+ jl L(c2_3) C %rbp=1, (n-1)%4=1, n=4m-2
+
+C jmp *(L(c2_loop)+%rbp)
+
+L(c2_plus1):
+ lea 8(%rsi), %rsi
+C add $4, %rdx
+
+C ALIGN(16) C loop is <= 18 insn and <= 64 aligned bytes, so it fits in Core 2's loop stream buffer and alignment doesn't matter
+C further unrolling would push it beyond the size of the loop stream detector (it's already close in bytes)
+C use simple addressing modes, not (%rdi,%rdx,8).  That generates too many ALU uops, which compete with the shifts.
+L(c2_loop): C start here for n = 4m+1, with rdi,rsi -= 0.  (n-1)%4 = 0
+ mov 0(%rsi), %rbx C load next higher limb
hunk ./rshift.asm 393
- mov %rax,(%rdi,%rdx,8) C store it
- add $1, %rdx
-C inc %rdx
-L(c2_1): mov (%rsi,%rdx,8),%rax
+ mov %rax, 0(%rdi) C store it
+C stosq
+L(c2_1): mov 8(%rsi), %rax C start here for n = 4m, with rdi,rsi -= 1*8.  (n-1)%4 = 3
hunk ./rshift.asm 397
- mov %rbx, (%rdi,%rdx,8)
- add $1, %rdx
+ mov %rbx, 8(%rdi)
+C mov %rbx, (%rdi)
+C add $8, %rdi
+L(c2_2): mov 16(%rsi), %rbx C start here for n = 4m-1, with rdi,rsi -= 2*8.  (n-1)%4 = 2
+ shrd %cl, %rbx, %rax C compute result limb
+ mov %rax, 16(%rdi) C store it
+C stosq
+L(c2_3): mov 24(%rsi), %rax C start here for n = 4m-2, with rdi,rsi -= 3*8.  (n-1)%4 = 1
+ shrd %cl, %rax, %rbx
+ mov %rbx, 24(%rdi)
+C mov %rbx, (%rdi)
+C add $8, %rdi
+
+ add $32, %rsi
+ add $32, %rdi
+ sub $4, %rdx
hunk ./rshift.asm 414
- jnz L(c2_loop)
+ jg L(c2_loop)
hunk ./rshift.asm 420
+C mov %r8, %rax
+ pop %rbp
hunk ./rshift.asm 423
-C movq %xmm2, %rbx
hunk ./rshift.asm 425
+C probably get rid of this and put an extra insn or two on the n=1 path, allowing a jump to c2_end
hunk ./rshift.asm 429
-C popq %rax C return val
- movq %xmm2, %rbx
+C mov %r9, %rbx
+ pop %rbp
+ pop %rbx
}
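For reference, a small C model (a sketch, not GMP's code; model_rshift is a made-up name) of
what the unrolled loop computes: peel (n-1)%4 limbs first, then handle 4 limbs per iteration,
which is the same remainder handling that the c2_1/c2_2/c2_3 entry points above implement by
jumping into the middle of the loop.  mpn_rshift semantics assumed: shift {up,n} right by cnt
(1 <= cnt <= 63) into {rp,n}, returning the bits shifted out of the low limb in the top cnt
bits of the return value.

#include <stdint.h>

typedef uint64_t limb;

limb model_rshift(limb *rp, const limb *up, long n, unsigned cnt)
{
    unsigned tnc = 64 - cnt;
    limb retval = up[0] << tnc;             /* bits shifted out at the low end */
    long i = 0;

    switch ((n - 1) & 3) {                  /* the asm jumps into the unrolled loop instead */
    case 3: rp[i] = (up[i] >> cnt) | (up[i+1] << tnc); i++; /* fall through */
    case 2: rp[i] = (up[i] >> cnt) | (up[i+1] << tnc); i++; /* fall through */
    case 1: rp[i] = (up[i] >> cnt) | (up[i+1] << tnc); i++; /* fall through */
    case 0: break;
    }
    for (; i + 4 < n; i += 4) {             /* remaining full limbs are a multiple of 4 */
        rp[i]   = (up[i]   >> cnt) | (up[i+1] << tnc);
        rp[i+1] = (up[i+1] >> cnt) | (up[i+2] << tnc);
        rp[i+2] = (up[i+2] >> cnt) | (up[i+3] << tnc);
        rp[i+3] = (up[i+3] >> cnt) | (up[i+4] << tnc);
    }
    rp[n-1] = up[n-1] >> cnt;               /* top limb: zeros shift in */
    return retval;
}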