[better-pipelined main loop based on Torbjorn's work. With good short-loop intro
peter@cordes.ca**20080322105200
 runs 1.331c/l
] {
hunk ./rshift.asm 335
-C conroe: computed goto ; triple jcc version (downside: pollutes the branch predictor)
-C SIZE=1; 10.016 cycles/limb 8.992
-C SIZE=2; 7.008 7.464
-C SIZE=3; 5.333 cycles/limb 5.013
-C SIZE=4; 4.260 cycles/limb 5.760
-C SIZE=5; 3.206 cycles/limb 3.014
-C SIZE=6; 3.509 cycles/limb 3.509
-C SIZE=7; 3.163 cycles/limb 2.999
-C SIZE=8; 2.934 cycles/limb 2.718
-C SIZE=9; 2.596 cycles/limb 2.347
-C SIZE=10; 2.693 cycles/limb 2.710
-C SIZE=496; 1.571 cycles/limb 1.571
+C conroe: intro loop ; computed goto ; triple jcc versions (triple jcc pollutes the branch predictor)
+C SIZE=1; 11.008 10.016 cycles/limb 8.992
+C SIZE=2; 6.504 7.008 7.464
+C SIZE=3; 5.355 5.333 cycles/limb 5.013
+C SIZE=4; 4.760 4.260 cycles/limb 5.760
+C SIZE=5; 3.014 3.206 cycles/limb 3.014
+C SIZE=6; 3.005 3.509 cycles/limb 3.509
+C SIZE=7; 2.999 3.163 cycles/limb 2.999
+C SIZE=8; 3.006 2.934 cycles/limb 2.718
+C SIZE=9; 2.222 2.596 cycles/limb 2.347
+C SIZE=10; 2.306 2.693 cycles/limb 2.710
+C SIZE=496; 1.331 1.571 cycles/limb 1.571
hunk ./rshift.asm 355
-C L(sse2_thresh): .word 16000
-C .word 0
-C .word 0
-C .word 0
hunk ./rshift.asm 357
- C If we're working from L2 cache, because args don't fit in L1, sse2 goes ~9c/l, this fn goes ~13c/l
- C From main memory, sse2 goes ~11.5c/l, while we go ~14c/l
- C TODO: find a good threshold. With shift.c running in a loop, the jump is between n=16000 and 17000.
- C If src isn't perfectly cached, the threshold would probably be lower.
+C C would like to get lots of instructions into the OOO execution engine early so it has plenty to work on...
+C cmp $16000, %rdx
+C jge mpn_rshift_sse2 C TODO: and not penryn/harpertown unless we can use the aligned version
hunk ./rshift.asm 361
- C would like to get lots of instructions into the OOO execution engine early so it has plenty to work on...
- cmp $16000, %rdx
-C cmp L(sse2_thresh)(%rip), %rdx C no addressing mode can make this take less space, so use imm32
- jge mpn_rshift_sse2 C TODO: and not penryn/harpertown unless we can use the aligned version
+
+C regs for the loop. use macros to make register reallocation easy.
+define(n,%rdx)
+define(reg1,%r9)
+define(reg2,%rax)
+define(reg3,%r8) C referenced the fewest times
+define(reg4,%r11)
hunk ./rshift.asm 370
-C mov %rbx, %r9 C regs >= 8 need an extra prefix to access, so just use for saving. push takes fewer bytes
- push %rbx
- mov (%rsi), %rbx C %rbx = limb0
- push %rbp
+C push reg2
+C push reg4
+ mov (%rsi), reg4 C reg4 = limb0
hunk ./rshift.asm 374
- shrd %cl, %rbx, %rax C %rax = ret val limb. %rbx still = limb0
-
- sub $1, %rdx C %rdx = n-1
- jle L(c2_end_short) C if(n<=1), no loop. %rdi on entry points to top (only) limb
- add $8, %rsi
+ shrd %cl, reg4, %rax C %rax = ret val limb. reg4 still = limb0
hunk ./rshift.asm 377
+C mov %rsi, %r9
+ jmp L(c2_entry)
hunk ./rshift.asm 380
- mov (%rsi), %rax
- shrd %cl, %rax, %rbx
- mov %rbx, (%rdi)
- mov %rax, %rbx
+ shrd %cl, reg1, reg4
+ mov reg4, (%rdi)
+ mov reg1, reg4
hunk ./rshift.asm 384
- sub $1, %rdx
- jz L(c2_end)
+C add $8, %r9
+L(c2_entry):
+ dec n C sub looks like it makes things align better, but dec has the same timings
+C sub $1, n
+ jle L(c2_end)
hunk ./rshift.asm 390
+ mov (%rsi), reg1 C reg4=limb0 reg1=limb1
hunk ./rshift.asm 395
-C %rax=%rbx=limb(i). %rdx=n-i-1 %rsi -> next limb to load
-C mov %rbx, %rax C get limb0 in both regs, so we can jump into the loop anywhere.
-
- C do some work that's wasted for (n-1)%4==0. This hopefully shortens the critical path for the computed jump
-C mov %rdx, %rbp
-C sub $1, %rbp
-C and $3, %rbp
-C imul $-12, %rbp, %r8 C do this early to hide the latency
-C lea (L(c2_1)+3*12)(%r8), %r8
-
-C lea -8(%rdi,%rdx,8), %rdi C rdi = last limb
-C lea (%rsi,%rdx,8), %rsi C rsi = last limb + 1.
-
-C neg %rdx
-C add $1, %rdx C n= -n + 1
-C %rax = ret val; %rbx=limb0; %rcx=shift count; %rdx=size
-
-C mov %rdx, %rbp
-C and $3, %rbp C %rbp = (n-1)%4
-C test %rbp, %rbp
-C jz L(c2_plus1) C (n-1)%4==0, n==4m+1. special case: don't need to adjust pointers (and the code below would fail because (n-1)%4 = 0, not 4.)
-
-C moved earlier
-C mov %rbp, %r8
-C sub $3, %r8 C %rbp: 0->c2_1. -1->c2_2. -2->c2_3.
-C imul $-12, %rbp, %r8 C each mov and shrd takes 4 bytes, so loop uses 12 bytes per block
-C lea (L(c2_1)+3*12)(%r8), %r8 C requires a 4-byte displacement :(
+C reg4=limb(i) reg1=limb(i+1). %rdx=n-i-1, %rdx%4=0 %rsi -> limb(i+1)
hunk ./rshift.asm 397
-C adjust pointers for jump into loop: %rsi, %rdi -= (4 - (n-1)%4)*8 == add -4*8 + ((n-1%4))*8
-C lea (-4*8)(%rdi,%rbp,8), %rdi
-C lea (8-4*8)(%rsi,%rbp,8), %rsi
-C lea 8(%rsi), %rsi C already loaded the first limb
+C mov (%rsi), reg1
+ mov 8(%rsi), reg2
+ lea 16(%rsi),%rsi
hunk ./rshift.asm 401
+C debug: %r9 = %rsi
+C mov (%r9), reg4
+C mov 8(%r9), reg1
+C mov 16(%r9),reg2
hunk ./rshift.asm 406
-C cmp $2, %rbp
-C FIXME: definitely use a computed jump; this is slow on small n. when it's the last branch that's taken
-C jg L(c2_1) C %rbp=3, (n-1)%4=3, n=4m
-C je L(c2_2) C %rbp=2, (n-1)%4=2, n=4m-1
-C jl L(c2_3) C %rbp=1, (n-1)%4=1, n=4m-2
+C require: reg1=limb1; reg2=limb2; reg3=xxx; reg4=limb0
hunk ./rshift.asm 408
-C jmp *L(c2_1)(%r8)
-C jmp *%r8
-
-L(c2_plus1):
-C lea 8(%rsi), %rsi
-C add $4, %rdx
-
-C ALIGN(16) C loop is <= 18 insn and <= 64 aligned bytes, so fits into Core 2's loop stream buffer, so alignment doesn't matter
+C loop is <= 18 insn and <= 4 16-byte aligned blocks, so it fits into Core 2's loop stream buffer and alignment doesn't matter
+C If the loop is misaligned so that it doesn't fit in the loop buffer, it runs ~1.57 c/l instead of ~1.33
hunk ./rshift.asm 411
-C use simple addressing modes, not (%rdi,%rdx,8). That generates too many ALU uops that compete with the shifts.
-L(c2_loop): C start here on n = 4m+1. with rdi,rsi -= 0. (n-1)%4 = 0
- mov (%rsi), %rbx C load next higher limb
- shrd %cl, %rbx, %rax C compute result limb
- mov %rax, (%rdi) C store it
-C xchg %eax, %eax C 2 byte nop. might be better to use zero displacements in above addressing modes.
- C but still have to treat loop entry as special because of the lea needed before falling in.
-C stosq
-L(c2_1): mov 8(%rsi), %rax C start here on n = 4m. with rdi,rsi -= 1*8. (n-1)%4=3
- shrd %cl, %rax, %rbx
- mov %rbx, 8(%rdi)
-C mov %rbx, (%rdi)
-C add $8, %rdi
-L(c2_2): mov 16(%rsi), %rbx C start here on n = 4m-1. with rdi,rsi -= 2*8 (n-1)%4=2
- shrd %cl, %rbx, %rax C compute result limb
- mov %rax, 16(%rdi) C store it
-C stosq
-L(c2_3): mov 24(%rsi), %rax C start here on n = 4m-2. with rdi,rsi -= 3*8 (n-1)%4=1
- shrd %cl, %rax, %rbx
- mov %rbx, 24(%rdi)
-C mov %rbx, (%rdi)
-C add $8, %rdi
-
-L(c2_4): C jump in here could be better than coming in the top
- add $32, %rsi
- add $32, %rdi
- sub $4, %rdx
-C inc %rdx
+C use simple addressing modes, not (%rdi,%rdx,8). That generates too many reads of not-in-flight register values.
+C ALIGN(16)
+L(c2_loop): shrd %cl, reg1, reg4
+ mov (%rsi), reg3
+ mov reg4, (%rdi)
+C L(c2_10):
+ shrd %cl, reg2, reg1
+ mov 8(%rsi), reg4
+ mov reg1, 8(%rdi)
+C L(c2_01):
+ shrd %cl, reg3, reg2
+ mov 16(%rsi),reg1
+ mov reg2, 16(%rdi)
+C L(c2_00):
+ shrd %cl, reg4, reg3
+ mov 24(%rsi),reg2
+ lea 32(%rsi),%rsi
+ mov reg3, 24(%rdi)
+ lea 32(%rdi),%rdi
+ sub $4, n
hunk ./rshift.asm 432
-
+C ALIGN(16) would align the branch target, but it doesn't seem to matter.
+C L(c2_endshort):
hunk ./rshift.asm 435
- shr %cl, %rax C compute most significant limb
- mov %rax, (%rdi) C store it
hunk ./rshift.asm 436
- pop %rbp
- pop %rbx
- ret
-
-C probably get rid of this and put an extra insn or two on the n=1 path, allowing a jump to c2_end
-L(c2_end_short):
- shrq %cl, %rbx C compute most significant limb
- movq %rbx, (%rdi) C store it
-C mov %r9, %rbx
- pop %rbp
- pop %rbx
+ shr %cl, reg4 C compute most significant limb
+ mov reg4, (%rdi) C store it
+C pop reg4
+C pop reg2
}
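For readers following the patch, here is a minimal C sketch of the structure the new code sets up: a short intro loop that handles limbs one at a time until the remaining count is a multiple of 4 (the role of the path reached via jmp L(c2_entry)), then a 4-way unrolled main loop in which each statement corresponds to one shrd/load/store group of L(c2_loop), and a final plain shift for the most significant limb. The names ref_rshift and LIMB_BITS are invented for illustration; this models only the limb arithmetic of an mpn_rshift-style routine, not the register allocation, the SSE2 threshold, or the actual GMP entry code.

/*
 * Minimal C sketch of the intro-loop + 4-way unrolled structure used in the
 * patch above.  ref_rshift and LIMB_BITS are illustrative names, not GMP API.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define LIMB_BITS 64

/* Shift the n-limb number up[0..n-1] right by cnt bits (1 <= cnt <= 63),
 * write the n result limbs to rp[], and return the bits shifted out of the
 * low limb, left-justified (what the asm comments call the ret val limb). */
static uint64_t ref_rshift(uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
    uint64_t retval = up[0] << (LIMB_BITS - cnt);
    size_t i = 0;

    /* Intro loop: one limb at a time until the remaining count is a
     * multiple of 4, the job of the short loop entered via jmp L(c2_entry). */
    while ((n - 1 - i) % 4 != 0) {
        rp[i] = (up[i] >> cnt) | (up[i + 1] << (LIMB_BITS - cnt));
        i++;
    }

    /* Main loop: four result limbs per iteration, one per shrd/load/store
     * group in L(c2_loop). */
    for (; i + 4 <= n - 1; i += 4) {
        rp[i]     = (up[i]     >> cnt) | (up[i + 1] << (LIMB_BITS - cnt));
        rp[i + 1] = (up[i + 1] >> cnt) | (up[i + 2] << (LIMB_BITS - cnt));
        rp[i + 2] = (up[i + 2] >> cnt) | (up[i + 3] << (LIMB_BITS - cnt));
        rp[i + 3] = (up[i + 3] >> cnt) | (up[i + 4] << (LIMB_BITS - cnt));
    }

    /* Most significant limb: a plain shift, as in the L(c2_end) epilogue. */
    rp[n - 1] = up[n - 1] >> cnt;
    return retval;
}

int main(void)
{
    uint64_t src[6] = { 0x123456789abcdef0ULL, 1, 2, 3, 4, 0x8000000000000000ULL };
    uint64_t dst[6];
    uint64_t out = ref_rshift(dst, src, 6, 4);
    printf("out=%016llx low=%016llx high=%016llx\n",
           (unsigned long long) out,
           (unsigned long long) dst[0],
           (unsigned long long) dst[5]);
    return 0;
}

The design point this mirrors: peel the leftover (n-1) mod 4 limbs in a cheap scalar loop so the unrolled body never needs a computed jump into its middle, and keep that body small enough to run from Core 2's loop stream buffer, as the patch's own comments note.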