[better block ordering to remove more branches
peter@cordes.ca**20080326041203
 saves a few cycles for most n.
] {
hunk ./rshift.asm 395
-	xor	%eax, %eax	C no advantage to using %rax. this first block would still fit in 16B with a REX
-	shrd	%cl, reg8, %rax	C %rax = ret val limb. %rbx still = limb0
+	xor	%eax, %eax
+	shrd	%cl, reg8, %rax	C %rax = ret val limb. reg8 still = limb0
hunk ./rshift.asm 398
-	jmp	L(c2_unrolled)	C faster to _not_ have this through the decoders on the first cycle, when n < 12.
-C < 16B < 6 instr: first fetch.
+C	jmp	L(c2_unrolled)	C faster to _not_ have this through the decoders on the first cycle, when n < 12.
+C L(c2_unrolled):
+	cmp	$3, n	C if n-1>=pipeline depth, we can use that cleanup code
+	jle	L(c2_entry)
+	mov	8(%rsi), reg1
+	mov	16(%rsi), reg2
hunk ./rshift.asm 405
-ALIGN(16)
+	add	$32, %rsi
+	mov	(24-32)(%rsi), reg3
+	sub	$(3+unroll), n	C n is still possibly > 2^32
+	jg	L(c2_loop)
+	C for large n, we've had 1 not taken jcc, and the above jcc is taken
+	C for n>3, n<12, we've had 2 not taken jcc
+C ALIGN(16)
hunk ./rshift.asm 442
-C	sub	$1, n
+C	sub	$1, %dl
hunk ./rshift.asm 449
-C L(c2_endshort):
hunk ./rshift.asm 464
-ALIGN(16)	C jumped to for all n, after a 5-insn fetch. And nothing drops in to it.
-L(c2_unrolled):
-	cmp	$3, n	C if n-1>=pipeline depth, we can use that cleanup code
-	jle	L(c2_entry)
-	mov	8(%rsi), reg1
-	mov	16(%rsi), reg2
-	add	$32, %rsi	C This is the pipeline depth, not the unroll count
-	sub	$(3+unroll), n	C n is still possibly > 2^32
-	mov	(24-32)(%rsi), reg3
-	jle	L(c2_unrolled_cleanup)	C end of basic block with 8 insns. good.
-	C else drop into the main loop. for large n, we've had 1 taken jmp, and 2 not taken jcc.
-
hunk ./rshift.asm 466
-C require: reg1=limb1; reg2=limb2; reg3=limb3; reg4=limb4; reg8=limb0
+C require: reg1=limb1; reg2=limb2; reg3=limb3; reg8=limb0
}
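
For context, a minimal C sketch of what this shrd-based routine computes (not GMP's tuned code; rshift_ref and its parameter names are hypothetical). It assumes 64-bit limbs and a shift count with 0 < cnt < 64, and shows why the xor %eax,%eax / shrd %cl, reg8, %rax pair at the top of the patch produces the return-value limb: shrd into a zeroed register leaves exactly the low limb's shifted-out bits, left-justified.

#include <stdint.h>
#include <stddef.h>

/* Sketch of mpn_rshift semantics, assuming 64-bit limbs and 0 < cnt < 64.
 * Each loop iteration corresponds to one shrd: it merges the low bits of
 * the next-higher limb into the current limb's shifted value.  The return
 * value is the low limb's shifted-out bits, left-justified, matching the
 * zeroed-destination shrd in hunk 395 above. */
uint64_t rshift_ref(uint64_t *rp, const uint64_t *sp, size_t n, unsigned cnt)
{
    uint64_t retval = sp[0] << (64 - cnt);  /* shrd with dst = 0 */
    for (size_t i = 0; i < n - 1; i++)
        rp[i] = (sp[i] >> cnt) | (sp[i + 1] << (64 - cnt));
    rp[n - 1] = sp[n - 1] >> cnt;           /* high limb is zero-filled */
    return retval;
}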