[new layout of mostly the same code. only 2 icache lines for n<12, and mostly better speed
peter@cordes.ca**20080326023333] {
hunk ./rshift.asm 317
+C note: core2 has 3 exec ports for add, only 1 for lea. So don't use lea when an add will do, except to preserve flags
+
hunk ./rshift.asm 395
-	xor	%eax, %eax
+	xor	%eax, %eax	C no advantage to using %rax. this first block would still fit in 16B with a REX
hunk ./rshift.asm 397
-	push	%rax		C save retval
+	push	%rax		C save retval.
+	jmp	L(c2_unrolled)	C faster to _not_ have this through the decoders on the first cycle, when n < 12.
+C < 16B < 6 instr: first fetch.
+
+ALIGN(16)
+L(c2_unrolled_cleanup):
+	C require: reg8=limb(i). reg1=limb(i+1) reg2=limb(i+2) reg3=limb(i+3). n<=256
+	C in 4-limb unroll, reg8->reg4
+
+	shrd	%cl, reg1, reg8
+	mov	reg8, (%rdi)
+	add	$24, %rdi
+	shrd	%cl, reg2, reg1
+	mov	reg1, (8-24)(%rdi)
+
+	mov	reg3, reg8
+
+	shrd	%cl, reg3, reg2
+	mov	reg2, (16-24)(%rdi)
+
+	add	$(unroll-1), %dl
+	jz	L(c2_end)
+C	jmp	L(c2_entry_after_unrolled)
hunk ./rshift.asm 421
-	cmp	$3, n	C if n-1>=pipeline depth, we can use that cleanup code
-	jg	L(c2_unrolled)
-	jmp	L(c2_entry)
hunk ./rshift.asm 431
-	dec	n
+	dec	%dl
hunk ./rshift.asm 438
-C ALIGN(8) would align the branch target. only needed if near the end of a 16byte fetch, causing a bubble.
+C ALIGN(8) C would align the branch target. only needed if near the end of a 16byte fetch, causing a bubble.
hunk ./rshift.asm 455
+ALIGN(16)	C jumped to for all n, after a 5-insn fetch. And nothing drops in to it.
hunk ./rshift.asm 457
+	cmp	$3, n	C if n-1>=pipeline depth, we can use that cleanup code
+	jle	L(c2_entry)
hunk ./rshift.asm 461
-	mov	24(%rsi), reg3
+	add	$32, %rsi	C This is the pipeline depth, not the unroll count
+	sub	$(3+unroll), n	C n is still possibly > 2^32
+	mov	(24-32)(%rsi), reg3
+	jle	L(c2_unrolled_cleanup)	C end of basic block with 8 insns. good.
+	C else drop into the main loop. for large n, we've had 1 taken jmp, and 2 not taken jcc.
hunk ./rshift.asm 467
-C	mov	24(%rsi), reg4
-	lea	32(%rsi), %rsi
-	sub	$(3+unroll), n
-	jle	L(c2_unrolled_cleanup)
+C IP=124B from start. Only 2 icache lines if we don't touch the unrolled loop
hunk ./rshift.asm 471
-C loop is <= 18 insn and <= 4 16byte aligned blocks, so fits into Core 2's loop stream buffer, so alignment doesn't matter
+C 4-limb loop is <= 18 insn and <= 4 16byte aligned blocks, so fits into Core 2's loop stream buffer without ALIGN.
hunk ./rshift.asm 476
+C add/lea instructions placed by trial and error to avoid pipeline stalls.
hunk ./rshift.asm 485
-ifdef(`C2_UNROLL8',, `add $32, %rsi; define(`srcoff',32)')
+	ifdef(`C2_UNROLL8',, `add $32, %rsi; define(`srcoff',32)')
hunk ./rshift.asm 489
-ifdef(`C2_UNROLL8', `lea 64(%rsi),%rsi; define(`srcoff',64)',)
+	ifdef(`C2_UNROLL8', `add $64, %rsi; define(`srcoff',64)',)
hunk ./rshift.asm 492
-ifdef(`C2_UNROLL8',, `add $32, %rdi; define(`dstoff',32)')
+	ifdef(`C2_UNROLL8',, `add $32, %rdi; define(`dstoff',32)')
hunk ./rshift.asm 497
-ifdef(`C2_UNROLL8',, `sub $4, n')
+	ifdef(`C2_UNROLL8',, `sub $4, n')
hunk ./rshift.asm 505
-ifdef(`C2_UNROLL8', `lea 64(%rdi),%rdi; define(`dstoff',64)',)
+	ifdef(`C2_UNROLL8', `add $64, %rdi; define(`dstoff',64)',)
hunk ./rshift.asm 517
-ifdef(`C2_UNROLL8', `sub $8, n',)
+	ifdef(`C2_UNROLL8', `sub $8, n',)
hunk ./rshift.asm 524
-
-C	jmp	L(c2_unrolled_cleanup)
-C move this before the loop for better small-n perf, probably
-
-	C require: reg8=limb(i). reg1=limb(i+1) reg2=limb(i+2) reg3=limb(i+3)
-	C in 4-limb unroll, reg8->reg4
-L(c2_unrolled_cleanup):
-
-	shrd	%cl, reg1, reg8
-	mov	reg8, (%rdi)
-	shrd	%cl, reg2, reg1
-	mov	reg1, 8(%rdi)
-
-	add	$24, %rdi
-	mov	reg3, reg8
-
-	shrd	%cl, reg3, reg2
-	mov	reg2, (16-24)(%rdi)
-
-	add	$unroll,n
-	jmp	L(c2_entry_after_unrolled)
-
+	C RIP=xxxc if loop start was align(16).
+	C might be optimal to duplicate a 3-byte insn here before the jmp
+	jmp	L(c2_unrolled_cleanup)
+C 236B total. could duplicate some code down here...
}
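
C For reference (not part of the patch): a minimal C sketch of the per-limb
C operation the shrd sequences above compute, assuming 64-bit limbs and the
C usual mpn_rshift convention that the bits shifted out of the low limb are
C returned left-justified. The names limb_t and rshift_ref are illustrative,
C not taken from rshift.asm.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t limb_t;

    /* Shift {up, n} right by cnt bits (1 <= cnt <= 63) into {rp, n}.
       Each shrd %cl, next, cur step corresponds to one
       (cur >> cnt) | (next << (64 - cnt)) term below. */
    static limb_t rshift_ref(limb_t *rp, const limb_t *up, size_t n, unsigned cnt)
    {
        limb_t retval = up[0] << (64 - cnt);   /* bits shifted out the low end */
        for (size_t i = 0; i + 1 < n; i++)
            rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
        rp[n - 1] = up[n - 1] >> cnt;          /* top limb is zero-filled */
        return retval;
    }

C The asm keeps four limbs live in registers so each shrd reuses a value that
C was already loaded as the "next" limb of the previous step.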