[distribute ALU ops through the loop. add commented-out 8-way unroll
peter@cordes.ca**20080323013830
 Spreading out the ALU ops doesn't seem to make any difference.
 The 8-way unroll is slightly faster with large n, but way slower with
 small n, where the intro loop has to run more iterations.
 To enable the 8-way unroll, change test $3, %dl to test $7, %dl and
 uncomment the bottom half of the main loop.
] {
hunk ./rshift.asm 410
-C further unrolling will push it beyond the size of the loop stream detector. (already close in bytes). 8 limbs/iter runs at ~1.67 c/l
+C further unrolling will push it beyond the size of the loop stream detector. (already close in bytes).
+C 8 limbs/iter runs at 1.202 - 1.315 c/l with ALIGN(16). (slow intro loop has to do more, though...)
hunk ./rshift.asm 416
+	add $32, %rsi
hunk ./rshift.asm 420
-	mov 8(%rsi), reg4
+	mov (8-32)(%rsi), reg4
hunk ./rshift.asm 422
+	add $32, %rdi
hunk ./rshift.asm 425
-	mov 16(%rsi),reg1
-	mov reg2, 16(%rdi)
+	mov (16-32)(%rsi),reg1
+	mov reg2, (16-32)(%rdi)
hunk ./rshift.asm 429
-	mov 24(%rsi),reg2
-	lea 32(%rsi),%rsi
-	mov reg3, 24(%rdi)
-	lea 32(%rdi),%rdi
-	sub $4, n
+	sub $4, n
+	mov (24-32)(%rsi),reg2
+	mov reg3, (24-32)(%rdi)
+
+C	shrd %cl, reg1, reg4
+C	mov 32(%rsi), reg3
+C	mov reg4, 32(%rdi)
+C C L(c2_10):
+C	shrd %cl, reg2, reg1
+C	mov 40(%rsi), reg4
+C	mov reg1, 40(%rdi)
+C C L(c2_01):
+C	shrd %cl, reg3, reg2
+C	mov 48(%rsi),reg1
+C	mov reg2, 48(%rdi)
+C C L(c2_00):
+C	shrd %cl, reg4, reg3
+C	mov 56(%rsi),reg2
+C	lea 64(%rsi),%rsi
+C	mov reg3, 56(%rdi)
+C	lea 64(%rdi),%rdi
+C	sub $8, n
+
}
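
For reference, a minimal C sketch of what the loop computes, assuming
rshift.asm implements a GMP-style mpn_rshift (the function name and limb
typedef below are illustrative, not from the patch).  Each
shrd %cl, hi, lo in the asm computes (lo >> cnt) | (hi << (64 - cnt));
the asm unrolls this 4 (optionally 8) limbs per iteration.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t limb;   /* one 64-bit limb, matching the asm's registers */

    /* Shift {src, n} right by cnt bits (0 < cnt < 64, n >= 1) into dst.
       Returns the shifted-out bits in the high bits of the result,
       following the mpn_rshift convention.  The loop body is what one
       shrd-and-store group in the asm computes for a single limb. */
    limb rshift_sketch(limb *dst, const limb *src, size_t n, unsigned cnt)
    {
        limb ret = src[0] << (64 - cnt);     /* bits falling off the bottom */
        for (size_t i = 0; i + 1 < n; i++)   /* each limb takes bits from the next */
            dst[i] = (src[i] >> cnt) | (src[i + 1] << (64 - cnt));
        dst[n - 1] = src[n - 1] >> cnt;      /* top limb: shift zeros in */
        return ret;
    }

In the patched asm, add $32, %rsi / add $32, %rdi and sub $4, n are the
distributed ALU ops: they move from a cluster at the bottom of the loop
up between the shrd/mov groups, and the (8-32)(%rsi)-style displacements
keep the effective addresses identical to the old 8(%rsi) forms relative
to the not-yet-bumped pointers.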