[edit benchmark and optimization comments
peter@cordes.ca**20080314185706] {
hunk ./rshift.asm 62
-C size 4: 4.275 cycles/limb (took a long time to settle. often swung up to 8.800)
+C size 4: 4.275 cycles/limb (took a long time to settle; often swung up to 8.800; probably branch-prediction problems)
hunk ./rshift.asm 73
+C movdqu (unaligned allowed, times for the aligned case) (Conroe 2.4GHz)
+C size 1: 14.000 cycles/limb
+C size 2: 6.990-7.050 cycles/limb
+C size 496, 4000: 4.048 cycles/limb
+C size 496001: 8.787-8.807 cycles/limb
+
+
hunk ./rshift.asm 86
+C further gains would take more loop unrolling or software pipelining to hide the latency of the load,
+C which would require a bigger intro/outro to avoid reading past the end of the array.
+C currently the function fits nicely into 128 bytes.
+
+C optimization: could maybe structure things so limb0,limb1 need the shuffle, to hide the latency.
+C probably would need another shuffle before storing, though
hunk ./rshift.asm 93
-C movdqu (unaligned allowed, times for the aligned case)
-C size 1: 14.000 cycles/limb
-C size 2: 6.990-7.050 cycles/limb
-C size 496, 4000: 4.048 cycles/limb
-C size 496001: 8.787-8.807
hunk ./rshift.asm 125
-	jge	L(out)		C j if ZF=0 and SF=OF. j if orig_n<=1, AFAICT.
+	jge	L(out)		C skip the loop if n<=2
hunk ./rshift.asm 130
-	movdqa	(%rsi,%rdx,8), %xmm7	C %7=limb2,limb3. If src is 16byte aligned, this can't cross a page boundary and segfault. we might harmlessly read past the end of the array of limbs if it's length is odd.
-
-C optimization: could maybe structure things so limb0,limb1 need the shuffle, to hide the latency.
-C probably would need another shuffle before storing, though
+	movdqa	(%rsi,%rdx,8), %xmm7	C %7=limb2,limb3. If src is 16-byte aligned, this can't cross a page boundary and segfault. We might harmlessly read past the end of the array of limbs if its length is odd.
hunk ./rshift.asm 141
+C require %7=limbs(2,3) (next iteration's limbs(0,1))
hunk ./rshift.asm 144
-	addq	$2, %rdx	C n= -n + 2
+	addq	$2, %rdx
}
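
A minimal C/SSE2 intrinsics sketch (the name rshift2_sse2 is hypothetical, not taken from rshift.asm) of the two-limbs-per-iteration shift the comments above describe: each destination limb is the current source limb shifted right by cnt, OR'd with the next limb shifted left by 64-cnt. The real asm uses aligned movdqa loads plus the shuffle the comments mention, and tolerates a harmless one-limb overread when the length is odd; this sketch uses unaligned loads instead, so it never reads past the end of the array.

#include <emmintrin.h>
#include <stdint.h>

/* Shift the n-limb number at src right by cnt bits (0 < cnt < 64),
 * writing to dst; returns the bits shifted out of the low limb
 * (mpn_rshift-style convention).  Hypothetical illustration only. */
static uint64_t rshift2_sse2(uint64_t *dst, const uint64_t *src,
                             long n, unsigned cnt)
{
    __m128i shr = _mm_cvtsi32_si128((int)cnt);        /* per-limb right-shift count */
    __m128i shl = _mm_cvtsi32_si128((int)(64 - cnt)); /* per-limb left-shift count  */
    uint64_t ret = src[0] << (64 - cnt);              /* bits falling off the bottom */
    long i = 0;

    /* two limbs per iteration: limbs(i,i+1) >> cnt combined with
     * limbs(i+1,i+2) << (64-cnt) */
    for (; i + 2 <= n - 1; i += 2) {
        __m128i lo = _mm_loadu_si128((const __m128i *)(src + i));
        __m128i hi = _mm_loadu_si128((const __m128i *)(src + i + 1));
        __m128i r  = _mm_or_si128(_mm_srl_epi64(lo, shr),
                                  _mm_sll_epi64(hi, shl));
        _mm_storeu_si128((__m128i *)(dst + i), r);
    }
    for (; i < n - 1; i++)                      /* leftover limb pair, scalar */
        dst[i] = (src[i] >> cnt) | (src[i + 1] << (64 - cnt));
    dst[n - 1] = src[n - 1] >> cnt;             /* top limb: zeros shift in */
    return ret;
}

The second, overlapping load of limbs(i+1,i+2) stands in for the shuffle discussed in the patch comments; presumably the shuffle is what lets the asm keep every 16-byte load aligned while still pairing each limb with its neighbor.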