[do the cleanup after the loop. debugged and working, and pretty fast
peter@cordes.ca**20080325055033
 This works well: we can have special cleanup for coming out of the pipeline.
 Tested with Electric Fence, and it doesn't read past the end of src.
] {
hunk ./rshift.asm 323
+C n limbs. The last one is special, since there is nothing to shift in.
+C unrolled loop does max(0, floor((n-1-pipedepth)/unroll)*unroll + pipedepth) limbs
+C shortloop does the rest, minus the final limb handled at L(c2_end)
+C unroll=8 pipedepth=3:
+C n=1. end:1 unrolled:0 shortloop:0 -> end
+C n=2. end:1 unrolled:0 shortloop:1 shortloop -> end
+C n=3. end:1 unrolled:0 shortloop:2 shortloop -> end
+C n=4. end:1 unrolled:0+3 shortloop:0 unroll_setup -> cleanup -> end
+C n=5. end:1 unrolled:0+3 shortloop:1 unroll_setup -> cleanup -> shortloop -> end
+C n=6. end:1 unrolled:0+3 shortloop:2 unroll_setup -> cleanup -> shortloop -> end
+C n=7. end:1 unrolled:0+3 shortloop:3 unroll_setup -> cleanup -> shortloop -> end
+C n=8. end:1 unrolled:0+3 shortloop:4 unroll_setup -> cleanup -> shortloop -> end
+C n=9. end:1 unrolled:0+3 shortloop:5 unroll_setup -> cleanup -> shortloop -> end
+C n=10 end:1 unrolled:0+3 shortloop:6 unroll_setup -> cleanup -> shortloop -> end
+C n=11 end:1 unrolled:0+3 shortloop:7 unroll_setup -> cleanup -> shortloop -> end
+C n>unroll+3:
+C n=12 end:1 unrolled:8+3 shortloop:0 unroll_setup -> unrolled -> cleanup -> end
+C n=13 end:1 unrolled:8+3 shortloop:1 unroll_setup -> unrolled -> cleanup -> shortloop -> end
+C n=14 end:1 unrolled:8+3 shortloop:2 unroll_setup -> unrolled -> cleanup -> shortloop -> end
+C n=15 end:1 unrolled:8+3 shortloop:3 unroll_setup -> unrolled -> cleanup -> shortloop -> end
+C n=16 end:1 unrolled:8+3 shortloop:4 unroll_setup -> unrolled -> cleanup -> shortloop -> end
+
+
hunk ./rshift.asm 357
-C still need to comment/uncomment loop counters/pointer increments. and the ALIGN(16)
+C still need to comment/uncomment the ALIGN(16)
+C ################ controls for 4/8 limb unroll ################
hunk ./rshift.asm 360
-C define(`C2_8REG')
+C define(`C2_8REG') C only works with 8-limb unroll.
hunk ./rshift.asm 367
-define(`reg2',%rax)
+define(`reg2',%rax) C %rax can't be reg1 or reg4/8
hunk ./rshift.asm 389
+C movq %rdi, %xmm0 C FIXME: debugging only
+C movq %rsi, %xmm1 C FIXME: debugging only
hunk ./rshift.asm 397
-C mov %rsi, %r9
+ cmp $3, n C if n-1 >= pipeline depth, we can use the unrolled loop's cleanup code
+ jg L(c2_unrolled)
hunk ./rshift.asm 400
+
hunk ./rshift.asm 402
+ mov (%rsi), reg1 C reg8=limb0 reg1=limb1
hunk ./rshift.asm 407
-C add $8, %r9
hunk ./rshift.asm 408
- dec n C sub looks like it makes things align better, but dec has the same timings
-C sub $1, n
- jle L(c2_end)
hunk ./rshift.asm 409
- mov (%rsi), reg1 C reg8=limb0 reg1=limb1
- test $(unroll-1), %dl
- jnz L(c2_shortloop)
+L(c2_entry_after_unrolled):
+ dec n
+C sub $1, n
+ jg L(c2_shortloop)
hunk ./rshift.asm 417
-C mov (%rsi), reg1
- mov 8(%rsi), reg2
- mov 16(%rsi), reg3
+C ALIGN(8) would align the branch target. Only needed if it falls near the end of a 16-byte fetch, causing a bubble.
+C L(c2_endshort):
+C XXXXXXXXXXXXXX tail end of function
+L(c2_end):
+ pop %rax C return val
+ shr %cl, reg8 C compute most significant limb
+ mov reg8, (%rdi) C store it
+ifdef(`C2_8REG', `
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+',)
+ ret
+C XXXXXXXXXXXXXX tail end of function
+
hunk ./rshift.asm 434
-C mov 24(%rsi), reg4
- lea 24(%rsi), %rsi
+L(c2_unrolled):
+ mov 8(%rsi), reg1
+ mov 16(%rsi), reg2
+ mov 24(%rsi), reg3
hunk ./rshift.asm 439
-C debug: %r9 = %rsi
-C mov (%r9), reg8
-C mov 8(%r9), reg1
-C mov 16(%r9),reg2
+C mov 24(%rsi), reg4
+ lea 32(%rsi), %rsi
+ sub $(3+unroll), n
+ jle L(c2_unrolled_cleanup)
hunk ./rshift.asm 454
-L(c2_loop):
+L(c2_loop): C does unroll*(%rdx / unroll) + 3 limbs
hunk ./rshift.asm 495
+L(c2_unrolled_tail):
hunk ./rshift.asm 497
-C ALIGN(8) would align the branch target. only needed if near the end of a 16byte fetch, causing a bubble.
-C L(c2_endshort):
-L(c2_end):
- pop %rax C return val
- shr %cl, reg8 C compute most significant limb
- mov reg8, (%rdi) C store it
-ifdef(`C2_8REG', `
- pop %r15
- pop %r14
- pop %r13
- pop %r12
-',)
- ret
+
+
+C jmp L(c2_unrolled_cleanup)
+C TODO: moving this block up, before the loop, would probably help small-n performance
+
+ C require: reg8=limb(i). reg1=limb(i+1) reg2=limb(i+2) reg3=limb(i+3)
+ C in the 4-limb unroll, reg8 -> reg4
+L(c2_unrolled_cleanup):
+
+ shrd %cl, reg1, reg8
+ mov reg8, (%rdi)
+ shrd %cl, reg2, reg1
+ mov reg1, 8(%rdi)
+
+ add $24, %rdi
+ mov reg3, reg8
+
+ shrd %cl, reg3, reg2
+ mov reg2, (16-24)(%rdi) C = 16(%rdi) before the add $24
+
+ add $unroll, n
+ jmp L(c2_entry_after_unrolled)
+
}
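
For reference (not part of the patch): the limb accounting in the comment
table above can be modeled in a few lines of C. This is a sketch assuming
the unroll=8 / pipedepth=3 values from the comments; it reproduces each
table row, showing how every n is split across the unrolled loop, the
pipeline cleanup, the shortloop, and the final limb.

/* Sketch only: models the dispatch/limb accounting, not GMP code. */
#include <stdio.h>

int main(void)
{
    const long unroll = 8, pipedepth = 3;
    for (long n = 1; n <= 16; n++) {
        long loop = 0, pipe = 0;
        if (n - 1 >= pipedepth) {           /* cmp $3, n ; jg L(c2_unrolled) */
            pipe = pipedepth;               /* limbs finished by the cleanup */
            loop = (n - 1 - pipedepth) / unroll * unroll;
        }
        long shortloop = n - 1 - loop - pipe;
        /* matches the table: loop + pipe + shortloop + 1 == n */
        printf("n=%2ld end:1 unrolled:%ld+%ld shortloop:%ld\n",
               n, loop, pipe, shortloop);
    }
    return 0;
}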
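Also for reference (not part of the patch): the shortloop, the unrolled
loop body, and L(c2_unrolled_cleanup) all compute the same per-limb shrd
operation; only L(c2_end) uses a plain shr for the most significant limb.
A hedged plain-C reference for what the whole routine computes (the
function name ref_rshift is illustrative, not from the patch; the return
value follows GMP's mpn_rshift convention):

#include <stdint.h>
#include <stddef.h>

/* Shift {src,n} right by cnt bits, n >= 1, 1 <= cnt <= 63, so the
   (64 - cnt) shifts below are well defined. */
uint64_t ref_rshift(uint64_t *dst, const uint64_t *src, size_t n,
                    unsigned cnt)
{
    uint64_t ret = src[0] << (64 - cnt);  /* bits shifted out, returned in
                                             the high bits (the saved %rax) */
    for (size_t i = 0; i < n - 1; i++)    /* shortloop / unrolled / cleanup */
        dst[i] = (src[i] >> cnt) | (src[i + 1] << (64 - cnt));  /* = shrd */
    dst[n - 1] = src[n - 1] >> cnt;       /* L(c2_end): shr %cl, reg8 */
    return ret;
}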