[working computed-jmp version
peter@cordes.ca**20080320084938
 Not as fast as the triple-branch version, except when the triple-branch
 dispatch fell through to the last branch.  But hopefully less polluting
 of the branch predictor.
] {
hunk ./rshift.asm 245
+ALIGN(64)
hunk ./rshift.asm 287
+	C todo: use shrd to shorten the outro, and put the start of this fn within -128 bytes of the jge in the non-sse2 version
hunk ./rshift.asm 313
-C non-SSE rshift for Core 2, copied from 32bit mpn/x86/rshift.asm
+C non-SSE rshift for Core 2, basic structure copied from 32bit mpn/x86/rshift.asm
hunk ./rshift.asm 317
-C Conroe:
-C size 1	8.000 (9.024 w/cmp jge sse2)	(not unrolled: 9.024 c/l)
-C size 2	(not unrolled: 6.5 c/l)
+C Conroe: timings _with_ cmp/jge sse2, since that's how it will be for real
+C size 1	9.024 (8.000 w/o cmp jge sse2)	(not unrolled: 9.024 c/l)
+C size 2	7 c/l	(not unrolled: 6.5 c/l)
hunk ./rshift.asm 321
-C size 4	3.780	(not unrolled: 3.76 c/l)
+C size 4	3.780	(not unrolled: 3.76 c/l).  computed jmp: 4.260
hunk ./rshift.asm 327
-C size 17000	13.025	beyond here, take timings
+C size 17000	13.025	beyond here, look at timings for the sse2 routine which we call instead.
hunk ./rshift.asm 332
+C probably if we care about small-n performance, we should have a separate version of the function.
+C Conroe:	computed goto		; triple-jcc version (downside: pollutes the branch predictor)
+C SIZE=1;	10.016 cycles/limb	8.992
+C SIZE=2;	 7.008			7.464
+C SIZE=3;	 5.333 cycles/limb	5.013
+C SIZE=4;	 4.260 cycles/limb	5.760
+C SIZE=5;	 3.206 cycles/limb	3.014
+C SIZE=6;	 3.509 cycles/limb	3.509
+C SIZE=7;	 3.163 cycles/limb	2.999
+C SIZE=8;	 2.934 cycles/limb	2.718
+C SIZE=9;	 2.596 cycles/limb	2.347
+C SIZE=10;	 2.693 cycles/limb	2.710
+C SIZE=496;	 1.571 cycles/limb	1.571
+
+
hunk ./rshift.asm 352
-ALIGN(1024)
+C ALIGN(1024)
+L(sse2_thresh):	.word	16000
+	.word	0
+	.word	0
+	.word	0
+C ALIGN(16)
hunk ./rshift.asm 364
+	C would like to get lots of instructions into the OOO execution engine early, so it has plenty to work on...
hunk ./rshift.asm 366
+C	cmp	L(sse2_thresh)(%rip), %rdx	C no addressing mode can make this take less space, so use imm32
hunk ./rshift.asm 370
-C	mov	%rbx, %r9	C regs >= 8 need an extra prefix to access, so just use for saving
+C	mov	%rbx, %r9	C regs >= 8 need an extra prefix to access, so just use for saving.  push takes fewer bytes
hunk ./rshift.asm 377
+	C do some work that's wasted for (n-1)%4==0.  This hopefully shortens the critical path to the computed jump
+	mov	%rdx, %rbp
+	sub	$1, %rbp
+	and	$3, %rbp
+	imul	$-12, %rbp, %r8		C do this early to hide the latency
+	lea	(L(c2_1)+3*12)(%r8), %r8
+
hunk ./rshift.asm 393
-C	mov	%rax, %r8	C save ret val
hunk ./rshift.asm 396
-	mov	%rdx, %rbp
-	and	$3, %rbp	C %rbp = (n-1)%4
+C	mov	%rdx, %rbp
+C	and	$3, %rbp	C %rbp = (n-1)%4
+	test	%rbp, %rbp
hunk ./rshift.asm 401
+C moved earlier
+C	mov	%rbp, %r8
+C	sub	$3, %r8		C %rbp: 0->c2_1.  -1->c2_2.  -2->c2_3.
+C	imul	$-12, %rbp, %r8	C each mov and shrd takes 4 bytes, so the loop uses 12 bytes per block
+C	lea	(L(c2_1)+3*12)(%r8), %r8
+
hunk ./rshift.asm 412
-	cmp	$2, %rbp
+
+C	cmp	$2, %rbp
hunk ./rshift.asm 415
-	jg	L(c2_1)		C %rbp=3, (n-1)%4=3, n=4m
-	je	L(c2_2)		C %rbp=2, (n-1)%4=2, n=4m-1
-	jl	L(c2_3)		C %rbp=1, (n-1)%4=1, n=4m-2
+C	jg	L(c2_1)		C %rbp=3, (n-1)%4=3, n=4m
+C	je	L(c2_2)		C %rbp=2, (n-1)%4=2, n=4m-1
+C	jl	L(c2_3)		C %rbp=1, (n-1)%4=1, n=4m-2
hunk ./rshift.asm 419
-C	jmp	*(L(c2_loop)+%rbp)
+C	jmp	*L(c2_1)(%r8)
+	jmp	*%r8
hunk ./rshift.asm 427
-C further unrolling will push it beyond the size of the loop stream detector. (already close in bytes)
+C further unrolling will push it beyond the size of the loop stream detector (already close in bytes).  8 limbs/iter runs at ~1.67 c/l
hunk ./rshift.asm 430
-	mov	0(%rsi), %rbx	C load next higher limb
+	mov	(%rsi), %rbx	C load next higher limb
hunk ./rshift.asm 432
-	mov	%rax, 0(%rdi)	C store it
+	mov	%rax, (%rdi)	C store it
+C	xchg	%eax, %eax	C 2-byte nop.  might be better to use zero displacements in the above addressing modes,
+	C but we'd still have to treat loop entry as special, because of the lea needed before falling in.
hunk ./rshift.asm 451
+L(c2_4):	C jumping in here could be better than coming in at the top
hunk ./rshift.asm 462
-C	mov	%r8, %rax
}
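
Editor's sketch of the dispatch idea: the patch replaces the jg/je/jl triple
branch on (n-1)%4 with a single indirect jmp whose target is computed
arithmetically (the imul $-12 plus lea above), i.e. a Duff's-device-style
entry into the 4-way-unrolled loop, exploiting the fact that each unrolled
block is 12 bytes of code.  The C analogue below expresses the same dispatch
with GCC's labels-as-values extension; the function name rshift_sketch and
its layout are hypothetical illustrations under the usual mpn_rshift contract
(1 <= cnt <= 63, 64-bit limbs), not code taken from rshift.asm.

#include <stddef.h>
#include <stdint.h>

uint64_t
rshift_sketch (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
  /* entry[r]: enter the unrolled loop so the first (partial) pass does
     r combines; r == 0 enters at the top for a full pass of 4.  The asm
     computes this address instead of loading it from a table:
     target = L(c2_1) + 3*12 - 12*r.  */
  static void *const entry[4] = { &&do4, &&do1, &&do2, &&do3 };
  uint64_t retval = up[0] << (64 - cnt); /* bits shifted out the bottom */
  size_t count = n - 1;                  /* limbs combining two sources */
  size_t iters = (count + 3) / 4;        /* total passes through the loop */
  size_t i = 0;

  if (count == 0)
    goto tail;
  goto *entry[count & 3];   /* one indirect jmp replaces jg/je/jl */

do4:  rp[i] = (up[i] >> cnt) | (up[i+1] << (64 - cnt)); i++;
do3:  rp[i] = (up[i] >> cnt) | (up[i+1] << (64 - cnt)); i++;
do2:  rp[i] = (up[i] >> cnt) | (up[i+1] << (64 - cnt)); i++;
do1:  rp[i] = (up[i] >> cnt) | (up[i+1] << (64 - cnt)); i++;
  if (--iters)
    goto do4;

tail:
  rp[count] = up[count] >> cnt;          /* top limb shifts in zeros */
  return retval;
}

Unlike the branch tree, the computed jump occupies a single branch-predictor
entry (the indirect jmp), which is the "less polluting" hope in the patch
header; the price is the imul/lea dependency chain in front of the jump, and
the SIZE table above shows the two versions trading wins at small n.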