[faster intro avoiding 8bit reg access.  movhlps for data shuffling in the loop.  some register allocation changes
peter@cordes.ca**20080315054049] {
hunk ./rshift.asm 31
+C +++++++++++++++++++++ shufpd version with slow (%cl) intro ++++++++++++++++++++
hunk ./rshift.asm 74
+C ++++++++++++++++++++ fast intro (%ecx), movhlps version +++++++++++++++++++
+C AMD K8, movhlps version
+C	size 496	3.048 cycles/limb
+
+
hunk ./rshift.asm 113
-	movdqa	(%rsi), %xmm6		C %6 = limb0, limb1
+	movdqa	(%rsi), %xmm3		C %3 = limb0, limb1
hunk ./rshift.asm 115
-	sub	$64, %cl	C cnt must be <=64, so it's ok to operate on small version of it
-	neg	%cl		C we want 64-cnt in ecx as a shift count for getting the return value
-	movq	%xmm6, %rax	C %rax = limb0
-	shlq	%cl, %rax	C return value = limb0<<(64-cnt)
-	C seems to make no diff where we put pxor, so move it to function start if that helps alignment
-	pxor	%xmm2, %xmm2	C we need this for later, in L(out).
-
-	psllq	%xmm0, %xmm3	C %3=limb0<<64-c,limb1<<64-c
-	psrlq	%xmm1, %xmm6	C %6=limb0>>c,limb1>>c	NOTE xmm3 and xmm6 are shifted in opposite directions than in the code below.
-	punpckhqdq %xmm2, %xmm3	C %2=0,0;  %3=limb1<<64-c,0  %6=limb0>>c,limb1>>c
-	por	%xmm6, %xmm3	C %3=result limbs 0,1
+	movdqa	%xmm3, %xmm2
+	punpckhqdq %xmm6, %xmm3	C %6=0,0;  %3=limb1,0
+	psllq	%xmm0, %xmm3	C %3=limb1<<64-c,0
+	psrlq	%xmm1, %xmm2	C %2=limb0>>c,limb1>>c	NOTE xmm3 and xmm2 are shifted in opposite directions than in the code below.
+C require: %3=limb1<<64-c,0	%2=limb0>>c,limb1>>c
+	por	%xmm2, %xmm3	C %3=result limbs 0,1
}
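
For orientation, the intro above computes the standard limb right-shift semantics: the return value is the bits shifted out of the low limb (limb0<<(64-cnt)), and each result limb combines one source limb shifted right by cnt with the next-higher limb shifted left by 64-cnt. The following is a minimal scalar C sketch of that arithmetic, not the SIMD code in the patch; the function name rshift_ref is hypothetical, and it assumes 64-bit limbs and 0 < cnt < 64 as in an mpn_rshift-style interface.

#include <stdint.h>
#include <stddef.h>

/* Reference model of the limb-wise right shift the SSE2 intro/loop computes.
   Returns the bits shifted out of the low limb, i.e. up[0]<<(64-cnt),
   matching the "return value" comment in the removed intro.
   Assumes n >= 1 and 0 < cnt < 64. */
static uint64_t
rshift_ref (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
  uint64_t retval = up[0] << (64 - cnt);   /* what shlq %cl,%rax produced */
  for (size_t i = 0; i + 1 < n; i++)
    rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));  /* psrlq/psllq + por */
  rp[n - 1] = up[n - 1] >> cnt;            /* top limb gets zeros shifted in */
  return retval;
}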