[shufpd version faster than movhlps for size=4, same rest of the time peter@cordes.ca**20080315060439] { hunk ./rshift.asm 147 - pxor %xmm6, %xmm6 C break the partial-reg dependency. 2.0 cycles/limb instead of 2.5 on conroe and penryn - movhlps %xmm3, %xmm6 +C pxor %xmm6, %xmm6 C break the partial-reg dependency. 2.0 cycles/limb instead of 2.5 on conroe and penryn +C movhlps %xmm3, %xmm6 hunk ./rshift.asm 150 - punpcklqdq %xmm7, %xmm6 C dest=dest[0],src[0] +C punpcklqdq %xmm7, %xmm6 C dest=dest[0],src[0] hunk ./rshift.asm 153 -C movdqa %xmm3, %xmm6 + movdqa %xmm3, %xmm6 hunk ./rshift.asm 155 -C shufpd $1, %xmm7, %xmm6 C %6=limbs(1,2). take dest[1],src[0], so op=1+0<<1 + shufpd $1, %xmm7, %xmm6 C %6=limbs(1,2). take dest[1],src[0], so op=1+0<<1 }