[faster intro avoiding 8bit reg access.  movhlps for data shuffling in the loop.  some register allocation changes
peter@cordes.ca**20080315054049] {
hunk ./rshift.asm 31
+C +++++++++++++++++++++ shufpd version with slow (%cl) intro ++++++++++++++++++++
hunk ./rshift.asm 74
+C ++++++++++++++++++++ fast intro (%ecx), movhlps version +++++++++++++++++++
+C AMD K8, movhlps version
+C	size 496	3.048 cycles/limb
+
+
hunk ./rshift.asm 113
-	movdqa	(%rsi), %xmm6		C %6 = limb0, limb1
+	movdqa	(%rsi), %xmm3		C %3 = limb0, limb1
hunk ./rshift.asm 115
-	sub	$64, %cl	C cnt must be <=64, so it's ok to operate on small version of it
-	neg	%cl		C we want 64-cnt in ecx as a shift count for getting the return value
-	movq	%xmm6, %rax	C %rax = limb0
-	shlq	%cl, %rax	C return value = limb0<<(64-cnt)
-	C seems to make no diff where we put pxor, so move it to function start if that helps alignment
-	pxor	%xmm2, %xmm2	C we need this for later, in L(out).
-
-	psllq	%xmm0, %xmm3	C %3=limb0<<64-c,limb1<<64-c
-	psrlq	%xmm1, %xmm6	C %6=limb0>>c,limb1>>c	NOTE xmm3 and xmm6 are shifted in opposite directions than in the code below.
-	punpckhqdq %xmm2, %xmm3	C %2=0,0;  %3=limb1<<64-c,0  %6=limb0>>c,limb1>>c
-	por	%xmm6, %xmm3	C %3=result limbs 0,1
+	movdqa	%xmm3, %xmm2
+	punpckhqdq %xmm6, %xmm3	C %6=0,0;  %3=limb1,0
+	psllq	%xmm0, %xmm3	C %3=limb1<<64-c,0
+	psrlq	%xmm1, %xmm2	C %2=limb0>>c,limb1>>c	NOTE xmm3 and xmm2 are shifted in opposite directions than in the code below.
+C require: %3=limb1<<64-c,0	%2=limb0>>c,limb1>>c
+	por	%xmm2, %xmm3	C %3=result limbs 0,1
}
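
For orientation, the intro above computes the standard limb right-shift semantics: the return value is the bits shifted out of the low limb (limb0<<(64-cnt)), and each result limb combines one source limb shifted right by cnt with the next-higher limb shifted left by 64-cnt. The following is a minimal scalar C sketch of that arithmetic, not the SIMD code in the patch; the function name rshift_ref is hypothetical, and it assumes 64-bit limbs and 0 < cnt < 64 as in an mpn_rshift-style interface.

#include <stdint.h>
#include <stddef.h>

/* Reference model of the limb-wise right shift the SSE2 intro/loop computes.
   Returns the bits shifted out of the low limb, i.e. up[0]<<(64-cnt),
   matching the "return value" comment in the removed intro.
   Assumes n >= 1 and 0 < cnt < 64. */
static uint64_t
rshift_ref (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
  uint64_t retval = up[0] << (64 - cnt);   /* what shlq %cl,%rax produced */
  for (size_t i = 0; i + 1 < n; i++)
    rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));  /* psrlq/psllq + por */
  rp[n - 1] = up[n - 1] >> cnt;            /* top limb gets zeros shifted in */
  return retval;
}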