[use little-endian notation for register contents peter@cordes.ca**20080318230601] { hunk ./rshift.asm 124 -C note: %6 = limb0, limb1 means the _high_ quad of %xmm6 = limb1. Intel is little-endian, so this is a bit bogus. It actually matters if you use psrldq or something. +C notation: %6 = limb1:limb0 means the _high_ quad of %xmm6 = limb1. hunk ./rshift.asm 128 - movdqa (%rsi), %xmm3 C %3 = limb0, limb1 + movdqa (%rsi), %xmm3 C %3 = limb1:limb0 hunk ./rshift.asm 140 -C %3=limb0,limb1; +C %3=limb1:limb0; hunk ./rshift.asm 148 - movdqa (%rsi,%rdx,8), %xmm7 C %7=limb2,limb3. If src is 16byte aligned, this can't cross a page boundary and segfault. we might harmlessly read past the end of the array of limbs if its length is odd. + movdqa (%rsi,%rdx,8), %xmm7 C %7=limbs(3:2). If src is 16byte aligned, this can't cross a page boundary and segfault. we might harmlessly read past the end of the array of limbs if its length is odd. + +C require: %3=limbs(1:0); %7=limbs(3:2) hunk ./rshift.asm 153 -C pshufd $14, %xmm3, %xmm6 C %6=limb1, 14 = (2 + 3<<2) C latency=2 on pre-penryn -C punpcklqdq %xmm7, %xmm6 C dest=dest[0],src[0] +C pshufd $14, %xmm3, %xmm6 C %6=xxx:limb1 14 = (2 + 3<<2) C latency=2 on pre-penryn +C punpcklqdq %xmm7, %xmm6 C dest=src[0]:dest[0] hunk ./rshift.asm 158 -C require %6=limb1,xxx; %7=limb -C punpcklqdq %xmm7, %xmm6 C dest=dest[0],src[0] +C require %6=xxx:limb1; %7=limbs(3:2) +C punpcklqdq %xmm7, %xmm6 C dest=src[0]:dest[0] hunk ./rshift.asm 163 +C shufpd version ran 2.0 cycles/limb on conroe and penryn, but 3.7 instead of 3.0 for movhlps on K8 hunk ./rshift.asm 165 -C require: %3=limbs(0,1); %6=xxx,limb1; %7=limbs(2,3) C shufpd version ran 2.0 cycles/limb on conroe and penryn, but 3.7 instead of 3.0 on K8 - shufpd $1, %xmm7, %xmm6 C %6=limbs(1,2). take dest[1],src[0], so op=1+0<<1 + shufpd $1, %xmm7, %xmm6 C %6=limbs(2:1). take dest[1],src[0], so op=1+0<<1 hunk ./rshift.asm 169 -C require: %3=limb0,limb1; %6 = limb1,limb2 +C require: %3=limb1:limb0; %6 = limb2:limb1 hunk ./rshift.asm 171 - psrlq %xmm1, %xmm3 C %3=limbs(0,1)>>cnt; %6=limbs(1,2)<>cnt; %6=limbs(2:1)<>c -C require: %3=limb1<>c,limb1>>c - por %xmm2, %xmm3 C %3=result limbs 0,1 + punpckhqdq %xmm6, %xmm3 C %6=0:0; %3=limb0:1 + psllq %xmm0, %xmm3 C %3=0:limb1<>c +C require: %3=0:limb1<>c:limb0>>c + por %xmm2, %xmm3 C %3=result limbs 1:0 }