[new version of rshift using unaligned loads and not overshooting. 2.0c/l Conroe, but 2.5c/l Harpertown peter@cordes.ca**20080319040110] { hunk ./rshift.asm 127 -PROLOGUE(mpn_rshift_sse2) +PROLOGUE(mpn_rshift_sse2_aligned) hunk ./rshift.asm 199 + +EPILOGUE() + + + +C version using unaligned loads but aligned stores: +C K8 (2.6 GHz) +C size 1: 16.926 c/l +C size 2: 8.470 c/l +C size 3: 7.638 c/l (no ALIGN: 7.309) +C size 4: 5.736 c/l (no ALIGN: 5.731) +C size 5: 5.777 c/l +C size 6: 4.813 c/l +C size 8: 4.358 c/l (no ALIGN: 4.607) +C size 496: 3.052 c/l. (no ALIGN: 3.547) old: 3.039c/l with store commented out. +C size 10000: 5.244 c/l (no ALIGN: 5.257) +C size 10000000: 14.053 c/l. (no ALIGN: 14.529) + +C Conroe: +C size 1 12.096 c/l +C size 2 6.504 c/l +C size 3 5.376 c/l (no ALIGN: 5.312) +C size 4 4.040 c/l (no ALIGN: 3.980, and one fewer icache line touched?) +C size 5 4.013 c/l +C size 6 3.341 c/l +C size 8 2.988 c/l (no ALIGN: 2.988) +C size 496: 2.052 c/l. (no ALIGN: 2.052) +C size 10000 2.320 c/l (no ALIGN: 2.320) +C size 10000000 11.178 c/l. (no ALIGN: 11.195) (2.4GHz, g965, dual channel DDR800) + + +C Harpertown (2.8GHz): +C size 1 12.115 c/l. +C size 2 6.524 c/l +C size 3 5.444 c/l (no ALIGN: 5.289) +C size 4 4.083 c/l (no ALIGN: 3.961) +C size 5 4.217 c/l +C size 6 3.512 c/l +C size 8 3.229 c/l. (no ALIGN: 3.134) +C size 496 2.581 c/l. (no ALIGN: 2.558) +C size 10000 2.847 c/l (no ALIGN: 2.968) +C size 10000000 14.460 c/l. (no ALIGN: 14.403) + +C TODO: test this under Electric Fence to see whether it accesses memory past the end of the operands. + +ASM_START() +PROLOGUE(mpn_rshift_sse2) + movq (%rsi), %xmm7 C %7 = limb0 + movd %ecx, %xmm1 + sub $64, %ecx C cnt must be <=64, so it's ok to operate on small version of it + neg %ecx C we want 64-cnt in ecx as a shift count for getting the return value + movq %xmm7, %rax C %rax = limb0 + movd %ecx, %xmm0 C %0=64-cnt=left count=lc; %1=cnt; C this can go anywhere before the loop. + shlq %cl, %rax C return value=limb0<