[use movq; movhpd instead of movdqa to allow unaligned stores peter@cordes.ca**20080319054934] { hunk ./rshift.asm 213 -C size 496: 3.052 c/l. (no ALIGN: 3.547) old: 3.039c/l with store commented out. -C size 10000 5.244 c/l (no ALIGN: 5.257) -C size 10000000 14.053 c/l. (no ALIGN: 14.529) +C size 496: 3.052 c/l. (no ALIGN: 3.547, movhpd: 3.048) old: 3.039c/l with store commented out. +C size 10000 5.244 c/l (no ALIGN: 5.257. movhpd: 5.248) +C size 10000000 14.053 c/l. (no ALIGN: 14.529. movhpd: 14.005) hunk ./rshift.asm 218 -C size 1 12.096 c/l -C size 2 6.504 c/l -C size 3 5.376 c/l (no ALIGN: 5.312) -C size 4 4.040 c/l (no ALIGN: 3.980, and one less icache line touched?) +C size 1 12.096 c/l (no ALIGN, movhpd: 13.024) +C size 2 6.504 c/l (no ALIGN, movhpd: 6.504) +C size 3 5.376 c/l (no ALIGN: 5.312. unaligned movhpd stores: 5.355) +C size 4 4.040 c/l (no ALIGN: 3.980, and one less icache line touched?) (movhpd: 4.180) hunk ./rshift.asm 224 -C size 8 2.988 c/l (no ALIGN: 2.988) -C size 496: 2.052 c/l. (no ALIGN: 2.052) -C size 10000 2.320 c/l (no ALIGN: 2.320) -C size 10000000 11.178 c/l. (no ALIGN: 11.195) (2.4GHz, g965, dual channel DDR800) +C size 8 2.988 c/l (no ALIGN: 2.988) (movhpd: 3.294) +C size 496: 2.052 c/l. (no ALIGN: 2.052) (movhpd: 2.132) +C size 10000 2.320 c/l (no ALIGN: 2.320) (movhpd: 2.656) +C size 10000000 11.178 c/l. (no ALIGN: 11.195. movhpd: 11.520)) (2.4GHz, g965, dual channel DDR800) hunk ./rshift.asm 238 -C size 496 2.581 c/l. (no ALIGN: 2.558) -C size 10000 2.847 c/l (no ALIGN: 2.968) -C size 10000000 14.460 c/l. (no ALIGN: 14.403) +C size 496 2.581 c/l. (no ALIGN: 2.558. movhpd: 2.562) +C size 10000 2.847 c/l (no ALIGN: 2.968. movhpd: 3.566) +C size 10000000 14.460 c/l. (no ALIGN: 14.403. movhpd: 14.294) hunk ./rshift.asm 275 - movdqa %xmm6, (%rdi,%rdx,8) C store the result. + movq %xmm6, (%rdi,%rdx,8) C store the result. + movhpd %xmm6, 8(%rdi,%rdx,8) C store the result. + hunk ./rshift.asm 279 -C movq 16(%rsi,%rdx,8), %xmm6 C %6 = limb3 (about to become limb1) hunk ./rshift.asm 294 - movdqa %xmm6, (%rdi) C store the result. +C movdqa %xmm6, (%rdi) C store the result. + movq %xmm6, (%rdi) C store the result. + movhpd %xmm6, 8(%rdi) C store the result. }