[use movq; movhpd instead of movdqa to allow unaligned stores peter@cordes.ca**20080319053541] { hunk ./rshift.asm 214 -C size 10000 5.244 c/l (no ALIGN: 5.257) -C size 10000000 14.053 c/l. (no ALIGN: 14.529) +C size 10000 5.244 c/l (no ALIGN: 5.257. movhpd: 5.248) +C size 10000000 14.053 c/l. (no ALIGN: 14.529. movhpd: 14.005) hunk ./rshift.asm 220 -C size 3 5.376 c/l (no ALIGN: 5.312) -C size 4 4.040 c/l (no ALIGN: 3.980, and one less icache line touched?) +C size 3 5.376 c/l (no ALIGN: 5.312. unaligned movhpd stores: 5.355) +C size 4 4.040 c/l (no ALIGN: 3.980, and one less icache line touched?) (movhpd: 4.180) hunk ./rshift.asm 225 -C size 496: 2.052 c/l. (no ALIGN: 2.052) -C size 10000 2.320 c/l (no ALIGN: 2.320) -C size 10000000 11.178 c/l. (no ALIGN: 11.195) (2.4GHz, g965, dual channel DDR800) +C size 496: 2.052 c/l. (no ALIGN: 2.052) (movhpd: 2.132) +C size 10000 2.320 c/l (no ALIGN: 2.320) (movhpd: 2.656) +C size 10000000 11.178 c/l. (no ALIGN: 11.195. movhpd: 11.520)) (2.4GHz, g965, dual channel DDR800) hunk ./rshift.asm 239 -C size 10000 2.847 c/l (no ALIGN: 2.968) -C size 10000000 14.460 c/l. (no ALIGN: 14.403) +C size 10000 2.847 c/l (no ALIGN: 2.968. movhpd: 3.566) +C size 10000000 14.460 c/l. (no ALIGN: 14.403. movhpd: 14.294) hunk ./rshift.asm 275 - movdqa %xmm6, (%rdi,%rdx,8) C store the result. + movq %xmm6, (%rdi,%rdx,8) C store the result. + movhpd %xmm6, 8(%rdi,%rdx,8) C store the result. + hunk ./rshift.asm 279 -C movq 16(%rsi,%rdx,8), %xmm6 C %6 = limb3 (about to become limb1) hunk ./rshift.asm 294 - movdqa %xmm6, (%rdi) C store the result. +C movdqa %xmm6, (%rdi) C store the result. + movq %xmm6, (%rdi) C store the result. + movhpd %xmm6, 8(%rdi) C store the result. }