[new layout of mostly the same code. only 2 icache lines for n<12, and mostly better speed
peter@cordes.ca**20080326023333] {
hunk ./rshift.asm 317
+C note: core2 has 3 exec ports for add, only 1 for lea. So don't use lea when an add will do, except to preserve flags
+
hunk ./rshift.asm 395
-	xor	%eax, %eax
+	xor	%eax, %eax	C no advantage to using %rax. this first block would still fit in 16B with a REX
hunk ./rshift.asm 397
-	push	%rax		C save retval
+	push	%rax		C save retval.
+	jmp	L(c2_unrolled)	C faster to _not_ have this through the decoders on the first cycle, when n < 12.
+C < 16B < 6 instr: first fetch.
+
+ALIGN(16)
+L(c2_unrolled_cleanup):
+	C require: reg8=limb(i). reg1=limb(i+1) reg2=limb(i+2) reg3=limb(i+3). n<=256
+	C in 4-limb unroll, reg8->reg4
+
+	shrd	%cl, reg1, reg8
+	mov	reg8, (%rdi)
+	add	$24, %rdi
+	shrd	%cl, reg2, reg1
+	mov	reg1, (8-24)(%rdi)
+
+	mov	reg3, reg8
+
+	shrd	%cl, reg3, reg2
+	mov	reg2, (16-24)(%rdi)
+
+	add	$(unroll-1), %dl
+	jz	L(c2_end)
+C	jmp	L(c2_entry_after_unrolled)
hunk ./rshift.asm 421
-	cmp	$3, n	C if n-1>=pipeline depth, we can use that cleanup code
-	jg	L(c2_unrolled)
-	jmp	L(c2_entry)
hunk ./rshift.asm 431
-	dec	n
+	dec	%dl
hunk ./rshift.asm 438
-C ALIGN(8) would align the branch target. only needed if near the end of a 16byte fetch, causing a bubble.
+C ALIGN(8) C would align the branch target. only needed if near the end of a 16byte fetch, causing a bubble.
hunk ./rshift.asm 455
+ALIGN(16)	C jumped to for all n, after a 5-insn fetch. And nothing drops in to it.
hunk ./rshift.asm 457
+	cmp	$3, n	C if n-1>=pipeline depth, we can use that cleanup code
+	jle	L(c2_entry)
hunk ./rshift.asm 461
-	mov	24(%rsi), reg3
+	add	$32, %rsi	C This is the pipeline depth, not the unroll count
+	sub	$(3+unroll), n	C n is still possibly > 2^32
+	mov	(24-32)(%rsi), reg3
+	jle	L(c2_unrolled_cleanup)	C end of basic block with 8 insns. good.
+	C else drop into the main loop. for large n, we've had 1 taken jmp, and 2 not taken jcc.
hunk ./rshift.asm 467
-C	mov	24(%rsi), reg4
-	lea	32(%rsi), %rsi
-	sub	$(3+unroll), n
-	jle	L(c2_unrolled_cleanup)
+C IP=124B from start. Only 2 icache lines if we don't touch the unrolled loop
hunk ./rshift.asm 471
-C loop is <= 18 insn and <= 4 16byte aligned blocks, so fits into Core 2's loop stream buffer, so alignment doesn't matter
+C 4-limb loop is <= 18 insn and <= 4 16byte aligned blocks, so fits into Core 2's loop stream buffer without ALIGN.
hunk ./rshift.asm 476
+C add/lea instructions placed by trial and error to avoid pipeline stalls.
hunk ./rshift.asm 485
-ifdef(`C2_UNROLL8',, `add $32, %rsi; define(`srcoff',32)')
+	ifdef(`C2_UNROLL8',, `add $32, %rsi; define(`srcoff',32)')
hunk ./rshift.asm 489
-ifdef(`C2_UNROLL8', `lea 64(%rsi),%rsi; define(`srcoff',64)',)
+	ifdef(`C2_UNROLL8', `add $64, %rsi; define(`srcoff',64)',)
hunk ./rshift.asm 492
-ifdef(`C2_UNROLL8',, `add $32, %rdi; define(`dstoff',32)')
+	ifdef(`C2_UNROLL8',, `add $32, %rdi; define(`dstoff',32)')
hunk ./rshift.asm 497
-ifdef(`C2_UNROLL8',, `sub $4, n')
+	ifdef(`C2_UNROLL8',, `sub $4, n')
hunk ./rshift.asm 505
-ifdef(`C2_UNROLL8', `lea 64(%rdi),%rdi; define(`dstoff',64)',)
+	ifdef(`C2_UNROLL8', `add $64, %rdi; define(`dstoff',64)',)
hunk ./rshift.asm 517
-ifdef(`C2_UNROLL8', `sub $8, n',)
+	ifdef(`C2_UNROLL8', `sub $8, n',)
hunk ./rshift.asm 524
-
-C	jmp	L(c2_unrolled_cleanup)
-C move this before the loop for better small-n perf, probably
-
-	C require: reg8=limb(i). reg1=limb(i+1) reg2=limb(i+2) reg3=limb(i+3)
-	C in 4-limb unroll, reg8->reg4
-L(c2_unrolled_cleanup):
-
-	shrd	%cl, reg1, reg8
-	mov	reg8, (%rdi)
-	shrd	%cl, reg2, reg1
-	mov	reg1, 8(%rdi)
-
-	add	$24, %rdi
-	mov	reg3, reg8
-
-	shrd	%cl, reg3, reg2
-	mov	reg2, (16-24)(%rdi)
-
-	add	$unroll,n
-	jmp	L(c2_entry_after_unrolled)
-
+	C RIP=xxxc if loop start was align(16).
+	C might be optimal to duplicate a 3-byte insn here before the jmp
+	jmp	L(c2_unrolled_cleanup)
+C 236B total. could duplicate some code down here...
}
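
C For reference (not part of the patch): a minimal C sketch of the per-limb
C operation the shrd sequences above compute, assuming 64-bit limbs and the
C usual mpn_rshift convention that the bits shifted out of the low limb are
C returned left-justified. The names limb_t and rshift_ref are illustrative,
C not taken from rshift.asm.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t limb_t;

    /* Shift {up, n} right by cnt bits (1 <= cnt <= 63) into {rp, n}.
       Each shrd %cl, next, cur step corresponds to one
       (cur >> cnt) | (next << (64 - cnt)) term below. */
    static limb_t rshift_ref(limb_t *rp, const limb_t *up, size_t n, unsigned cnt)
    {
        limb_t retval = up[0] << (64 - cnt);   /* bits shifted out the low end */
        for (size_t i = 0; i + 1 < n; i++)
            rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
        rp[n - 1] = up[n - 1] >> cnt;          /* top limb is zero-filled */
        return retval;
    }

C The asm keeps four limbs live in registers so each shrd reuses a value that
C was already loaded as the "next" limb of the previous step.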