[better block ordering to remove more branches
peter@cordes.ca**20080326041203
 saves a few cycles for most n.
] {
hunk ./rshift.asm 395
-	xor	%eax, %eax	C no advantage to using %rax. this first block would still fit in 16B with a REX
-	shrd	%cl, reg8, %rax	C %rax = ret val limb. %rbx still = limb0
+	xor	%eax, %eax
+	shrd	%cl, reg8, %rax	C %rax = ret val limb. reg8 still = limb0
hunk ./rshift.asm 398
-	jmp	L(c2_unrolled)	C faster to _not_ have this through the decoders on the first cycle, when n < 12.
-C < 16B < 6 instr: first fetch.
+C	jmp	L(c2_unrolled)	C faster to _not_ have this through the decoders on the first cycle, when n < 12.
+C L(c2_unrolled):
+	cmp	$3, n	C if n-1>=pipeline depth, we can use that cleanup code
+	jle	L(c2_entry)
+	mov	8(%rsi), reg1
+	mov	16(%rsi), reg2
hunk ./rshift.asm 405
-ALIGN(16)
+	add	$32, %rsi
+	mov	(24-32)(%rsi), reg3
+	sub	$(3+unroll), n	C n is still possibly > 2^32
+	jg	L(c2_loop)
+	C for large n, we've had 1 not taken jcc, and the above jcc is taken
+	C for n>3, n<12, we've had 2 not taken jcc
+C ALIGN(16)
hunk ./rshift.asm 442
-C	sub	$1, n
+C	sub	$1, %dl
hunk ./rshift.asm 449
-C L(c2_endshort):
hunk ./rshift.asm 464
-ALIGN(16)	C jumped to for all n, after a 5-insn fetch. And nothing drops in to it.
-L(c2_unrolled):
-	cmp	$3, n	C if n-1>=pipeline depth, we can use that cleanup code
-	jle	L(c2_entry)
-	mov	8(%rsi), reg1
-	mov	16(%rsi), reg2
-	add	$32, %rsi	C This is the pipeline depth, not the unroll count
-	sub	$(3+unroll), n	C n is still possibly > 2^32
-	mov	(24-32)(%rsi), reg3
-	jle	L(c2_unrolled_cleanup)	C end of basic block with 8 insns. good.
-	C else drop into the main loop. for large n, we've had 1 taken jmp, and 2 not taken jcc.
-
hunk ./rshift.asm 466
-C require: reg1=limb1; reg2=limb2; reg3=limb3; reg4=limb4; reg8=limb0
+C require: reg1=limb1; reg2=limb2; reg3=limb3; reg8=limb0
}
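
For context, a minimal C sketch of what this shrd-based routine computes (not GMP's tuned code; rshift_ref and its parameter names are hypothetical). It assumes 64-bit limbs and a shift count with 0 < cnt < 64, and shows why the xor %eax,%eax / shrd %cl, reg8, %rax pair at the top of the patch produces the return-value limb: shrd into a zeroed register leaves exactly the low limb's shifted-out bits, left-justified.

#include <stdint.h>
#include <stddef.h>

/* Sketch of mpn_rshift semantics, assuming 64-bit limbs and 0 < cnt < 64.
 * Each loop iteration corresponds to one shrd: it merges the low bits of
 * the next-higher limb into the current limb's shifted value.  The return
 * value is the low limb's shifted-out bits, left-justified, matching the
 * zeroed-destination shrd in hunk 395 above. */
uint64_t rshift_ref(uint64_t *rp, const uint64_t *sp, size_t n, unsigned cnt)
{
    uint64_t retval = sp[0] << (64 - cnt);  /* shrd with dst = 0 */
    for (size_t i = 0; i < n - 1; i++)
        rp[i] = (sp[i] >> cnt) | (sp[i + 1] << (64 - cnt));
    rp[n - 1] = sp[n - 1] >> cnt;           /* high limb is zero-filled */
    return retval;
}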