[do the cleanup after the loop. debugged and working, and pretty fast
peter@cordes.ca**20080325055033
 This works well: we can have special cleanup for coming out of the pipeline.
 Tested with Electric Fence, and it doesn't read past the end of src.
] {
hunk ./rshift.asm 323
+C n limbs. The last one is special, since there is nothing to shift in.
+C unrolled loop does max(0, floor((n-1-pipedepth)/unroll)*unroll + pipedepth) limbs
+C shortloop does the rest, minus the final limb handled at L(c2_end)
+C unroll=8 pipedepth=3:
+C n=1. end:1 unrolled:0 shortloop:0 -> end
+C n=2. end:1 unrolled:0 shortloop:1 shortloop -> end
+C n=3. end:1 unrolled:0 shortloop:2 shortloop -> end
+C n=4. end:1 unrolled:0+3 shortloop:0 unroll_setup -> cleanup -> end
+C n=5. end:1 unrolled:0+3 shortloop:1 unroll_setup -> cleanup -> shortloop -> end
+C n=6. end:1 unrolled:0+3 shortloop:2 unroll_setup -> cleanup -> shortloop -> end
+C n=7. end:1 unrolled:0+3 shortloop:3 unroll_setup -> cleanup -> shortloop -> end
+C n=8. end:1 unrolled:0+3 shortloop:4 unroll_setup -> cleanup -> shortloop -> end
+C n=9. end:1 unrolled:0+3 shortloop:5 unroll_setup -> cleanup -> shortloop -> end
+C n=10 end:1 unrolled:0+3 shortloop:6 unroll_setup -> cleanup -> shortloop -> end
+C n=11 end:1 unrolled:0+3 shortloop:7 unroll_setup -> cleanup -> shortloop -> end
+C n>unroll+3:
+C n=12 end:1 unrolled:8+3 shortloop:0 unroll_setup -> unrolled -> cleanup -> end
+C n=13 end:1 unrolled:8+3 shortloop:1 unroll_setup -> unrolled -> cleanup -> shortloop -> end
+C n=14 end:1 unrolled:8+3 shortloop:2 unroll_setup -> unrolled -> cleanup -> shortloop -> end
+C n=15 end:1 unrolled:8+3 shortloop:3 unroll_setup -> unrolled -> cleanup -> shortloop -> end
+C n=16 end:1 unrolled:8+3 shortloop:4 unroll_setup -> unrolled -> cleanup -> shortloop -> end
+
+
hunk ./rshift.asm 357
-C still need to comment/uncomment loop counters/pointer increments. and the ALIGN(16)
+C still need to comment/uncomment the ALIGN(16)
+C ################ controls for 4/8 limb unroll ################
hunk ./rshift.asm 360
-C define(`C2_8REG')
+C define(`C2_8REG') C only works with 8-limb unroll.
hunk ./rshift.asm 367
-define(`reg2',%rax)
+define(`reg2',%rax) C %rax can't be reg1 or reg4/8
hunk ./rshift.asm 389
+C movq %rdi, %xmm0 C FIXME: debugging only
+C movq %rsi, %xmm1 C FIXME: debugging only
hunk ./rshift.asm 397
-C mov %rsi, %r9
+ cmp $3, n C if n-1 >= pipeline depth, we can use the unrolled loop's cleanup code
+ jg L(c2_unrolled)
hunk ./rshift.asm 400
+
hunk ./rshift.asm 402
+ mov (%rsi), reg1 C reg8=limb0 reg1=limb1
hunk ./rshift.asm 407
-C add $8, %r9
hunk ./rshift.asm 408
- dec n C sub looks like it makes things align better, but dec has the same timings
-C sub $1, n
- jle L(c2_end)
hunk ./rshift.asm 409
- mov (%rsi), reg1 C reg8=limb0 reg1=limb1
- test $(unroll-1), %dl
- jnz L(c2_shortloop)
+L(c2_entry_after_unrolled):
+ dec n
+C sub $1, n
+ jg L(c2_shortloop)
hunk ./rshift.asm 417
-C mov (%rsi), reg1
- mov 8(%rsi), reg2
- mov 16(%rsi), reg3
+C ALIGN(8) would align the branch target. Only needed if it falls near the end of a 16-byte fetch, causing a bubble.
+C L(c2_endshort):
+C XXXXXXXXXXXXXX tail end of function
+L(c2_end):
+ pop %rax C return val
+ shr %cl, reg8 C compute most significant limb
+ mov reg8, (%rdi) C store it
+ifdef(`C2_8REG', `
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+',)
+ ret
+C XXXXXXXXXXXXXX tail end of function
+
hunk ./rshift.asm 434
-C mov 24(%rsi), reg4
- lea 24(%rsi), %rsi
+L(c2_unrolled):
+ mov 8(%rsi), reg1
+ mov 16(%rsi), reg2
+ mov 24(%rsi), reg3
hunk ./rshift.asm 439
-C debug: %r9 = %rsi
-C mov (%r9), reg8
-C mov 8(%r9), reg1
-C mov 16(%r9),reg2
+C mov 24(%rsi), reg4
+ lea 32(%rsi), %rsi
+ sub $(3+unroll), n
+ jle L(c2_unrolled_cleanup)
hunk ./rshift.asm 454
-L(c2_loop):
+L(c2_loop): C does unroll*(%rdx / unroll) + 3 limbs
hunk ./rshift.asm 495
+L(c2_unrolled_tail):
hunk ./rshift.asm 497
-C ALIGN(8) would align the branch target. only needed if near the end of a 16byte fetch, causing a bubble.
-C L(c2_endshort):
-L(c2_end):
- pop %rax C return val
- shr %cl, reg8 C compute most significant limb
- mov reg8, (%rdi) C store it
-ifdef(`C2_8REG', `
- pop %r15
- pop %r14
- pop %r13
- pop %r12
-',)
- ret
+
+
+C jmp L(c2_unrolled_cleanup)
+C TODO: moving this block up, before the loop, would probably help small-n performance
+
+ C require: reg8=limb(i). reg1=limb(i+1) reg2=limb(i+2) reg3=limb(i+3)
+ C in the 4-limb unroll, reg8 -> reg4
+L(c2_unrolled_cleanup):
+
+ shrd %cl, reg1, reg8
+ mov reg8, (%rdi)
+ shrd %cl, reg2, reg1
+ mov reg1, 8(%rdi)
+
+ add $24, %rdi
+ mov reg3, reg8
+
+ shrd %cl, reg3, reg2
+ mov reg2, (16-24)(%rdi) C = 16(%rdi) before the add $24
+
+ add $unroll, n
+ jmp L(c2_entry_after_unrolled)
+
}
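
For reference (not part of the patch): the limb accounting in the comment
table above can be modeled in a few lines of C. This is a sketch assuming
the unroll=8 / pipedepth=3 values from the comments; it reproduces each
table row, showing how every n is split across the unrolled loop, the
pipeline cleanup, the shortloop, and the final limb.

/* Sketch only: models the dispatch/limb accounting, not GMP code. */
#include <stdio.h>

int main(void)
{
    const long unroll = 8, pipedepth = 3;
    for (long n = 1; n <= 16; n++) {
        long loop = 0, pipe = 0;
        if (n - 1 >= pipedepth) {           /* cmp $3, n ; jg L(c2_unrolled) */
            pipe = pipedepth;               /* limbs finished by the cleanup */
            loop = (n - 1 - pipedepth) / unroll * unroll;
        }
        long shortloop = n - 1 - loop - pipe;
        /* matches the table: loop + pipe + shortloop + 1 == n */
        printf("n=%2ld end:1 unrolled:%ld+%ld shortloop:%ld\n",
               n, loop, pipe, shortloop);
    }
    return 0;
}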
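Also for reference (not part of the patch): the shortloop, the unrolled
loop body, and L(c2_unrolled_cleanup) all compute the same per-limb shrd
operation; only L(c2_end) uses a plain shr for the most significant limb.
A hedged plain-C reference for what the whole routine computes (the
function name ref_rshift is illustrative, not from the patch; the return
value follows GMP's mpn_rshift convention):

#include <stdint.h>
#include <stddef.h>

/* Shift {src,n} right by cnt bits, n >= 1, 1 <= cnt <= 63, so the
   (64 - cnt) shifts below are well defined. */
uint64_t ref_rshift(uint64_t *dst, const uint64_t *src, size_t n,
                    unsigned cnt)
{
    uint64_t ret = src[0] << (64 - cnt);  /* bits shifted out, returned in
                                             the high bits (the saved %rax) */
    for (size_t i = 0; i < n - 1; i++)    /* shortloop / unrolled / cleanup */
        dst[i] = (src[i] >> cnt) | (src[i + 1] << (64 - cnt));  /* = shrd */
    dst[n - 1] = src[n - 1] >> cnt;       /* L(c2_end): shr %cl, reg8 */
    return ret;
}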