[use macros to allow switching between 4-limb and 8-limb unroll peter@cordes.ca**20080324205442 also make it easy to move the add/lea up/down in the loop ] { hunk ./rshift.asm 362 + +C still need to comment/uncomment loop counters/pointer increments. and the ALIGN(16) + define(`C2_UNROLL8') +C define(`C2_8REG') + hunk ./rshift.asm 368 -define(n,%rdx) -define(reg1,%r9) -define(reg2,%rax) -define(reg3,%r8) C referenced the fewest times -define(reg4,%r11) -define(reg5,%r12) -define(reg6,%r13) -define(reg7,%r14) -define(reg8,%r15) -C define(reg8,reg4) +define(`n',%rdx) +define(`reg1',%r9) +define(`reg2',%rax) +define(`reg3',%r8) C referenced the fewest times +define(`reg4',%r11) +C pipeline depth is only 4, so we can avoid overlap with only 4 regs. +ifdef(`C2_8REG', ` + define(`reg5',%r12) + define(`reg6',%r13) + define(`reg7',%r14) + define(`reg8',%r15) + ', ` + define(reg5,reg1) + define(reg6,reg2) + define(reg7,reg3) + define(reg8,reg4) +') + +ifdef(`C2_8REG', ` + push %r12 + push %r13 + push %r14 + push %r15 +', ) hunk ./rshift.asm 393 -C push reg2 -C push reg4 - push %r12 - push %r13 - push %r14 - push %r15 hunk ./rshift.asm 412 - test $7, %dl +ifdef(`C2_UNROLL8', + `test $7, %dl' +, `test $3, %dl' ) hunk ./rshift.asm 439 +define(`srcoff',0) +define(`dstoff',0) hunk ./rshift.asm 442 -L(c2_loop): shrd %cl, reg1, reg8 +L(c2_loop): + shrd %cl, reg1, reg8 hunk ./rshift.asm 447 -C add $32, %rsi +C add $32, %rsi; define(`srcoff',32) hunk ./rshift.asm 449 - mov reg1, 8(%rdi) - mov (8-64)(%rsi), reg5 - lea 64(%rsi),%rsi + mov reg1, (8-dstoff)(%rdi) + mov (8-srcoff)(%rsi), reg5 + lea 64(%rsi),%rsi; define(`srcoff',64) hunk ./rshift.asm 454 -C add $32, %rdi - mov reg2, (16-0)(%rdi) - mov (16-64)(%rsi), reg6 +C add $32, %rdi; define(`dstoff',32) + mov reg2, (16-dstoff)(%rdi) + mov (16-srcoff)(%rsi), reg6 hunk ./rshift.asm 459 -C sub $4, n - mov reg3, (24-0)(%rdi) - mov (24-64)(%rsi), reg7 +C sub $4, n + mov reg3, (24-dstoff)(%rdi) + mov (24-srcoff)(%rsi), reg7 hunk ./rshift.asm 463 - shrd %cl, reg5, reg4 - mov (32-64)(%rsi), reg8 - mov reg4, (32-0)(%rdi) - lea 64(%rdi),%rdi - C L(c2_10): - shrd %cl, reg6, reg5 - mov (40-64)(%rsi), reg1 - mov reg5, (40-64)(%rdi) - C L(c2_01): - shrd %cl, reg7, reg6 - mov (48-64)(%rsi), reg2 - mov reg6, (48-64)(%rdi) - C L(c2_00): - shrd %cl, reg8, reg7 - sub $8, n - mov (56-64)(%rsi), reg3 - mov reg7, (56-64)(%rdi) hunk ./rshift.asm 464 +ifdef(`C2_UNROLL8', ` + shrd %cl, reg5, reg4 + mov reg4, (32-dstoff)(%rdi) + lea 64(%rdi),%rdi; define(`dstoff',64) + mov (32-srcoff)(%rsi), reg8 +C L(c2_10): + shrd %cl, reg6, reg5 + mov reg5, (40-dstoff)(%rdi) + mov (40-srcoff)(%rsi), reg1 +C L(c2_01): + shrd %cl, reg7, reg6 + mov reg6, (48-dstoff)(%rdi) + mov (48-srcoff)(%rsi), reg2 +C L(c2_00): + shrd %cl, reg8, reg7 + sub $8, n + mov reg7, (56-dstoff)(%rdi) + mov (56-srcoff)(%rsi), reg3 +') C endif hunk ./rshift.asm 490 -C pop reg4 -C pop reg2 - pop %r15 - pop %r14 - pop %r13 - pop %r12 +ifdef(`C2_8REG', ` + pop %r15 + pop %r14 + pop %r13 + pop %r12 +',) }