[fast 8-limb loop, 1.66 c/l.
peter@cordes.ca**20080323060928
 still reads past the end of src, so needs finishing touches
] {
hunk ./rshift.asm 368
-
+define(reg5,%r12)
+define(reg6,%r13)
+define(reg7,%r14)
+define(reg8,%r15)
+C define(reg8,reg4)
hunk ./rshift.asm 376
-	mov	(%rsi), reg4	C reg4 = limb0
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	mov	(%rsi), reg8	C reg8 = limb0
hunk ./rshift.asm 382
-	shrd	%cl, reg4, %rax	C %rax = ret val limb.  %rbx still = limb0
+	shrd	%cl, reg8, %rax	C %rax = ret val limb.  %rbx still = limb0
hunk ./rshift.asm 388
-	shrd	%cl, reg1, reg4
-	mov	reg4, (%rdi)
-	mov	reg1, reg4
+	shrd	%cl, reg1, reg8
+	mov	reg8, (%rdi)
+	mov	reg1, reg8
hunk ./rshift.asm 398
-	mov	(%rsi), reg1	C reg4=limb0 reg1=limb1
-	test	$3, %dl
+	mov	(%rsi), reg1	C reg8=limb0 reg1=limb1
+	test	$7, %dl
hunk ./rshift.asm 403
-C reg4=limb(i)  reg1=limb(i+1).  %rdx=n-i-1, %rdx%4=0  %rsi -> limb(i+1)
+C reg8=limb(i)  reg1=limb(i+1).  %rdx=n-i-1, %rdx%4=0  %rsi -> limb(i+1)
hunk ./rshift.asm 406
-	mov	8(%rsi), reg2
-	lea	16(%rsi),%rsi
+	mov	8(%rsi), reg2
+	mov	16(%rsi), reg3
+
+C	mov	24(%rsi), reg4
+	lea	24(%rsi), %rsi
hunk ./rshift.asm 413
-C	mov	(%r9), reg4
+C	mov	(%r9), reg8
hunk ./rshift.asm 417
-C require: reg1=limb1; reg2=limb2; reg3=xxx; reg4=limb0
+C require: reg1=limb1; reg2=limb2; reg3=limb3; reg4=limb4; reg8=limb0
hunk ./rshift.asm 424
-C	ALIGN(16)
-L(c2_loop):	shrd	%cl, reg1, reg4
-	mov	(%rsi), reg3
-	add	$32, %rsi
-	mov	reg4, (%rdi)
+	ALIGN(16)
+L(c2_loop):	shrd	%cl, reg1, reg8
+	mov	reg8, (%rdi)
+	mov	(%rsi), reg4
hunk ./rshift.asm 429
+C	add	$32, %rsi
hunk ./rshift.asm 431
-	mov	(8-32)(%rsi), reg4
hunk ./rshift.asm 432
-	add	$32, %rdi
+	mov	(8-64)(%rsi), reg5
+	lea	64(%rsi),%rsi
hunk ./rshift.asm 436
-	mov	(16-32)(%rsi),reg1
-	mov	reg2, (16-32)(%rdi)
+C	add	$32, %rdi
+	mov	reg2, (16-0)(%rdi)
+	mov	(16-64)(%rsi), reg6
hunk ./rshift.asm 441
-	sub	$4, n
-	mov	(24-32)(%rsi),reg2
-	mov	reg3, (24-32)(%rdi)
+C	sub	$4, n
+	mov	reg3, (24-0)(%rdi)
+	mov	(24-64)(%rsi), reg7
hunk ./rshift.asm 445
-C	shrd	%cl, reg1, reg4
-C	mov	32(%rsi), reg3
-C	mov	reg4, 32(%rdi)
-C C L(c2_10):
-C	shrd	%cl, reg2, reg1
-C	mov	40(%rsi), reg4
-C	mov	reg1, 40(%rdi)
-C C L(c2_01):
-C	shrd	%cl, reg3, reg2
-C	mov	48(%rsi),reg1
-C	mov	reg2, 48(%rdi)
-C C L(c2_00):
-C	shrd	%cl, reg4, reg3
-C	mov	56(%rsi),reg2
-C	lea	64(%rsi),%rsi
-C	mov	reg3, 56(%rdi)
-C	lea	64(%rdi),%rdi
-C	sub	$8, n
+	shrd	%cl, reg5, reg4
+	mov	(32-64)(%rsi), reg8
+	mov	reg4, (32-0)(%rdi)
+	lea	64(%rdi),%rdi
+	C L(c2_10):
+	shrd	%cl, reg6, reg5
+	mov	(40-64)(%rsi), reg1
+	mov	reg5, (40-64)(%rdi)
+	C L(c2_01):
+	shrd	%cl, reg7, reg6
+	mov	(48-64)(%rsi), reg2
+	mov	reg6, (48-64)(%rdi)
+	C L(c2_00):
+	shrd	%cl, reg8, reg7
+	sub	$8, n
+	mov	(56-64)(%rsi), reg3
+	mov	reg7, (56-64)(%rdi)
hunk ./rshift.asm 464
-C ALIGN(16) would align the branch target, but it doesn't seem to matter.
+C ALIGN(8) would align the branch target.  only needed if near the end of a 16byte fetch, causing a bubble.
hunk ./rshift.asm 468
-	shr	%cl, reg4	C compute most significant limb
-	mov	reg4, (%rdi)	C store it
+	shr	%cl, reg8	C compute most significant limb
+	mov	reg8, (%rdi)	C store it
hunk ./rshift.asm 472
-	ret
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	ret
}
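
For reference, this is the operation the patched loop computes, as a minimal
C sketch assuming 64-bit limbs.  The names ref_rshift and limb_t are
illustrative, not GMP's; the contract matches mpn_rshift: shift {up, n}
right by cnt bits (1 <= cnt <= 63) into {rp, n}, and return the bits
shifted out of the low limb, left-justified (the %rax value produced by
the first shrd in the patch).  The patch also pushes/pops %r12-%r15
because reg5-reg8 are mapped onto them and they are callee-saved in the
x86-64 SysV ABI.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t limb_t;   /* assumption: 64-bit limbs, as in the asm */

    /* Illustrative reference only, not GMP code. */
    static limb_t
    ref_rshift (limb_t *rp, const limb_t *up, size_t n, unsigned cnt)
    {
      limb_t retval = up[0] << (64 - cnt);  /* bits shifted out the low end */
      for (size_t i = 0; i < n - 1; i++)
        rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
      rp[n - 1] = up[n - 1] >> cnt;  /* plain shr: most significant limb */
      return retval;
    }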
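The core of the loop body is shrd, applied across a rotating set of
registers (reg1-reg8) so each limb is loaded once and stored once per
pass.  Below is a hedged C rendering of that 8-limb software-pipelined
pattern, under the same 64-bit-limb assumption; shrd64 and
rshift_unrolled8 are made-up names.  In C the carry variable is
redundant (the compiler would just reload up[i]); it is kept to mirror
reg8, the limb held live across iterations in the asm.  The sketch
assumes n % 8 == 0 and does all reads in bounds, unlike the asm, which
the patch header notes still reads past the end of src.

    /* shrd %cl, src, dst  ==  dst = (dst >> cnt) | (src << (64 - cnt)),
       valid for 1 <= cnt <= 63.  Illustrative helper only. */
    static inline limb_t
    shrd64 (limb_t dst, limb_t src, unsigned cnt)
    {
      return (dst >> cnt) | (src << (64 - cnt));
    }

    static limb_t
    rshift_unrolled8 (limb_t *rp, const limb_t *up, size_t n, unsigned cnt)
    {
      limb_t retval = up[0] << (64 - cnt);
      limb_t carry = up[0];           /* plays the role of reg8 */
      size_t i;
      for (i = 0; i + 8 < n; i += 8)  /* 8 limbs in, 8 limbs out */
        {
          rp[i + 0] = shrd64 (carry,     up[i + 1], cnt);
          rp[i + 1] = shrd64 (up[i + 1], up[i + 2], cnt);
          rp[i + 2] = shrd64 (up[i + 2], up[i + 3], cnt);
          rp[i + 3] = shrd64 (up[i + 3], up[i + 4], cnt);
          rp[i + 4] = shrd64 (up[i + 4], up[i + 5], cnt);
          rp[i + 5] = shrd64 (up[i + 5], up[i + 6], cnt);
          rp[i + 6] = shrd64 (up[i + 6], up[i + 7], cnt);
          rp[i + 7] = shrd64 (up[i + 7], up[i + 8], cnt);
          carry = up[i + 8];          /* rotate: next iteration's limb(i) */
        }
      for (; i < n - 1; i++)          /* tail, as in the asm epilogue */
        rp[i] = shrd64 (up[i], up[i + 1], cnt);
      rp[n - 1] = up[n - 1] >> cnt;
      return retval;
    }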