[simple loop instead of computed goto to deal with alignment peter@cordes.ca**20080322050316] { hunk ./rshift.asm 377 + sub $1, %rdx C %rdx = n-1 + jle L(c2_end_short) C if(n<=1), no loop. %rdi on entry points to top (only) limb + add $8, %rsi + push %rax C save retval + +L(c2_shortloop): + mov (%rsi), %rax + shrd %cl, %rax, %rbx + mov %rbx, (%rdi) + mov %rax, %rbx + add $8, %rdi + sub $1, %rdx + jz L(c2_end) + add $8, %rsi + test $3, %dl + jnz L(c2_shortloop) + +C loop last iter stores ith limb to dest, and loads i+1st limb from src +C %rax=%rbx=limb(i). %rdx=n-i-1 %rsi -> next limb to load +C mov %rbx, %rax C get limb0 in both regs, so we can jump into the loop anywhere. + hunk ./rshift.asm 399 - mov %rdx, %rbp - sub $1, %rbp - and $3, %rbp - imul $-12, %rbp, %r8 C do this early to hide the latency - lea (L(c2_1)+3*12)(%r8), %r8 +C mov %rdx, %rbp +C sub $1, %rbp +C and $3, %rbp +C imul $-12, %rbp, %r8 C do this early to hide the latency +C lea (L(c2_1)+3*12)(%r8), %r8 hunk ./rshift.asm 412 - sub $1, %rdx C %rdx = n-1 - jle L(c2_end_short) C if(n<=1), no loop. %rdi on entry points to top (only) limb - push %rax - mov %rbx, %rax C get limb0 in both regs, so we can jump into the loop anywhere. - hunk ./rshift.asm 414 - test %rbp, %rbp - jz L(c2_plus1) C (n-1)%4==0, n==4m+1. special case: don't need to adjust pointers (and the code below would fail because (n-1)%4 = 0, not 4.) +C test %rbp, %rbp +C jz L(c2_plus1) C (n-1)%4==0, n==4m+1. special case: don't need to adjust pointers (and the code below would fail because (n-1)%4 = 0, not 4.) hunk ./rshift.asm 421 -C lea (L(c2_1)+3*12)(%r8), %r8 +C lea (L(c2_1)+3*12)(%r8), %r8 C requires a 4-byte displacement :( hunk ./rshift.asm 424 - lea (-4*8)(%rdi,%rbp,8), %rdi - lea (8-4*8)(%rsi,%rbp,8), %rsi +C lea (-4*8)(%rdi,%rbp,8), %rdi +C lea (8-4*8)(%rsi,%rbp,8), %rsi hunk ./rshift.asm 436 - jmp *%r8 +C jmp *%r8 hunk ./rshift.asm 439 - lea 8(%rsi), %rsi +C lea 8(%rsi), %rsi hunk ./rshift.asm 474 -C L(c2_end): +L(c2_end): }