dnl  AMD64 mpn_rshift

dnl  Copyright 2004, 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C MMX version:
C		    cycles/limb
C AMD K8:		 2.5
C Intel P4:		 4
C Intel Core 2:		 2.4

C all benchmarks on tesla: E6600 (2.4GHz) w/DDR800 dual-channel.  system not idle, azureus running.

C MMX: (Conroe 2.4GHz. calls to dynamically linked lib function)
C size 1		18.000  cycles/limb
C size 2		9.960-10.020 cyc/limb
C size 49:		2.673	cycles/limb
C size 496:		2.465	cycles/limb
C size 1600:		2.402	cycles/limb

C MMX: (Conroe 2.4GHz. calls to statically linked lib function)
C size 1:		16.530  cycles
C size 2:		9.547	cycles/limb
C size 4:		5.775	cycles/limb
C size 496:		2.445	cycles/limb  (same on 2.8GHz Harpertown)
C size 10000000:	14.823	c/l (dual-channel DDR800, g965 chipset)

C MMX: (K8 2.6GHz (ACT cluster). calls to statically linked lib function.)
C size 496:		2.553	cycles/limb


C this SSE2 version: requires 16byte aligned input and output
C +++++++++++++++++++++ shufpd version with slow (%cl) intro ++++++++++++++++++++
C AMD K8 (awarnach master node, 2.0GHz, Solaris)
C size 1:		14-15		cycles
C size 2:		8.025		cycles/limb
C size 49:		4.337		cycles/limb
C size 496:		3.808		cycles/limb
C size 496, ACT 2.6G	3.788		cycles/limb.  Linux on dual 2218 2.6GHz.
C size 1600:		3.652-3.802	cycles/limb
C size 4000:		3.751		cycles/limb
C size 496001:		12.807		cycles/limb

C Intel Core 2(Conroe 2.4GHz.  calls to statically linked .o)
C size 1:		13.024		cycles.
C size 2:		6.5-6.6		cycles/limb
C size 4:		4.275		cycles/limb (took a long time to settle.  often swung up to 8.800. prob. branch pred problems)
C size 49:		2.245		cycles/limb
C size 496,800:		2.064		cycles/limb
C size 1600:		1.981-2.021	cycles/limb
C size 4000:		2.401		cycles/limb
C size 496001:		8.607		cycles/limb

C Intel Core 2(Harpertown 2.8GHz, system idle. calls to statically linked .o)
C size 496:		2.087		cycles/limb (more measurement overhead?)


C ++++++++++++++++++++ fast intro (%ecx), new loop, shufpd (and sometimes movhlps) version +++++++++++++++++++
C AMD K8 (Opteron 2218)
C size 1		15.938		cycles ( movhlps)
C size 4		5.947		cycles/limb (shufpd)
C size 496		3.791		cycles/limb (3.044 movhlps, still slower than MMX)

C Core 2 Conroe (2.4GHz)
C size 1		12.224		cycles  (11.968 with pxor after the loop)
C size 2		6.000		c/l
C size 3		5.333		c/l
C size 4		4.0		cycles/limb  (6.420 with movhlps)
C size 496		2.052		cycles/limb (2.036-2.068 movhlps)
C size 10000000		10.820		c/l (dual-channel DDR800, g965 chipset)

C Core 2 Harpertown (2.8GHz)
C size 1		12.049		cycles
C size 4		4.013		cycles/limb (shufpd)
C size 496		2.062		cycles/limb (shufpd/movhlps)


C movdqu (unaligned allowed, times for the aligned case) (Conroe 2.4GHz)
C size 1:		14.000	    cycles/limb
C size 2:		6.990-7.050 cycles/limb
C size 496, 4000:	4.048	    cycles/limb
C size 496001:		8.787-8.807 cycles/limb


C psllq has a latency of 2 cycles, throughput 1
C 2c/limb = 4c/2limb = 4cycles for the whole loop
C need to hide the latency of the shufpd?  (lat=1, through=1)
C reversing the order of the shifts doesn't help.

C 4 cycles ?= latency chain of movdqa (load), shufpd, psllq, por(?)
C further gains would take more loop unrolling or software pipelining to hide the latency of the load
C which would require a bigger intro/outro to not read past the end of the array
C currently the function fits nicely into 128bytes

C optimization: could maybe structure things so limb0,limb1 need the shuffle, to hide the latency.
C probably would need another shuffle before storing, though


C INPUT PARAMETERS
C rp	rdi
C up	rsi
C n	rdx    C not allowed to be negative.  not doubling as a sign bit.
C cnt	rcx


C could probably make seriously optimized versions for multiple-of-8 shift counts using SSE2 shuffle instructions.
C notation: %6 = limb1:limb0  means the _high_ quad of %xmm6 = limb1.

ASM_START()
PROLOGUE(mpn_rshift_sse2_aligned)
	movdqa	(%rsi), %xmm3		C %3 = limb1:limb0
	movd	%ecx, %xmm1
	sub	$64, %ecx		C cnt must be <=64, so it's ok to operate on small version of it
	neg	%ecx			C we want 64-cnt in ecx as a shift count for getting the return value
	movq	%xmm3, %rax		C %rax = limb0
 	movd	%ecx, %xmm0		C %0=64-cnt=left count=lc; %1=cnt;  C this can go anywhere before the loop.
	shlq	%cl, %rax		C return value=limb0<<lc. shift count has to be in %cl.	 C modifies flags, so it has to go before the loop

	leaq	(%rsi,%rdx,8), %rsi     C rsi = rsi + rdx*8 = last limb + 1
	leaq	(%rdi,%rdx,8), %rdi
	negq	%rdx
C rsi=last srclimb+1; rdi=last destlimb+1; rdx=-n; %0=64-cnt; %1=cnt;
C %3=limb1:limb0;
C	movdqa	%xmm3, %xmm7

	addq	$2, %rdx		C n= -n + 2
	jge	L(out)			C skip the loop if n<=2

	ALIGN(16)			C minimal alignment for claimed speed
L(loop):
	movdqa	(%rsi,%rdx,8), %xmm7	C %7=limbs(3:2).  If src is 16byte aligned, this can't cross a page boundary and segfault.  we might harmlessly read past the end of the array of limbs if its length is odd.

C require: %3=limbs(1:0); %7=limbs(3:2)

C good:  2.5c/l on Conroe.  w/pshufd ..,%xmm7, %xmm6 before the load, 2.0 cycles/limb on conroe, but 2.5 on Harpertown. (?!), and  3.0 on K8.
C	pshufd  $14, %xmm3, %xmm6	C %6=xxx:limb1  14 = (2 + 3<<2)  C latency=2 on pre-penryn
C	punpcklqdq %xmm7, %xmm6		C dest=src[0]:dest[0]

C	pxor	%xmm6, %xmm6		C break the partial-reg dependency.  2.0 cycles/limb instead of 2.5 on conroe and penryn
C	movhlps	%xmm3, %xmm6
C require %6=xxx:limb1; %7=limbs(3:2)
C	punpcklqdq %xmm7, %xmm6		C dest=src[0]:dest[0]


C the best alternative for getting limb1 in the low part of a register
C shufpd version ran 2.0 cycles/limb on conroe and penryn, but 3.7 instead of 3.0 for movhlps on K8
	movdqa	%xmm3, %xmm6
	shufpd	$1, %xmm7, %xmm6	C %6=limbs(2:1).  take dest[1],src[0], so op=1+0<<1

C bad	pslldq	$4, %xmm2	C only operates in-place and has latency=2 on pre-penryn

C require: %3=limb1:limb0; %6 = limb2:limb1
	psllq	%xmm0, %xmm6
	psrlq	%xmm1, %xmm3		C %3=limbs(1:0)>>cnt; %6=limbs(2:1)<<lc
	por	%xmm6, %xmm3		C %3=result limbs(1:0)
	movdqa	%xmm3, -16(%rdi,%rdx,8) C store the result.

C require %7=limbs(3:2) (next iteration's limbs(1:0))
C	movdqa	%xmm7, %xmm6		C setup for next iter.
	movdqa	%xmm7, %xmm3
	addq	$2, %rdx
	jl	L(loop)

L(out):
C %3=limb1:limb0 (since it was loaded before rdx was incremented).
	C seems to make no diff where we put pxor, so move it to function start if that helps alignment
	pxor	%xmm6, %xmm6		C we need this for later, in L(out).

	movdqa  %xmm3, %xmm2
	punpckhqdq %xmm6, %xmm3		C %6=0:0;  %3=limb0:1
	psllq	%xmm0, %xmm3		C %3=0:limb1<<lc
	psrlq	%xmm1, %xmm2		C %2=(limbs(1:0))>>c
C require: %3=0:limb1<<lc; %2=limb1>>c:limb0>>c
	por	%xmm2, %xmm3		C %3=result limbs 1:0
	jg	L(endo)			C n = 1 limb left 	C condition flags still set from loop counter
	movdqa	%xmm3, -16(%rdi)	C store the result.
	ret

L(endo):
	movq	%xmm3, -8(%rdi)		C an odd limb to store
	ret

EPILOGUE()


C version using unaligned loads but aligned stores:
C K8 (2.6 GHz)
C size 1:	16.926	c/l
C size 2:	8.470	c/l
C size 3	7.638	c/l (no ALIGN: 7.309)
C size 4:	5.736	c/l (no ALIGN: 5.731)
C size 5:	5.777	c/l
C size 6:	4.813	c/l
C size 8:	4.358	c/l (no ALIGN: 4.607)
C size 496:	3.052	c/l. (no ALIGN: 3.547, movhpd: 3.048) old: 3.039c/l with store commented out.
C size 10000	5.244	c/l (no ALIGN: 5.257. movhpd: 5.248)
C size 10000000	14.053	c/l. (no ALIGN: 14.529. movhpd: 14.005)

C Conroe:
C size 1	12.096	c/l  (no ALIGN, movhpd: 13.024)
C size 2	6.504	c/l  (no ALIGN, movhpd: 6.504)
C size 3	5.376	c/l  (no ALIGN: 5.312. unaligned movhpd stores: 5.355)
C size 4	4.040	c/l  (no ALIGN: 3.980, and one less icache line touched?) (movhpd: 4.180)
C size 5	4.013	c/l
C size 6	3.341	c/l
C size 8	2.988	c/l  (no ALIGN: 2.988) (movhpd: 3.294)
C size 496:	2.052	c/l. (no ALIGN: 2.052) (movhpd: 2.132)
C size 10000	2.320	c/l  (no ALIGN: 2.320) (movhpd: 2.656)
C size 10000000 11.178  c/l. (no ALIGN: 11.195. movhpd: 11.520))  (2.4GHz, g965, dual channel DDR800)


C Harpertown (2.8GHz):
C size 1	12.115	c/l.
C size 2	6.524	c/l
C size 3	5.444	c/l (no ALIGN: 5.289)
C size 4	4.083	c/l (no ALIGN: 3.961)
C size 5	4.217	c/l
C size 6	3.512	c/l
C size 8	3.229	c/l. (no ALIGN: 3.134)
C size 496	2.581	c/l. (no ALIGN: 2.558. movhpd: 2.562)
C size 10000	2.847	c/l  (no ALIGN: 2.968. movhpd: 3.566)
C size 10000000	14.460	c/l. (no ALIGN: 14.403. movhpd: 14.294)

C TODO: test this with electric fence to see if it goes off the end.

ASM_START()
ALIGN(64)
PROLOGUE(mpn_rshift_sse2)
	movq	(%rsi), %xmm7		C %7 = limb0
	movd	%ecx, %xmm1
	sub	$64, %ecx		C cnt must be <=64, so it's ok to operate on small version of it
	neg	%ecx			C we want 64-cnt in ecx as a shift count for getting the return value
	movq	%xmm7, %rax		C %rax = limb0
 	movd	%ecx, %xmm0		C %0=64-cnt=left count=lc; %1=cnt;  C this can go anywhere before the loop.
	shlq	%cl, %rax		C return value=limb0<<lc. shift count has to be in %cl.	 C modifies flags, so it has to go before the loop

	leaq	-8(%rsi,%rdx,8), %rsi     C rsi = rsi + rdx*8 = last limb.
	leaq	-16(%rdi,%rdx,8), %rdi	C rdi = rdi + rdx*8 - 16 = last limb - 1
	negq	%rdx
C rdx=-n; %0=64-cnt; %1=cnt;
C %7=limb0;
C	xor	%ecx, %ecx		C can clear rcx here or later with mov
	addq	$2, %rdx		C n= -n + 2
	jge	L(unal_out)			C skip the loop if n<=2

C	ALIGN(16)			C No alignment needed on Core 2.  This makes n even case only 128B of instructions.
L(unal_loop):
	movq	(%rsi,%rdx,8), %xmm6	C %6 = limb1
	movdqa	%xmm7,	%xmm3
	punpcklqdq %xmm6, %xmm3
	psrlq	%xmm1,	%xmm3

	movq	8(%rsi,%rdx,8), %xmm7	C %7 = limb2
	punpcklqdq %xmm7, %xmm6
	psllq	%xmm0,	%xmm6

	por	%xmm3,	%xmm6
C	movdqa	%xmm6,	(%rdi,%rdx,8)
	movq	%xmm6,	(%rdi,%rdx,8) C store the result.
	movhpd	%xmm6,	8(%rdi,%rdx,8) C store the result.


	addq	$2,	%rdx
	jl	L(unal_loop)

L(unal_out):
C %6=limb1; %7=limb0
C It's actually faster to mov $0 here instead of an xor before the loop.  It saves a ~0.2 cycles/limb on n=4.  It does make the function bigger.
	C todo: use shrd to shorten the outro and put the start of this fn within -128bytes of the jge in the non-sse2 version
	mov	$0, %ecx		C need to preserve the condition codes if we clear rcx here.
	cmovng	(%rsi), %rcx		C (%rsi,%rdx,8), but the move happens only when rdx is zero.
	movq	%rcx, %xmm6		C %6 = limb1, or 0 if we're off the end of the array.  (can't be garbage because it gets ORed with the result we want)

	punpcklqdq %xmm6, %xmm7
	psllq	%xmm0,	%xmm6
	psrlq	%xmm1,	%xmm7
	por	%xmm7,	%xmm6
	jg	L(unal_endodd)			C n = 1 limb left 	C condition flags still set from loop counter
C	movdqa	%xmm6, (%rdi)	C store the result.
	movq	%xmm6,	(%rdi)	C store the result.
	movhpd	%xmm6,	8(%rdi)	C store the result.
	ret

L(unal_endodd):
	movq	%xmm6, 8(%rdi)		C an odd limb to store
	ret

EPILOGUE()


C regs: args out/in: %rax / %rdi, %rsi, %rdx, %rcx, %r8, %r9
C callee saved: %rbx, %rbp, %r10, %r13-15
C %r11: used as a tmp in linking functions.  looks like it is used as a tmp in gcc-compiled functions, too.
C %r12: unused in C.  wrong, it is used. gcc-code saves and restores it.
C %rsp: stack pointer

C	non-SSE rshift for Core 2, basic structure copied from 32bit mpn/x86/rshift.asm

C note: core2 has 3 exec ports for add, only 1 for lea.  So don't use lea when an add will do, except to preserve flags

C fastest ever was 1.166 c/l, with an 8-limb unroll achieving 3.886 uops/clock (av. over the whole function) on SIZE=1001
C see email sent to gmp-devel for order of instructions in the loop.
C It had the loads before the stores in the bottom half of the loop, so wouldn't work with 4 regs.

C 4-limb unroll fits in the loop-stream buffer, and does at best 1.281 c/l.  3.922 uops/clock.

C n limbs.  last one is special, since nothing to shift in.
C unrolled loop does max(0, floor((n-1-pipedepth)/unroll)*unroll + pipedepth) limbs
C shortloop does the rest (-1)
C unroll=8 pipedepth=3:
C n=1.	end:1	unrolled:0	shortloop:0	-> end
C n=2.	end:1	unrolled:0	shortloop:1	shortloop -> end
C n=3.	end:1	unrolled:0	shortloop:2	shortloop -> end
C n=4.	end:1	unrolled:0+3	shortloop:0	unroll_setup -> cleanup -> end
C n=5.	end:1	unrolled:0+3	shortloop:1	unroll_setup -> cleanup -> shortloop -> end
C n=6.	end:1	unrolled:0+3	shortloop:2	unroll_setup -> cleanup -> shortloop -> end
C n=7.	end:1	unrolled:0+3	shortloop:3	unroll_setup -> cleanup -> shortloop -> end
C n=8.	end:1	unrolled:0+3	shortloop:4	unroll_setup -> cleanup -> shortloop -> end
C n=9.	end:1	unrolled:0+3	shortloop:5	unroll_setup -> cleanup -> shortloop -> end
C n=10	end:1	unrolled:0+3	shortloop:6	unroll_setup -> cleanup -> shortloop -> end
C n=11	end:1	unrolled:0+3	shortloop:7	unroll_setup -> cleanup -> shortloop -> end
C size>unroll+3:
C n=12	end:1	unrolled:8+3	shortloop:0	unroll_setup -> unrolled -> cleanup -> end
C n=13	end:1	unrolled:8+3	shortloop:1	unroll_setup -> unrolled -> cleanup -> shortloop -> end
C n=14	end:1	unrolled:8+3	shortloop:2	unroll_setup -> unrolled -> cleanup -> shortloop -> end
C n=15	end:1	unrolled:8+3	shortloop:3	unroll_setup -> unrolled -> cleanup -> shortloop -> end
C n=16	end:1	unrolled:8+3	shortloop:4	unroll_setup -> unrolled -> cleanup -> shortloop -> end


ASM_START()
C TODO: remove this, or make it 16
ALIGN(1024)
C ALIGN(16)
PROLOGUE(mpn_rshift_core2)
C 	C would like to get lots of instructions into the OOO execution engine early so it has plenty to work on...
C 	cmp	$16000,	%rdx
C 	jge	mpn_rshift_sse2		C TODO: and not penryn/harpertown unless we can use the aligned version


C still need to comment/uncomment the ALIGN(16)
C ################ controls for 4/8 limb unroll ################
 define(`C2_UNROLL8')
C define(`C2_8REG')  C only works with 8-limb unroll.

define(`unroll', ifdef(`C2_UNROLL8',8,4))

C regs for the loop.  use macros to make register reallocation easy.
define(`n',%rdx)
define(`reg1',%r9)
define(`reg2',%rax)		C %rax can't be reg1 or reg4/8
define(`reg3',%r8)  		C referenced the fewest times
define(`reg4',%r11)
C  pipeline depth is only 4, so we can avoid overlap with only 4 regs.
ifdef(`C2_8REG', `
 define(`reg5',%r12)
 define(`reg6',%r13)
 define(`reg7',%r14)
 define(`reg8',%r15)
	', `
 define(reg5,reg1)
 define(reg6,reg2)
 define(reg7,reg3)
 define(reg8,reg4)
')

ifdef(`C2_8REG', `
  	push	%r12
  	push	%r13
  	push	%r14
  	push	%r15
', )
C	movq	%rdi,	%xmm0	C FIXME: debugging only
C	movq	%rsi,	%xmm1	C FIXME: debugging only
	C shift count can stay where it is in %rcx
	mov	(%rsi), reg8		C reg8 = limb0
	xor	%eax,	%eax
	shrd	%cl,	reg8,	%rax	C %rax = ret val limb.  reg8 still = limb0
	push	%rax		C save retval.
C	jmp	L(c2_unrolled)	C faster to _not_ have this through the decoders on the first cycle, when n < 12.
C L(c2_unrolled):
	cmp	$3,	n	C if n-1>=pipeline depth, we can use that cleanup code
	jle	L(c2_entry)
	mov	8(%rsi),	reg1
	mov	16(%rsi),	reg2

	add	$32,	%rsi
	mov	(24-32)(%rsi),	reg3
	sub	$(3+unroll),	n      C n is still possibly > 2^32
	jg	L(c2_loop)
	C for large n, we've had 1 not taken jcc, and the above jcc is taken
	C for n>3, n<12, we've had 2 not taken jcc
C ALIGN(16)
L(c2_unrolled_cleanup):
	C require: reg8=limb(i).  reg1=limb(i+1)  reg2=limb(i+2)  reg3=limb(i+3).  n<=256
	C in 4-limb unroll, reg8->reg4

	shrd	%cl, reg1, reg8
	mov	reg8,	(%rdi)
	add	$24,	%rdi
	shrd	%cl, reg2, reg1
	mov	reg1,	(8-24)(%rdi)

	mov	reg3,	reg8

	shrd	%cl, reg3, reg2
	mov	reg2,	(16-24)(%rdi)

	add	$(unroll-1), %dl
	jz	L(c2_end)
C	jmp	L(c2_entry_after_unrolled)


L(c2_shortloop):
	mov	(%rsi),	reg1		C reg8=limb0 reg1=limb1
	shrd	%cl,	reg1,	reg8
	mov	reg8,	(%rdi)
	mov	reg1,	reg8
	add	$8,	%rdi
L(c2_entry):
	add	$8,	%rsi
L(c2_entry_after_unrolled):
	dec	%dl
C 	sub	$1,	%dl
	jg	L(c2_shortloop)

C loop last iter stores ith limb to dest, and loads i+1st limb from src
C	reg8=limb(i) reg1=limb(i+1).  %rdx=n-i-1, %rdx%4=0  %rsi -> limb(i+1)

C ALIGN(8) C would align the branch target.  only needed if near the end of a 16byte fetch, causing a bubble.
C  XXXXXXXXXXXXXX tail end of function
L(c2_end):
	pop	%rax			C return val
	shr	%cl,	reg8		C compute most significant limb
	mov	reg8,	(%rdi)		C store it
ifdef(`C2_8REG', `
 	pop	%r15
 	pop	%r14
 	pop	%r13
 	pop	%r12
',)
 	ret
C  XXXXXXXXXXXXXX tail end of function


C IP=124B from start.  Only 2 icache lines if we don't touch the unrolled loop

C require: reg1=limb1; reg2=limb2; reg3=limb3;  reg8=limb0

C 4-limb loop is <= 18 insn and <= 4 16byte aligned blocks, so fits into Core 2's loop stream buffer without ALIGN.
C If this is misaligned so it doesn't fit in the loob buffer, it runs ~1.57 c/l instead of ~1.33
C further unrolling will push it beyond the size of the loop stream detector.  (already close in bytes).
C 8 limbs/iter runs at 1.202 - 1.315 c/l with ALIGN(16).  (slow intro loop has to do more, though...)
C use simple addressing modes, not (%rdi,%rdx,8).  That generates too many reads of not-in-flight register values
C add/lea instructions placed by trial and error to avoid pipeline stalls.
define(`srcoff',0)
define(`dstoff',0)
   ALIGN(16)
L(c2_loop):			C does unroll*(%rdx / unroll) + 3 limbs
	shrd	%cl, reg1, reg8
	mov	reg8,	(%rdi)
	mov	(%rsi),		reg4
C	L(c2_10):
	ifdef(`C2_UNROLL8',,	`add	$32,	%rsi; define(`srcoff',32)')
	shrd	%cl, reg2, reg1
	mov	reg1,	(8-dstoff)(%rdi)
	mov	(8-srcoff)(%rsi),	reg5
	ifdef(`C2_UNROLL8',	`add	$64,	%rsi; define(`srcoff',64)',)
C	L(c2_01):
	shrd	%cl, reg3, reg2
	ifdef(`C2_UNROLL8',,	`add	$32,	%rdi; define(`dstoff',32)')
	mov	reg2,	(16-dstoff)(%rdi)
	mov	(16-srcoff)(%rsi),	reg6
C	L(c2_00):
	shrd	%cl, reg4, reg3
	ifdef(`C2_UNROLL8',,	`sub	$4, n')
	mov	reg3,	(24-dstoff)(%rdi)
	mov	(24-srcoff)(%rsi),	reg7


ifdef(`C2_UNROLL8', `
	shrd	%cl, reg5, reg4
	mov	reg4,	(32-dstoff)(%rdi)
	ifdef(`C2_UNROLL8',	`add	$64,	%rdi; define(`dstoff',64)',)
	mov	(32-srcoff)(%rsi),	reg8
C	L(c2_10):
	shrd	%cl, reg6, reg5
	mov	reg5,	(40-dstoff)(%rdi)
	mov	(40-srcoff)(%rsi),	reg1
C	L(c2_01):
	shrd	%cl, reg7, reg6
	mov	reg6,	(48-dstoff)(%rdi)
	mov	(48-srcoff)(%rsi),	reg2
C	L(c2_00):
	shrd	%cl, reg8, reg7
	ifdef(`C2_UNROLL8',	`sub	$8, n',)
	mov	reg7,	(56-dstoff)(%rdi)
	mov	(56-srcoff)(%rsi),	reg3
') C endif
L(c2_unrolled_tail):
	jg	L(c2_loop)

	C RIP=xxxc  if loop start was align(16).
	C might be optimal to duplicate a 3-byte insn here before the jmp
	jmp	L(c2_unrolled_cleanup)
C 236B total.  could duplicate some code down here...
EPILOGUE()