[update benchmark comments
peter@cordes.ca**20080314174107] {
hunk ./rshift.asm 31
-C MMX:
+C MMX: (Conroe 2.4GHz. calls to dynamically linked lib function)
hunk ./rshift.asm 38
+C MMX: (Conroe 2.4GHz. calls to statically linked lib function)
+C size 1:		16.530  cycles
+C size 2:		9.547	cycles/limb
+C size 4:		5.775	cycles/limb
+C size 496:		2.445	cycles/limb  (same on 2.8GHz Harpertown)
hunk ./rshift.asm 44
+C MMX: (K8 2.6GHz (ACT cluster). calls to statically linked lib function.)
+C size 496:		2.553	cycles/limb
hunk ./rshift.asm 54
+C size 496, ACT 2.6G	3.788		cycles/limb.  Linux on dual 2218 2.6GHz.
hunk ./rshift.asm 59
-C Intel Core 2(64bit mode)
-C size 1:		13.080		cycles.
+C Intel Core 2(Conroe 2.4GHz.  calls to statically linked .o)
+C size 1:		13.024		cycles.
hunk ./rshift.asm 62
+C size 4:		4.275		cycles/limb (took a long time to settle.  often swung up to 8.800)
hunk ./rshift.asm 69
+C Intel Core 2(Harpertown 2.8GHz, system idle. calls to statically linked .o)
+C size 496:		2.087		cycles/limb (more measurement overhead?)
+
+
+C psllq has a latency of 2 cycles, throughput 1
+C 2 cycles/limb = 4 cycles per 2 limbs = 4 cycles for the whole loop
+C need to hide the latency of the shufpd?  (latency 1, throughput 1)
+C reversing the order of the shifts doesn't help.
+
+C are the 4 cycles the latency chain of movdqa (load) -> shufpd -> psllq -> por (?)
hunk ./rshift.asm 122
-C optimization: could maybe structure things so limb0,limb1 need the shuffle, to hide the latency
+C optimization: could maybe structure things so limb0,limb1 need the shuffle, to hide the latency.
+C probably would need another shuffle before storing, though
+
}