[initial working versions of sse2 rshift and test program peter@cordes.ca**20080314043242] { addfile ./rshift.asm hunk ./rshift.asm 1 +dnl AMD64 mpn_rshift + +dnl Copyright 2004, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../../config.m4') + + +C MMX version: +C cycles/limb +C AMD K8: 2.5 +C Intel P4: 4 +C Intel Core 2: 2.4 + + +C INPUT PARAMETERS +C rp rdi +C up rsi +C n rdx C not allowed to be negative. not doubling as a sign bit. +C cnt rcx + + +C could probably make seriously optimized versions for multiple-of-8 shift counts using SSE2 shuffle instructions. + +ASM_START() +PROLOGUE(mpn_rshift_sse2) + movdqa (%rsi), %xmm6 C %6 = limb0, limb1 + movd %ecx, %xmm1 + sub $64, %cl C cnt must be <=64, so it's ok to operate on small version of it + neg %cl C we want 64-cnt in ecx as a shift count for getting the return value + movd %ecx, %xmm0 C %0=64-cnt=left count=lc; %1=cnt; + + movq %xmm6, %rax C %rax = limb0 + shlq %cl, %rax C return value=limb0<>cnt; %6=limbs(1,2)<>c C NOTE xmm3 and xmm6 are shifted in opposite directions than in the code below. + punpckhqdq %xmm2, %xmm3 C %2=0,0; %3=limb1<>c,limb1>>c + por %xmm6, %xmm3 C %3=result limbs 0,1 + jg L(endo) C n = 1 limb left C condition flags still set from loop counter + movdqa %xmm3, -16(%rdi) C store the result. + ret + +L(endo): + movq %xmm3, -8(%rdi) C an odd limb to store + ret + +EPILOGUE() addfile ./shift.c hunk ./shift.c 1 +/* +Copyright 1996, 1998, 1999, 2000, 2001, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + + +#include +#include +#include "gmp.h" +#include "gmp-impl.h" +#include "tests.h" + +#ifndef OPERATION_lshift +#define OPERATION_rshift +#endif + +#ifdef OPERATION_lshift +#define func __gmpn_lshift +#define reffunc refmpn_lshift +#define funcname "mpn_lshift" +#endif + +#ifdef OPERATION_rshift +// #define func __gmpn_rshift +#define func mpn_rshift_sse2 +#define reffunc refmpn_rshift +#define funcname "mpn_rshift" +#endif + +// need proto or return type is 32bit int, not 64bit mp_limb_t. +mp_limb_t mpn_rshift_sse2(mp_limb_t *rp, const mp_limb_t *sp, mp_size_t n, unsigned int count ); + + +#if defined (USG) || defined (__SVR4) || defined (_UNICOS) || defined (__hpux) +#include + +int +cputime () +{ + if (CLOCKS_PER_SEC < 100000) + return clock () * 1000 / CLOCKS_PER_SEC; + return clock () / (CLOCKS_PER_SEC / 1000); +} +#else +#include +#include +#include + +int +cputime () +{ + struct rusage rus; + + getrusage (0, &rus); + return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000; +} +#endif + +static void mpn_print (mp_ptr, mp_size_t); + +#define M * 1000000 + +// tesla 2.4GHz Core 2 +#define CLOCK 2400000000 + +#ifndef CLOCK +#error "Don't know CLOCK of your machine" +#endif + +#ifndef OPS +#define OPS (CLOCK/5) +#endif +#ifndef SIZE +//#define SIZE 496 +#define SIZE 4965 +#endif +#ifndef TIMES +#define TIMES OPS/(SIZE+1) +#endif + +#ifndef CNT +int CNT = 13; +#endif + +main (int argc, char **argv) +{ + mp_ptr s1, dx, dy; + mp_limb_t cyx, cyy; + int i; + long t0, t; + unsigned int test; + int cnt = CNT; + mp_size_t size; + unsigned int ntests; + + s1 = malloc ((SIZE+1) * sizeof (mp_limb_t)); + dx = malloc ((SIZE + 3) * sizeof (mp_limb_t)); + dy = malloc ((SIZE + 3) * sizeof (mp_limb_t)); +// s1; // now even 8 byte aligned + dx++; // dx+1 is now 16byte aligned again. + dy++; + printf("s1&0xff = %lu\n", ((unsigned long)s1)&0xff); + + // gmp_randseed(RANDS, 12345); + gmp_randinit_default(RANDS); + gmp_randseed_ui(RANDS, 15); + + + ntests = ~(unsigned) 0; + if (argc == 2) + ntests = strtol (argv[1], 0, 0); + + for (test = 1; test <= ntests; test++) + { +#if TIMES == 1 && ! defined (PRINT) + if (test % (SIZE > 100000 ? 1 : 100000 / SIZE) == 0) + { + printf ("\r%u", test); + fflush (stdout); + } +#endif + +#if TIMES == 1 + cnt = random () % (GMP_NUMB_BITS - 1) + 1; +#endif + +#ifdef RANDOM + size = random () % SIZE + 1; +#else + size = SIZE; +#endif + + dx[0] = 0x87654321; + dy[0] = 0x87654321; + dx[size+1] = 0x12345678; + dy[size+1] = 0x12345678; + +#if TIMES != 1 + mpn_random (s1, size); + mpn_random (s1, size); + mpn_random (s1, size); + + t0 = cputime(); + for (i = 0; i < TIMES; i++) + func (dx+1, s1, size, cnt); + t = cputime() - t0; + printf ("%s: %5ldms (%.3f cycles/limb)\n", __func__, + t, ((double) t * CLOCK) / (TIMES * size * 1000.0)); +#endif + +#ifndef NOCHECK + mpn_random2 (s1, size); + +#ifdef PRINT + printf ("cnt=%-*d ", (int) (2 * sizeof(mp_limb_t)) - 4, cnt); + mpn_print (s1, size); +#endif + + /* Put garbage in the destination. */ + for (i = 0; i < size; i++) + { + dx[i+1] = 0xdead; + dy[i+1] = 0xbeef; + } + + cyx = reffunc (dx+1, s1, size, cnt); + cyy = func (dy+1, s1, size, cnt); + +#ifdef PRINT + mpn_print (&cyx, 1); + mpn_print (dx+1, size); + mpn_print (&cyy, 1); + mpn_print (dy+1, size); +#endif + + if (cyx != cyy || mpn_cmp (dx, dy, size+2) != 0 + || dx[0] != 0x87654321 || dx[size+1] != 0x12345678) + { +#ifndef PRINT + fputs(" src : ", stdout); // padding because source doesn't have guard limbs + mpn_print (s1, size); + fputs("reffunc: ", stdout); + mpn_print (&cyx, 1); + mpn_print (dx, size+2); // leave in the guard limbs + fputs(" func: ", stdout); + mpn_print (&cyy, 1); + mpn_print (dy, size+2); +#endif + printf ("\n"); + if (dy[0] != 0x87654321) + printf ("clobbered at low end\n"); + if (dy[size+1] != 0x12345678) + printf ("clobbered at high end\n"); + printf ("TEST NUMBER %u\n", test); + abort(); + } +#endif + } + exit (0); +} + +static void +mpn_print (mp_ptr p, mp_size_t size) +{ + mp_size_t i; + + for (i = size - 1; i >= 0; i--) + { +#ifdef _LONG_LONG_LIMB + printf ("%0*lX%0*lX", (int) (sizeof(mp_limb_t)), + (unsigned long) (p[i] >> (BITS_PER_MP_LIMB/2)), + (int) (sizeof(mp_limb_t)), (unsigned long) (p[i])); +#else + printf ("%0*lX", (int) (2 * sizeof(mp_limb_t)), p[i]); +#endif +#ifdef SPACE + if (i != 0) + printf (" "); +#endif + } + puts (""); +} }