Add libgmp 6.2.0 to third_party

Don't build it yet.  That will come in the next review.

Change-Id: Idf3266558165e5ab45f4a41c98cc8c838c8244d5
diff --git a/third_party/gmp/tests/refmpn.c b/third_party/gmp/tests/refmpn.c
new file mode 100644
index 0000000..42bb411
--- /dev/null
+++ b/third_party/gmp/tests/refmpn.c
@@ -0,0 +1,2630 @@
+/* Reference mpn functions, designed to be simple, portable and independent
+   of the normal gmp code.  Speed isn't a consideration.
+
+Copyright 1996-2009, 2011-2014 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library test suite.
+
+The GNU MP Library test suite is free software; you can redistribute it
+and/or modify it under the terms of the GNU General Public License as
+published by the Free Software Foundation; either version 3 of the License,
+or (at your option) any later version.
+
+The GNU MP Library test suite is distributed in the hope that it will be
+useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+the GNU MP Library test suite.  If not, see https://www.gnu.org/licenses/.  */
+
+
+/* Most routines have assertions representing what the mpn routines are
+   supposed to accept.  Many of these reference routines do sensible things
+   outside these ranges (eg. for size==0), but the assertions are present to
+   pick up bad parameters passed here that are about to be passed the same
+   to a real mpn routine being compared.  */
+
+/* always do assertion checking */
+#define WANT_ASSERT  1
+
+#include <stdio.h>  /* for NULL */
+#include <stdlib.h> /* for malloc */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "tests.h"
+
+
+
+/* Return non-zero if regions {xp,xsize} and {yp,ysize} overlap, with sizes
+   in bytes. */
+int
+byte_overlap_p (const void *v_xp, mp_size_t xsize,
+		const void *v_yp, mp_size_t ysize)
+{
+  const char *xp = (const char *) v_xp;
+  const char *yp = (const char *) v_yp;
+
+  ASSERT (xsize >= 0);
+  ASSERT (ysize >= 0);
+
+  /* no wraparounds */
+  ASSERT (xp+xsize >= xp);
+  ASSERT (yp+ysize >= yp);
+
+  if (xp + xsize <= yp)
+    return 0;
+
+  if (yp + ysize <= xp)
+    return 0;
+
+  return 1;
+}
+
+/* Return non-zero if limb regions {xp,xsize} and {yp,ysize} overlap. */
+int
+refmpn_overlap_p (mp_srcptr xp, mp_size_t xsize, mp_srcptr yp, mp_size_t ysize)
+{
+  return byte_overlap_p (xp, xsize * GMP_LIMB_BYTES,
+			 yp, ysize * GMP_LIMB_BYTES);
+}
+
+/* Check overlap for a routine defined to work low to high. */
+int
+refmpn_overlap_low_to_high_p (mp_srcptr dst, mp_srcptr src, mp_size_t size)
+{
+  return (dst <= src || ! refmpn_overlap_p (dst, size, src, size));
+}
+
+/* Check overlap for a routine defined to work high to low. */
+int
+refmpn_overlap_high_to_low_p (mp_srcptr dst, mp_srcptr src, mp_size_t size)
+{
+  return (dst >= src || ! refmpn_overlap_p (dst, size, src, size));
+}
+
+/* Check overlap for a standard routine requiring equal or separate. */
+int
+refmpn_overlap_fullonly_p (mp_srcptr dst, mp_srcptr src, mp_size_t size)
+{
+  return (dst == src || ! refmpn_overlap_p (dst, size, src, size));
+}
+int
+refmpn_overlap_fullonly_two_p (mp_srcptr dst, mp_srcptr src1, mp_srcptr src2,
+			       mp_size_t size)
+{
+  return (refmpn_overlap_fullonly_p (dst, src1, size)
+	  && refmpn_overlap_fullonly_p (dst, src2, size));
+}
+
+
+mp_ptr
+refmpn_malloc_limbs (mp_size_t size)
+{
+  mp_ptr  p;
+  ASSERT (size >= 0);
+  if (size == 0)
+    size = 1;
+  p = (mp_ptr) malloc ((size_t) (size * GMP_LIMB_BYTES));
+  ASSERT (p != NULL);
+  return p;
+}
+
+/* Free limbs allocated by refmpn_malloc_limbs. NOTE: Can't free
+ * memory allocated by refmpn_malloc_limbs_aligned. */
+void
+refmpn_free_limbs (mp_ptr p)
+{
+  free (p);
+}
+
+mp_ptr
+refmpn_memdup_limbs (mp_srcptr ptr, mp_size_t size)
+{
+  mp_ptr  p;
+  p = refmpn_malloc_limbs (size);
+  refmpn_copyi (p, ptr, size);
+  return p;
+}
+
+/* malloc n limbs on a boundary that is a multiple of m bytes */
+mp_ptr
+refmpn_malloc_limbs_aligned (mp_size_t n, size_t m)
+{
+  return (mp_ptr) align_pointer (refmpn_malloc_limbs (n + m-1), m);
+}
+
+
+void
+refmpn_fill (mp_ptr ptr, mp_size_t size, mp_limb_t value)
+{
+  mp_size_t  i;
+  ASSERT (size >= 0);
+  for (i = 0; i < size; i++)
+    ptr[i] = value;
+}
+
+void
+refmpn_zero (mp_ptr ptr, mp_size_t size)
+{
+  refmpn_fill (ptr, size, CNST_LIMB(0));
+}
+
+void
+refmpn_zero_extend (mp_ptr ptr, mp_size_t oldsize, mp_size_t newsize)
+{
+  ASSERT (newsize >= oldsize);
+  refmpn_zero (ptr+oldsize, newsize-oldsize);
+}
+
+int
+refmpn_zero_p (mp_srcptr ptr, mp_size_t size)
+{
+  mp_size_t  i;
+  for (i = 0; i < size; i++)
+    if (ptr[i] != 0)
+      return 0;
+  return 1;
+}
+
+mp_size_t
+refmpn_normalize (mp_srcptr ptr, mp_size_t size)
+{
+  ASSERT (size >= 0);
+  while (size > 0 && ptr[size-1] == 0)
+    size--;
+  return size;
+}
+
+/* the highest one bit in x */
+mp_limb_t
+refmpn_msbone (mp_limb_t x)
+{
+  mp_limb_t  n = (mp_limb_t) 1 << (GMP_LIMB_BITS-1);
+
+  while (n != 0)
+    {
+      if (x & n)
+	break;
+      n >>= 1;
+    }
+  return n;
+}
+
+/* a mask of the highest one bit and all bits below */
+mp_limb_t
+refmpn_msbone_mask (mp_limb_t x)
+{
+  if (x == 0)
+    return 0;
+
+  return (refmpn_msbone (x) << 1) - 1;
+}
+
+/* How many digits in the given base will fit in a limb.
+   Notice that the product b is allowed to be equal to the limit
+   2^GMP_NUMB_BITS; this ensures the result for base==2 will be
+   GMP_NUMB_BITS (and similarly for other powers of 2).  */
+int
+refmpn_chars_per_limb (int base)
+{
+  mp_limb_t  limit[2], b[2];
+  int        chars_per_limb;
+
+  ASSERT (base >= 2);
+
+  limit[0] = 0;  /* limit = 2^GMP_NUMB_BITS */
+  limit[1] = 1;
+  b[0] = 1;      /* b = 1 */
+  b[1] = 0;
+
+  chars_per_limb = 0;
+  for (;;)
+    {
+      if (refmpn_mul_1 (b, b, (mp_size_t) 2, (mp_limb_t) base))
+	break;
+      if (refmpn_cmp (b, limit, (mp_size_t) 2) > 0)
+	break;
+      chars_per_limb++;
+    }
+  return chars_per_limb;
+}
+
+/* The biggest value base**n which fits in GMP_NUMB_BITS. */
+mp_limb_t
+refmpn_big_base (int base)
+{
+  int        chars_per_limb = refmpn_chars_per_limb (base);
+  int        i;
+  mp_limb_t  bb;
+
+  ASSERT (base >= 2);
+  bb = 1;
+  for (i = 0; i < chars_per_limb; i++)
+    bb *= base;
+  return bb;
+}
+
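+/* For example, with the two routines above and assuming a 64-bit limb
+   with no nail bits: base 10 gives chars_per_limb == 19 and
+   big_base == 10^19, since 10^19 < 2^64 <= 10^20; for base 2 the product
+   is allowed to reach the limit 2^64 itself, so chars_per_limb comes out
+   as GMP_NUMB_BITS == 64.  */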
+
+void
+refmpn_setbit (mp_ptr ptr, unsigned long bit)
+{
+  ptr[bit/GMP_NUMB_BITS] |= CNST_LIMB(1) << (bit%GMP_NUMB_BITS);
+}
+
+void
+refmpn_clrbit (mp_ptr ptr, unsigned long bit)
+{
+  ptr[bit/GMP_NUMB_BITS] &= ~ (CNST_LIMB(1) << (bit%GMP_NUMB_BITS));
+}
+
+#define REFMPN_TSTBIT(ptr,bit) \
+  (((ptr)[(bit)/GMP_NUMB_BITS] & (CNST_LIMB(1) << ((bit)%GMP_NUMB_BITS))) != 0)
+
+int
+refmpn_tstbit (mp_srcptr ptr, unsigned long bit)
+{
+  return REFMPN_TSTBIT (ptr, bit);
+}
+
+unsigned long
+refmpn_scan0 (mp_srcptr ptr, unsigned long bit)
+{
+  while (REFMPN_TSTBIT (ptr, bit) != 0)
+    bit++;
+  return bit;
+}
+
+unsigned long
+refmpn_scan1 (mp_srcptr ptr, unsigned long bit)
+{
+  while (REFMPN_TSTBIT (ptr, bit) == 0)
+    bit++;
+  return bit;
+}
+
+void
+refmpn_copy (mp_ptr rp, mp_srcptr sp, mp_size_t size)
+{
+  ASSERT (refmpn_overlap_fullonly_p (rp, sp, size));
+  refmpn_copyi (rp, sp, size);
+}
+
+void
+refmpn_copyi (mp_ptr rp, mp_srcptr sp, mp_size_t size)
+{
+  mp_size_t i;
+
+  ASSERT (refmpn_overlap_low_to_high_p (rp, sp, size));
+  ASSERT (size >= 0);
+
+  for (i = 0; i < size; i++)
+    rp[i] = sp[i];
+}
+
+void
+refmpn_copyd (mp_ptr rp, mp_srcptr sp, mp_size_t size)
+{
+  mp_size_t i;
+
+  ASSERT (refmpn_overlap_high_to_low_p (rp, sp, size));
+  ASSERT (size >= 0);
+
+  for (i = size-1; i >= 0; i--)
+    rp[i] = sp[i];
+}
+
+/* Copy {xp,xsize} to {wp,wsize}.  If x is shorter, then pad w with low
+   zeros to wsize.  If x is longer, then copy just the high wsize limbs.  */
+void
+refmpn_copy_extend (mp_ptr wp, mp_size_t wsize, mp_srcptr xp, mp_size_t xsize)
+{
+  ASSERT (wsize >= 0);
+  ASSERT (xsize >= 0);
+
+  /* high part of x if x bigger than w */
+  if (xsize > wsize)
+    {
+      xp += xsize - wsize;
+      xsize = wsize;
+    }
+
+  refmpn_copy (wp + wsize-xsize, xp, xsize);
+  refmpn_zero (wp, wsize-xsize);
+}
+
+int
+refmpn_cmp (mp_srcptr xp, mp_srcptr yp, mp_size_t size)
+{
+  mp_size_t  i;
+
+  ASSERT (size >= 1);
+  ASSERT_MPN (xp, size);
+  ASSERT_MPN (yp, size);
+
+  for (i = size-1; i >= 0; i--)
+    {
+      if (xp[i] > yp[i])  return 1;
+      if (xp[i] < yp[i])  return -1;
+    }
+  return 0;
+}
+
+int
+refmpn_cmp_allowzero (mp_srcptr xp, mp_srcptr yp, mp_size_t size)
+{
+  if (size == 0)
+    return 0;
+  else
+    return refmpn_cmp (xp, yp, size);
+}
+
+int
+refmpn_cmp_twosizes (mp_srcptr xp, mp_size_t xsize,
+		     mp_srcptr yp, mp_size_t ysize)
+{
+  int  opp, cmp;
+
+  ASSERT_MPN (xp, xsize);
+  ASSERT_MPN (yp, ysize);
+
+  opp = (xsize < ysize);
+  if (opp)
+    MPN_SRCPTR_SWAP (xp,xsize, yp,ysize);
+
+  if (! refmpn_zero_p (xp+ysize, xsize-ysize))
+    cmp = 1;
+  else
+    cmp = refmpn_cmp (xp, yp, ysize);
+
+  return (opp ? -cmp : cmp);
+}
+
+int
+refmpn_equal_anynail (mp_srcptr xp, mp_srcptr yp, mp_size_t size)
+{
+  mp_size_t  i;
+  ASSERT (size >= 0);
+
+  for (i = 0; i < size; i++)
+      if (xp[i] != yp[i])
+	return 0;
+  return 1;
+}
+
+
+#define LOGOPS(operation)                                               \
+  {                                                                     \
+    mp_size_t  i;                                                       \
+									\
+    ASSERT (refmpn_overlap_fullonly_two_p (rp, s1p, s2p, size));        \
+    ASSERT (size >= 1);                                                 \
+    ASSERT_MPN (s1p, size);                                             \
+    ASSERT_MPN (s2p, size);                                             \
+									\
+    for (i = 0; i < size; i++)                                          \
+      rp[i] = operation;                                                \
+  }
+
+void
+refmpn_and_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  LOGOPS (s1p[i] & s2p[i]);
+}
+void
+refmpn_andn_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  LOGOPS (s1p[i] & ~s2p[i]);
+}
+void
+refmpn_nand_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  LOGOPS ((s1p[i] & s2p[i]) ^ GMP_NUMB_MASK);
+}
+void
+refmpn_ior_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  LOGOPS (s1p[i] | s2p[i]);
+}
+void
+refmpn_iorn_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  LOGOPS (s1p[i] | (s2p[i] ^ GMP_NUMB_MASK));
+}
+void
+refmpn_nior_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  LOGOPS ((s1p[i] | s2p[i]) ^ GMP_NUMB_MASK);
+}
+void
+refmpn_xor_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  LOGOPS (s1p[i] ^ s2p[i]);
+}
+void
+refmpn_xnor_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  LOGOPS ((s1p[i] ^ s2p[i]) ^ GMP_NUMB_MASK);
+}
+
+
+/* set *dh,*dl to mh:ml - sh:sl, in full limbs */
+void
+refmpn_sub_ddmmss (mp_limb_t *dh, mp_limb_t *dl,
+		   mp_limb_t mh, mp_limb_t ml, mp_limb_t sh, mp_limb_t sl)
+{
+  *dl = ml - sl;
+  *dh = mh - sh - (ml < sl);
+}
+
+
+/* set *w to x+y, return 0 or 1 carry */
+mp_limb_t
+ref_addc_limb (mp_limb_t *w, mp_limb_t x, mp_limb_t y)
+{
+  mp_limb_t  sum, cy;
+
+  ASSERT_LIMB (x);
+  ASSERT_LIMB (y);
+
+  sum = x + y;
+#if GMP_NAIL_BITS == 0
+  *w = sum;
+  cy = (sum < x);
+#else
+  *w = sum & GMP_NUMB_MASK;
+  cy = (sum >> GMP_NUMB_BITS);
+#endif
+  return cy;
+}
+
+/* set *w to x-y, return 0 or 1 borrow */
+mp_limb_t
+ref_subc_limb (mp_limb_t *w, mp_limb_t x, mp_limb_t y)
+{
+  mp_limb_t  diff, cy;
+
+  ASSERT_LIMB (x);
+  ASSERT_LIMB (y);
+
+  diff = x - y;
+#if GMP_NAIL_BITS == 0
+  *w = diff;
+  cy = (diff > x);
+#else
+  *w = diff & GMP_NUMB_MASK;
+  cy = (diff >> GMP_NUMB_BITS) & 1;
+#endif
+  return cy;
+}
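+
+/* In the non-nail case the carry and borrow tests above rely on unsigned
+   wraparound: if x + y overflows then the truncated sum equals
+   x + y - 2^GMP_LIMB_BITS and is therefore smaller than x, so (sum < x)
+   is exactly the carry; symmetrically (diff > x) detects the borrow from
+   x - y.  */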
+
+/* set *w to x+y+c (where c is 0 or 1), return 0 or 1 carry */
+mp_limb_t
+adc (mp_limb_t *w, mp_limb_t x, mp_limb_t y, mp_limb_t c)
+{
+  mp_limb_t  r;
+
+  ASSERT_LIMB (x);
+  ASSERT_LIMB (y);
+  ASSERT (c == 0 || c == 1);
+
+  r = ref_addc_limb (w, x, y);
+  return r + ref_addc_limb (w, *w, c);
+}
+
+/* set *w to x-y-c (where c is 0 or 1), return 0 or 1 borrow */
+mp_limb_t
+sbb (mp_limb_t *w, mp_limb_t x, mp_limb_t y, mp_limb_t c)
+{
+  mp_limb_t  r;
+
+  ASSERT_LIMB (x);
+  ASSERT_LIMB (y);
+  ASSERT (c == 0 || c == 1);
+
+  r = ref_subc_limb (w, x, y);
+  return r + ref_subc_limb (w, *w, c);
+}
+
+
+#define AORS_1(operation)                               \
+  {                                                     \
+    mp_size_t  i;                                       \
+							\
+    ASSERT (refmpn_overlap_fullonly_p (rp, sp, size));  \
+    ASSERT (size >= 1);                                 \
+    ASSERT_MPN (sp, size);                              \
+    ASSERT_LIMB (n);                                    \
+							\
+    for (i = 0; i < size; i++)                          \
+      n = operation (&rp[i], sp[i], n);                 \
+    return n;                                           \
+  }
+
+mp_limb_t
+refmpn_add_1 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_limb_t n)
+{
+  AORS_1 (ref_addc_limb);
+}
+mp_limb_t
+refmpn_sub_1 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_limb_t n)
+{
+  AORS_1 (ref_subc_limb);
+}
+
+#define AORS_NC(operation)                                              \
+  {                                                                     \
+    mp_size_t  i;                                                       \
+									\
+    ASSERT (refmpn_overlap_fullonly_two_p (rp, s1p, s2p, size));        \
+    ASSERT (carry == 0 || carry == 1);                                  \
+    ASSERT (size >= 1);                                                 \
+    ASSERT_MPN (s1p, size);                                             \
+    ASSERT_MPN (s2p, size);                                             \
+									\
+    for (i = 0; i < size; i++)                                          \
+      carry = operation (&rp[i], s1p[i], s2p[i], carry);                \
+    return carry;                                                       \
+  }
+
+mp_limb_t
+refmpn_add_nc (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size,
+	       mp_limb_t carry)
+{
+  AORS_NC (adc);
+}
+mp_limb_t
+refmpn_sub_nc (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size,
+	       mp_limb_t carry)
+{
+  AORS_NC (sbb);
+}
+
+
+mp_limb_t
+refmpn_add_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  return refmpn_add_nc (rp, s1p, s2p, size, CNST_LIMB(0));
+}
+mp_limb_t
+refmpn_sub_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  return refmpn_sub_nc (rp, s1p, s2p, size, CNST_LIMB(0));
+}
+
+mp_limb_t
+refmpn_cnd_add_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  if (cnd != 0)
+    return refmpn_add_n (rp, s1p, s2p, size);
+  else
+    {
+      refmpn_copyi (rp, s1p, size);
+      return 0;
+    }
+}
+mp_limb_t
+refmpn_cnd_sub_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  if (cnd != 0)
+    return refmpn_sub_n (rp, s1p, s2p, size);
+  else
+    {
+      refmpn_copyi (rp, s1p, size);
+      return 0;
+    }
+}
+
+
+#define AORS_ERR1_N(operation)						\
+  {                                                                     \
+    mp_size_t  i;                                                       \
+    mp_limb_t carry2;							\
+									\
+    ASSERT (refmpn_overlap_fullonly_p (rp, s1p, size));			\
+    ASSERT (refmpn_overlap_fullonly_p (rp, s2p, size));			\
+    ASSERT (! refmpn_overlap_p (rp, size, yp, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 2, s1p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 2, s2p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 2, yp, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 2, rp, size));			\
+									\
+    ASSERT (carry == 0 || carry == 1);					\
+    ASSERT (size >= 1);							\
+    ASSERT_MPN (s1p, size);						\
+    ASSERT_MPN (s2p, size);						\
+    ASSERT_MPN (yp, size);						\
+									\
+    ep[0] = ep[1] = CNST_LIMB(0);					\
+									\
+    for (i = 0; i < size; i++)                                          \
+      {									\
+	carry = operation (&rp[i], s1p[i], s2p[i], carry);		\
+	if (carry == 1)							\
+	  {								\
+	    carry2 = ref_addc_limb (&ep[0], ep[0], yp[size - 1 - i]);	\
+	    carry2 = ref_addc_limb (&ep[1], ep[1], carry2);		\
+	    ASSERT (carry2 == 0);					\
+	  }								\
+      }									\
+    return carry;                                                       \
+  }
+
+mp_limb_t
+refmpn_add_err1_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p,
+		   mp_ptr ep, mp_srcptr yp,
+		   mp_size_t size, mp_limb_t carry)
+{
+  AORS_ERR1_N (adc);
+}
+mp_limb_t
+refmpn_sub_err1_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p,
+		   mp_ptr ep, mp_srcptr yp,
+		   mp_size_t size, mp_limb_t carry)
+{
+  AORS_ERR1_N (sbb);
+}
+
+
+#define AORS_ERR2_N(operation)						\
+  {                                                                     \
+    mp_size_t  i;                                                       \
+    mp_limb_t carry2;							\
+									\
+    ASSERT (refmpn_overlap_fullonly_p (rp, s1p, size));			\
+    ASSERT (refmpn_overlap_fullonly_p (rp, s2p, size));			\
+    ASSERT (! refmpn_overlap_p (rp, size, y1p, size));			\
+    ASSERT (! refmpn_overlap_p (rp, size, y2p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 4, s1p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 4, s2p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 4, y1p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 4, y2p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 4, rp, size));			\
+									\
+    ASSERT (carry == 0 || carry == 1);					\
+    ASSERT (size >= 1);							\
+    ASSERT_MPN (s1p, size);						\
+    ASSERT_MPN (s2p, size);						\
+    ASSERT_MPN (y1p, size);						\
+    ASSERT_MPN (y2p, size);						\
+									\
+    ep[0] = ep[1] = CNST_LIMB(0);					\
+    ep[2] = ep[3] = CNST_LIMB(0);					\
+									\
+    for (i = 0; i < size; i++)                                          \
+      {									\
+	carry = operation (&rp[i], s1p[i], s2p[i], carry);		\
+	if (carry == 1)							\
+	  {								\
+	    carry2 = ref_addc_limb (&ep[0], ep[0], y1p[size - 1 - i]);	\
+	    carry2 = ref_addc_limb (&ep[1], ep[1], carry2);		\
+	    ASSERT (carry2 == 0);					\
+	    carry2 = ref_addc_limb (&ep[2], ep[2], y2p[size - 1 - i]);	\
+	    carry2 = ref_addc_limb (&ep[3], ep[3], carry2);		\
+	    ASSERT (carry2 == 0);					\
+	  }								\
+      }									\
+    return carry;                                                       \
+  }
+
+mp_limb_t
+refmpn_add_err2_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p,
+		   mp_ptr ep, mp_srcptr y1p, mp_srcptr y2p,
+		   mp_size_t size, mp_limb_t carry)
+{
+  AORS_ERR2_N (adc);
+}
+mp_limb_t
+refmpn_sub_err2_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p,
+		   mp_ptr ep, mp_srcptr y1p, mp_srcptr y2p,
+		   mp_size_t size, mp_limb_t carry)
+{
+  AORS_ERR2_N (sbb);
+}
+
+
+#define AORS_ERR3_N(operation)						\
+  {                                                                     \
+    mp_size_t  i;                                                       \
+    mp_limb_t carry2;							\
+									\
+    ASSERT (refmpn_overlap_fullonly_p (rp, s1p, size));			\
+    ASSERT (refmpn_overlap_fullonly_p (rp, s2p, size));			\
+    ASSERT (! refmpn_overlap_p (rp, size, y1p, size));			\
+    ASSERT (! refmpn_overlap_p (rp, size, y2p, size));			\
+    ASSERT (! refmpn_overlap_p (rp, size, y3p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 6, s1p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 6, s2p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 6, y1p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 6, y2p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 6, y3p, size));			\
+    ASSERT (! refmpn_overlap_p (ep, 6, rp, size));			\
+									\
+    ASSERT (carry == 0 || carry == 1);					\
+    ASSERT (size >= 1);							\
+    ASSERT_MPN (s1p, size);						\
+    ASSERT_MPN (s2p, size);						\
+    ASSERT_MPN (y1p, size);						\
+    ASSERT_MPN (y2p, size);						\
+    ASSERT_MPN (y3p, size);						\
+									\
+    ep[0] = ep[1] = CNST_LIMB(0);					\
+    ep[2] = ep[3] = CNST_LIMB(0);					\
+    ep[4] = ep[5] = CNST_LIMB(0);					\
+									\
+    for (i = 0; i < size; i++)                                          \
+      {									\
+	carry = operation (&rp[i], s1p[i], s2p[i], carry);		\
+	if (carry == 1)							\
+	  {								\
+	    carry2 = ref_addc_limb (&ep[0], ep[0], y1p[size - 1 - i]);	\
+	    carry2 = ref_addc_limb (&ep[1], ep[1], carry2);		\
+	    ASSERT (carry2 == 0);					\
+	    carry2 = ref_addc_limb (&ep[2], ep[2], y2p[size - 1 - i]);	\
+	    carry2 = ref_addc_limb (&ep[3], ep[3], carry2);		\
+	    ASSERT (carry2 == 0);					\
+	    carry2 = ref_addc_limb (&ep[4], ep[4], y3p[size - 1 - i]);	\
+	    carry2 = ref_addc_limb (&ep[5], ep[5], carry2);		\
+	    ASSERT (carry2 == 0);					\
+	  }								\
+      }									\
+    return carry;                                                       \
+  }
+
+mp_limb_t
+refmpn_add_err3_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p,
+		   mp_ptr ep, mp_srcptr y1p, mp_srcptr y2p, mp_srcptr y3p,
+		   mp_size_t size, mp_limb_t carry)
+{
+  AORS_ERR3_N (adc);
+}
+mp_limb_t
+refmpn_sub_err3_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p,
+		   mp_ptr ep, mp_srcptr y1p, mp_srcptr y2p, mp_srcptr y3p,
+		   mp_size_t size, mp_limb_t carry)
+{
+  AORS_ERR3_N (sbb);
+}
+
+
+mp_limb_t
+refmpn_addlsh_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+		 mp_size_t n, unsigned int s)
+{
+  mp_limb_t cy;
+  mp_ptr tp;
+
+  ASSERT (refmpn_overlap_fullonly_two_p (rp, up, vp, n));
+  ASSERT (n >= 1);
+  ASSERT (0 < s && s < GMP_NUMB_BITS);
+  ASSERT_MPN (up, n);
+  ASSERT_MPN (vp, n);
+
+  tp = refmpn_malloc_limbs (n);
+  cy  = refmpn_lshift (tp, vp, n, s);
+  cy += refmpn_add_n (rp, up, tp, n);
+  free (tp);
+  return cy;
+}
+mp_limb_t
+refmpn_addlsh1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_addlsh_n (rp, up, vp, n, 1);
+}
+mp_limb_t
+refmpn_addlsh2_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_addlsh_n (rp, up, vp, n, 2);
+}
+mp_limb_t
+refmpn_addlsh_n_ip1 (mp_ptr rp, mp_srcptr vp, mp_size_t n, unsigned int s)
+{
+  return refmpn_addlsh_n (rp, rp, vp, n, s);
+}
+mp_limb_t
+refmpn_addlsh1_n_ip1 (mp_ptr rp, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_addlsh_n (rp, rp, vp, n, 1);
+}
+mp_limb_t
+refmpn_addlsh2_n_ip1 (mp_ptr rp, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_addlsh_n (rp, rp, vp, n, 2);
+}
+mp_limb_t
+refmpn_addlsh_n_ip2 (mp_ptr rp, mp_srcptr vp, mp_size_t n, unsigned int s)
+{
+  return refmpn_addlsh_n (rp, vp, rp, n, s);
+}
+mp_limb_t
+refmpn_addlsh1_n_ip2 (mp_ptr rp, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_addlsh_n (rp, vp, rp, n, 1);
+}
+mp_limb_t
+refmpn_addlsh2_n_ip2 (mp_ptr rp, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_addlsh_n (rp, vp, rp, n, 2);
+}
+mp_limb_t
+refmpn_addlsh_nc (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+		  mp_size_t n, unsigned int s, mp_limb_t carry)
+{
+  mp_limb_t cy;
+
+  ASSERT (carry <= (CNST_LIMB(1) << s));
+
+  cy = refmpn_addlsh_n (rp, up, vp, n, s);
+  cy += refmpn_add_1 (rp, rp, n, carry);
+  return cy;
+}
+mp_limb_t
+refmpn_addlsh1_nc (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, mp_limb_t carry)
+{
+  return refmpn_addlsh_nc (rp, up, vp, n, 1, carry);
+}
+mp_limb_t
+refmpn_addlsh2_nc (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, mp_limb_t carry)
+{
+  return refmpn_addlsh_nc (rp, up, vp, n, 2, carry);
+}
+
+mp_limb_t
+refmpn_sublsh_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+		 mp_size_t n, unsigned int s)
+{
+  mp_limb_t cy;
+  mp_ptr tp;
+
+  ASSERT (refmpn_overlap_fullonly_two_p (rp, up, vp, n));
+  ASSERT (n >= 1);
+  ASSERT (0 < s && s < GMP_NUMB_BITS);
+  ASSERT_MPN (up, n);
+  ASSERT_MPN (vp, n);
+
+  tp = refmpn_malloc_limbs (n);
+  cy  = mpn_lshift (tp, vp, n, s);
+  cy += mpn_sub_n (rp, up, tp, n);
+  free (tp);
+  return cy;
+}
+mp_limb_t
+refmpn_sublsh1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_sublsh_n (rp, up, vp, n, 1);
+}
+mp_limb_t
+refmpn_sublsh2_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_sublsh_n (rp, up, vp, n, 2);
+}
+mp_limb_t
+refmpn_sublsh_n_ip1 (mp_ptr rp, mp_srcptr vp, mp_size_t n, unsigned int s)
+{
+  return refmpn_sublsh_n (rp, rp, vp, n, s);
+}
+mp_limb_t
+refmpn_sublsh1_n_ip1 (mp_ptr rp, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_sublsh_n (rp, rp, vp, n, 1);
+}
+mp_limb_t
+refmpn_sublsh2_n_ip1 (mp_ptr rp, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_sublsh_n (rp, rp, vp, n, 2);
+}
+mp_limb_t
+refmpn_sublsh_n_ip2 (mp_ptr rp, mp_srcptr vp, mp_size_t n, unsigned int s)
+{
+  return refmpn_sublsh_n (rp, vp, rp, n, s);
+}
+mp_limb_t
+refmpn_sublsh1_n_ip2 (mp_ptr rp, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_sublsh_n (rp, vp, rp, n, 1);
+}
+mp_limb_t
+refmpn_sublsh2_n_ip2 (mp_ptr rp, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_sublsh_n (rp, vp, rp, n, 2);
+}
+mp_limb_t
+refmpn_sublsh_nc (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+		  mp_size_t n, unsigned int s, mp_limb_t carry)
+{
+  mp_limb_t cy;
+
+  ASSERT (carry <= (CNST_LIMB(1) << s));
+
+  cy = refmpn_sublsh_n (rp, up, vp, n, s);
+  cy += refmpn_sub_1 (rp, rp, n, carry);
+  return cy;
+}
+mp_limb_t
+refmpn_sublsh1_nc (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, mp_limb_t carry)
+{
+  return refmpn_sublsh_nc (rp, up, vp, n, 1, carry);
+}
+mp_limb_t
+refmpn_sublsh2_nc (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, mp_limb_t carry)
+{
+  return refmpn_sublsh_nc (rp, up, vp, n, 2, carry);
+}
+
+mp_limb_signed_t
+refmpn_rsblsh_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+		 mp_size_t n, unsigned int s)
+{
+  mp_limb_signed_t cy;
+  mp_ptr tp;
+
+  ASSERT (refmpn_overlap_fullonly_two_p (rp, up, vp, n));
+  ASSERT (n >= 1);
+  ASSERT (0 < s && s < GMP_NUMB_BITS);
+  ASSERT_MPN (up, n);
+  ASSERT_MPN (vp, n);
+
+  tp = refmpn_malloc_limbs (n);
+  cy  = mpn_lshift (tp, vp, n, s);
+  cy -= mpn_sub_n (rp, tp, up, n);
+  free (tp);
+  return cy;
+}
+mp_limb_signed_t
+refmpn_rsblsh1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_rsblsh_n (rp, up, vp, n, 1);
+}
+mp_limb_signed_t
+refmpn_rsblsh2_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  return refmpn_rsblsh_n (rp, up, vp, n, 2);
+}
+mp_limb_signed_t
+refmpn_rsblsh_nc (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+		  mp_size_t n, unsigned int s, mp_limb_signed_t carry)
+{
+  mp_limb_signed_t cy;
+
+  ASSERT (carry == -1 || (carry >> s) == 0);
+
+  cy = refmpn_rsblsh_n (rp, up, vp, n, s);
+  if (carry > 0)
+    cy += refmpn_add_1 (rp, rp, n, carry);
+  else
+    cy -= refmpn_sub_1 (rp, rp, n, -carry);
+  return cy;
+}
+mp_limb_signed_t
+refmpn_rsblsh1_nc (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, mp_limb_signed_t carry)
+{
+  return refmpn_rsblsh_nc (rp, up, vp, n, 1, carry);
+}
+mp_limb_signed_t
+refmpn_rsblsh2_nc (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, mp_limb_signed_t carry)
+{
+  return refmpn_rsblsh_nc (rp, up, vp, n, 2, carry);
+}
+
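+/* rsh1add_n and rsh1sub_n below form the full sum resp. difference, shift
+   it right one bit, put the carry resp. borrow out of the top limb back in
+   as the new most significant bit, and return the bit shifted out at the
+   bottom (the low bit of u+v resp. u-v).  */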
+mp_limb_t
+refmpn_rsh1add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t cya, cys;
+
+  ASSERT (refmpn_overlap_fullonly_two_p (rp, up, vp, n));
+  ASSERT (n >= 1);
+  ASSERT_MPN (up, n);
+  ASSERT_MPN (vp, n);
+
+  cya = mpn_add_n (rp, up, vp, n);
+  cys = mpn_rshift (rp, rp, n, 1) >> (GMP_NUMB_BITS - 1);
+  rp[n - 1] |= cya << (GMP_NUMB_BITS - 1);
+  return cys;
+}
+mp_limb_t
+refmpn_rsh1sub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t cya, cys;
+
+  ASSERT (refmpn_overlap_fullonly_two_p (rp, up, vp, n));
+  ASSERT (n >= 1);
+  ASSERT_MPN (up, n);
+  ASSERT_MPN (vp, n);
+
+  cya = mpn_sub_n (rp, up, vp, n);
+  cys = mpn_rshift (rp, rp, n, 1) >> (GMP_NUMB_BITS - 1);
+  rp[n - 1] |= cya << (GMP_NUMB_BITS - 1);
+  return cys;
+}
+
+/* Twos complement, return borrow. */
+mp_limb_t
+refmpn_neg (mp_ptr dst, mp_srcptr src, mp_size_t size)
+{
+  mp_ptr     zeros;
+  mp_limb_t  ret;
+
+  ASSERT (size >= 1);
+
+  zeros = refmpn_malloc_limbs (size);
+  refmpn_fill (zeros, size, CNST_LIMB(0));
+  ret = refmpn_sub_n (dst, zeros, src, size);
+  free (zeros);
+  return ret;
+}
+
+
+#define AORS(aors_n, aors_1)                                    \
+  {                                                             \
+    mp_limb_t  c;                                               \
+    ASSERT (s1size >= s2size);                                  \
+    ASSERT (s2size >= 1);                                       \
+    c = aors_n (rp, s1p, s2p, s2size);                          \
+    if (s1size-s2size != 0)                                     \
+      c = aors_1 (rp+s2size, s1p+s2size, s1size-s2size, c);     \
+    return c;                                                   \
+  }
+mp_limb_t
+refmpn_add (mp_ptr rp,
+	    mp_srcptr s1p, mp_size_t s1size,
+	    mp_srcptr s2p, mp_size_t s2size)
+{
+  AORS (refmpn_add_n, refmpn_add_1);
+}
+mp_limb_t
+refmpn_sub (mp_ptr rp,
+	    mp_srcptr s1p, mp_size_t s1size,
+	    mp_srcptr s2p, mp_size_t s2size)
+{
+  AORS (refmpn_sub_n, refmpn_sub_1);
+}
+
+
+#define SHIFTHIGH(x) ((x) << GMP_LIMB_BITS/2)
+#define SHIFTLOW(x)  ((x) >> GMP_LIMB_BITS/2)
+
+#define LOWMASK   (((mp_limb_t) 1 << GMP_LIMB_BITS/2)-1)
+#define HIGHMASK  SHIFTHIGH(LOWMASK)
+
+#define LOWPART(x)   ((x) & LOWMASK)
+#define HIGHPART(x)  SHIFTLOW((x) & HIGHMASK)
+
+/* Set *lo to the low limb of x*y and return the high limb, using full
+   limbs not nails. */
+mp_limb_t
+refmpn_umul_ppmm (mp_limb_t *lo, mp_limb_t x, mp_limb_t y)
+{
+  mp_limb_t  hi, s;
+
+  *lo = LOWPART(x) * LOWPART(y);
+  hi = HIGHPART(x) * HIGHPART(y);
+
+  s = LOWPART(x) * HIGHPART(y);
+  hi += HIGHPART(s);
+  s = SHIFTHIGH(LOWPART(s));
+  *lo += s;
+  hi += (*lo < s);
+
+  s = HIGHPART(x) * LOWPART(y);
+  hi += HIGHPART(s);
+  s = SHIFTHIGH(LOWPART(s));
+  *lo += s;
+  hi += (*lo < s);
+
+  return hi;
+}
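+
+/* The routine above is schoolbook multiplication on half limbs: writing
+   B = 2^(GMP_LIMB_BITS/2), x = xh*B + xl and y = yh*B + yl,
+
+     x*y = xh*yh*B^2 + (xh*yl + xl*yh)*B + xl*yl
+
+   so xl*yl starts off *lo, xh*yh starts off the returned high limb, and
+   each cross product is split in half, its low half added into the top of
+   *lo and its high half (plus any carry out of *lo) added into the high
+   limb.  */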
+
+mp_limb_t
+refmpn_umul_ppmm_r (mp_limb_t x, mp_limb_t y, mp_limb_t *lo)
+{
+  return refmpn_umul_ppmm (lo, x, y);
+}
+
+mp_limb_t
+refmpn_mul_1c (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_limb_t multiplier,
+	       mp_limb_t carry)
+{
+  mp_size_t  i;
+  mp_limb_t  hi, lo;
+
+  ASSERT (refmpn_overlap_low_to_high_p (rp, sp, size));
+  ASSERT (size >= 1);
+  ASSERT_MPN (sp, size);
+  ASSERT_LIMB (multiplier);
+  ASSERT_LIMB (carry);
+
+  multiplier <<= GMP_NAIL_BITS;
+  for (i = 0; i < size; i++)
+    {
+      hi = refmpn_umul_ppmm (&lo, sp[i], multiplier);
+      lo >>= GMP_NAIL_BITS;
+      ASSERT_NOCARRY (ref_addc_limb (&hi, hi, ref_addc_limb (&lo, lo, carry)));
+      rp[i] = lo;
+      carry = hi;
+    }
+  return carry;
+}
+
+mp_limb_t
+refmpn_mul_1 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_limb_t multiplier)
+{
+  return refmpn_mul_1c (rp, sp, size, multiplier, CNST_LIMB(0));
+}
+
+
+mp_limb_t
+refmpn_mul_N (mp_ptr dst, mp_srcptr src, mp_size_t size,
+	      mp_srcptr mult, mp_size_t msize)
+{
+  mp_ptr     src_copy;
+  mp_limb_t  ret;
+  mp_size_t  i;
+
+  ASSERT (refmpn_overlap_fullonly_p (dst, src, size));
+  ASSERT (! refmpn_overlap_p (dst, size+msize-1, mult, msize));
+  ASSERT (size >= msize);
+  ASSERT_MPN (mult, msize);
+
+  /* in case dst==src */
+  src_copy = refmpn_malloc_limbs (size);
+  refmpn_copyi (src_copy, src, size);
+  src = src_copy;
+
+  dst[size] = refmpn_mul_1 (dst, src, size, mult[0]);
+  for (i = 1; i < msize-1; i++)
+    dst[size+i] = refmpn_addmul_1 (dst+i, src, size, mult[i]);
+  ret = refmpn_addmul_1 (dst+i, src, size, mult[i]);
+
+  free (src_copy);
+  return ret;
+}
+
+mp_limb_t
+refmpn_mul_2 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_srcptr mult)
+{
+  return refmpn_mul_N (rp, sp, size, mult, (mp_size_t) 2);
+}
+mp_limb_t
+refmpn_mul_3 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_srcptr mult)
+{
+  return refmpn_mul_N (rp, sp, size, mult, (mp_size_t) 3);
+}
+mp_limb_t
+refmpn_mul_4 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_srcptr mult)
+{
+  return refmpn_mul_N (rp, sp, size, mult, (mp_size_t) 4);
+}
+mp_limb_t
+refmpn_mul_5 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_srcptr mult)
+{
+  return refmpn_mul_N (rp, sp, size, mult, (mp_size_t) 5);
+}
+mp_limb_t
+refmpn_mul_6 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_srcptr mult)
+{
+  return refmpn_mul_N (rp, sp, size, mult, (mp_size_t) 6);
+}
+
+#define AORSMUL_1C(operation_n)                                 \
+  {                                                             \
+    mp_ptr     p;                                               \
+    mp_limb_t  ret;                                             \
+								\
+    ASSERT (refmpn_overlap_fullonly_p (rp, sp, size));          \
+								\
+    p = refmpn_malloc_limbs (size);                             \
+    ret = refmpn_mul_1c (p, sp, size, multiplier, carry);       \
+    ret += operation_n (rp, rp, p, size);                       \
+								\
+    free (p);                                                   \
+    return ret;                                                 \
+  }
+
+mp_limb_t
+refmpn_addmul_1c (mp_ptr rp, mp_srcptr sp, mp_size_t size,
+		  mp_limb_t multiplier, mp_limb_t carry)
+{
+  AORSMUL_1C (refmpn_add_n);
+}
+mp_limb_t
+refmpn_submul_1c (mp_ptr rp, mp_srcptr sp, mp_size_t size,
+		  mp_limb_t multiplier, mp_limb_t carry)
+{
+  AORSMUL_1C (refmpn_sub_n);
+}
+
+
+mp_limb_t
+refmpn_addmul_1 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_limb_t multiplier)
+{
+  return refmpn_addmul_1c (rp, sp, size, multiplier, CNST_LIMB(0));
+}
+mp_limb_t
+refmpn_submul_1 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_limb_t multiplier)
+{
+  return refmpn_submul_1c (rp, sp, size, multiplier, CNST_LIMB(0));
+}
+
+
+mp_limb_t
+refmpn_addmul_N (mp_ptr dst, mp_srcptr src, mp_size_t size,
+		 mp_srcptr mult, mp_size_t msize)
+{
+  mp_ptr     src_copy;
+  mp_limb_t  ret;
+  mp_size_t  i;
+
+  ASSERT (dst == src || ! refmpn_overlap_p (dst, size+msize-1, src, size));
+  ASSERT (! refmpn_overlap_p (dst, size+msize-1, mult, msize));
+  ASSERT (size >= msize);
+  ASSERT_MPN (mult, msize);
+
+  /* in case dst==src */
+  src_copy = refmpn_malloc_limbs (size);
+  refmpn_copyi (src_copy, src, size);
+  src = src_copy;
+
+  for (i = 0; i < msize-1; i++)
+    dst[size+i] = refmpn_addmul_1 (dst+i, src, size, mult[i]);
+  ret = refmpn_addmul_1 (dst+i, src, size, mult[i]);
+
+  free (src_copy);
+  return ret;
+}
+
+mp_limb_t
+refmpn_addmul_2 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_srcptr mult)
+{
+  return refmpn_addmul_N (rp, sp, size, mult, (mp_size_t) 2);
+}
+mp_limb_t
+refmpn_addmul_3 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_srcptr mult)
+{
+  return refmpn_addmul_N (rp, sp, size, mult, (mp_size_t) 3);
+}
+mp_limb_t
+refmpn_addmul_4 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_srcptr mult)
+{
+  return refmpn_addmul_N (rp, sp, size, mult, (mp_size_t) 4);
+}
+mp_limb_t
+refmpn_addmul_5 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_srcptr mult)
+{
+  return refmpn_addmul_N (rp, sp, size, mult, (mp_size_t) 5);
+}
+mp_limb_t
+refmpn_addmul_6 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_srcptr mult)
+{
+  return refmpn_addmul_N (rp, sp, size, mult, (mp_size_t) 6);
+}
+mp_limb_t
+refmpn_addmul_7 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_srcptr mult)
+{
+  return refmpn_addmul_N (rp, sp, size, mult, (mp_size_t) 7);
+}
+mp_limb_t
+refmpn_addmul_8 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_srcptr mult)
+{
+  return refmpn_addmul_N (rp, sp, size, mult, (mp_size_t) 8);
+}
+
+mp_limb_t
+refmpn_add_n_sub_nc (mp_ptr r1p, mp_ptr r2p,
+		  mp_srcptr s1p, mp_srcptr s2p, mp_size_t size,
+		  mp_limb_t carry)
+{
+  mp_ptr p;
+  mp_limb_t acy, scy;
+
+  /* Destinations can't overlap. */
+  ASSERT (! refmpn_overlap_p (r1p, size, r2p, size));
+  ASSERT (refmpn_overlap_fullonly_two_p (r1p, s1p, s2p, size));
+  ASSERT (refmpn_overlap_fullonly_two_p (r2p, s1p, s2p, size));
+  ASSERT (size >= 1);
+
+  /* in case r1p==s1p or r1p==s2p */
+  p = refmpn_malloc_limbs (size);
+
+  acy = refmpn_add_nc (p, s1p, s2p, size, carry >> 1);
+  scy = refmpn_sub_nc (r2p, s1p, s2p, size, carry & 1);
+  refmpn_copyi (r1p, p, size);
+
+  free (p);
+  return 2 * acy + scy;
+}
+
+mp_limb_t
+refmpn_add_n_sub_n (mp_ptr r1p, mp_ptr r2p,
+		 mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  return refmpn_add_n_sub_nc (r1p, r2p, s1p, s2p, size, CNST_LIMB(0));
+}
+
+
+/* Right shift hi,lo and return the low limb of the result.
+   Note a shift by GMP_LIMB_BITS isn't assumed to work (doesn't on x86). */
+mp_limb_t
+rshift_make (mp_limb_t hi, mp_limb_t lo, unsigned shift)
+{
+  ASSERT (shift < GMP_NUMB_BITS);
+  if (shift == 0)
+    return lo;
+  else
+    return ((hi << (GMP_NUMB_BITS-shift)) | (lo >> shift)) & GMP_NUMB_MASK;
+}
+
+/* Left shift hi,lo and return the high limb of the result.
+   Note a shift by GMP_LIMB_BITS isn't assumed to work (doesn't on x86). */
+mp_limb_t
+lshift_make (mp_limb_t hi, mp_limb_t lo, unsigned shift)
+{
+  ASSERT (shift < GMP_NUMB_BITS);
+  if (shift == 0)
+    return hi;
+  else
+    return ((hi << shift) | (lo >> (GMP_NUMB_BITS-shift))) & GMP_NUMB_MASK;
+}
+
+
+mp_limb_t
+refmpn_rshift (mp_ptr rp, mp_srcptr sp, mp_size_t size, unsigned shift)
+{
+  mp_limb_t  ret;
+  mp_size_t  i;
+
+  ASSERT (refmpn_overlap_low_to_high_p (rp, sp, size));
+  ASSERT (size >= 1);
+  ASSERT (shift >= 1 && shift < GMP_NUMB_BITS);
+  ASSERT_MPN (sp, size);
+
+  ret = rshift_make (sp[0], CNST_LIMB(0), shift);
+
+  for (i = 0; i < size-1; i++)
+    rp[i] = rshift_make (sp[i+1], sp[i], shift);
+
+  rp[i] = rshift_make (CNST_LIMB(0), sp[i], shift);
+  return ret;
+}
+
+mp_limb_t
+refmpn_lshift (mp_ptr rp, mp_srcptr sp, mp_size_t size, unsigned shift)
+{
+  mp_limb_t  ret;
+  mp_size_t  i;
+
+  ASSERT (refmpn_overlap_high_to_low_p (rp, sp, size));
+  ASSERT (size >= 1);
+  ASSERT (shift >= 1 && shift < GMP_NUMB_BITS);
+  ASSERT_MPN (sp, size);
+
+  ret = lshift_make (CNST_LIMB(0), sp[size-1], shift);
+
+  for (i = size-2; i >= 0; i--)
+    rp[i+1] = lshift_make (sp[i+1], sp[i], shift);
+
+  rp[i+1] = lshift_make (sp[i+1], CNST_LIMB(0), shift);
+  return ret;
+}
+
+void
+refmpn_com (mp_ptr rp, mp_srcptr sp, mp_size_t size)
+{
+  mp_size_t i;
+
+  /* We work downwards since mpn_lshiftc needs that. */
+  ASSERT (refmpn_overlap_high_to_low_p (rp, sp, size));
+
+  for (i = size - 1; i >= 0; i--)
+    rp[i] = (~sp[i]) & GMP_NUMB_MASK;
+}
+
+mp_limb_t
+refmpn_lshiftc (mp_ptr rp, mp_srcptr sp, mp_size_t size, unsigned shift)
+{
+  mp_limb_t res;
+
+  /* No asserts here, refmpn_lshift will assert what we need. */
+
+  res = refmpn_lshift (rp, sp, size, shift);
+  refmpn_com (rp, rp, size);
+  return res;
+}
+
+/* accepting shift==0 and doing a plain copyi or copyd in that case */
+mp_limb_t
+refmpn_rshift_or_copy (mp_ptr rp, mp_srcptr sp, mp_size_t size, unsigned shift)
+{
+  if (shift == 0)
+    {
+      refmpn_copyi (rp, sp, size);
+      return 0;
+    }
+  else
+    {
+      return refmpn_rshift (rp, sp, size, shift);
+    }
+}
+mp_limb_t
+refmpn_lshift_or_copy (mp_ptr rp, mp_srcptr sp, mp_size_t size, unsigned shift)
+{
+  if (shift == 0)
+    {
+      refmpn_copyd (rp, sp, size);
+      return 0;
+    }
+  else
+    {
+      return refmpn_lshift (rp, sp, size, shift);
+    }
+}
+
+/* accepting size==0 too */
+mp_limb_t
+refmpn_rshift_or_copy_any (mp_ptr rp, mp_srcptr sp, mp_size_t size,
+			   unsigned shift)
+{
+  return (size == 0 ? 0 : refmpn_rshift_or_copy (rp, sp, size, shift));
+}
+mp_limb_t
+refmpn_lshift_or_copy_any (mp_ptr rp, mp_srcptr sp, mp_size_t size,
+			   unsigned shift)
+{
+  return (size == 0 ? 0 : refmpn_lshift_or_copy (rp, sp, size, shift));
+}
+
+/* Divide h,l by d, return quotient, store remainder to *rp.
+   Operates on full limbs, not nails.
+   Must have h < d.
+   __udiv_qrnnd_c isn't simple, and it's a bit slow, but it works. */
+mp_limb_t
+refmpn_udiv_qrnnd (mp_limb_t *rp, mp_limb_t h, mp_limb_t l, mp_limb_t d)
+{
+  mp_limb_t  q, r;
+  int  n;
+
+  ASSERT (d != 0);
+  ASSERT (h < d);
+
+#if 0
+  udiv_qrnnd (q, r, h, l, d);
+  *rp = r;
+  return q;
+#endif
+
+  n = refmpn_count_leading_zeros (d);
+  d <<= n;
+
+  if (n != 0)
+    {
+      h = (h << n) | (l >> (GMP_LIMB_BITS - n));
+      l <<= n;
+    }
+
+  __udiv_qrnnd_c (q, r, h, l, d);
+  r >>= n;
+  *rp = r;
+  return q;
+}
+
+mp_limb_t
+refmpn_udiv_qrnnd_r (mp_limb_t h, mp_limb_t l, mp_limb_t d, mp_limb_t *rp)
+{
+  return refmpn_udiv_qrnnd (rp, h, l, d);
+}
+
+/* This little subroutine avoids some bad code generation from i386 gcc 3.0
+   -fPIC -O2 -fomit-frame-pointer (%ebp being used uninitialized).  */
+static mp_limb_t
+refmpn_divmod_1c_workaround (mp_ptr rp, mp_srcptr sp, mp_size_t size,
+			     mp_limb_t divisor, mp_limb_t carry)
+{
+  mp_size_t  i;
+  mp_limb_t rem[1];
+  for (i = size-1; i >= 0; i--)
+    {
+      rp[i] = refmpn_udiv_qrnnd (rem, carry,
+				 sp[i] << GMP_NAIL_BITS,
+				 divisor << GMP_NAIL_BITS);
+      carry = *rem >> GMP_NAIL_BITS;
+    }
+  return carry;
+}
+
+mp_limb_t
+refmpn_divmod_1c (mp_ptr rp, mp_srcptr sp, mp_size_t size,
+		  mp_limb_t divisor, mp_limb_t carry)
+{
+  mp_ptr     sp_orig;
+  mp_ptr     prod;
+  mp_limb_t  carry_orig;
+
+  ASSERT (refmpn_overlap_fullonly_p (rp, sp, size));
+  ASSERT (size >= 0);
+  ASSERT (carry < divisor);
+  ASSERT_MPN (sp, size);
+  ASSERT_LIMB (divisor);
+  ASSERT_LIMB (carry);
+
+  if (size == 0)
+    return carry;
+
+  sp_orig = refmpn_memdup_limbs (sp, size);
+  prod = refmpn_malloc_limbs (size);
+  carry_orig = carry;
+
+  carry = refmpn_divmod_1c_workaround (rp, sp, size, divisor, carry);
+
+  /* check by multiplying back */
+#if 0
+  printf ("size=%ld divisor=0x%lX carry=0x%lX remainder=0x%lX\n",
+	  size, divisor, carry_orig, carry);
+  mpn_trace("s",sp_copy,size);
+  mpn_trace("r",rp,size);
+  printf ("mul_1c %lX\n", refmpn_mul_1c (prod, rp, size, divisor, carry));
+  mpn_trace("p",prod,size);
+#endif
+  ASSERT (refmpn_mul_1c (prod, rp, size, divisor, carry) == carry_orig);
+  ASSERT (refmpn_cmp (prod, sp_orig, size) == 0);
+  free (sp_orig);
+  free (prod);
+
+  return carry;
+}
+
+mp_limb_t
+refmpn_divmod_1 (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_limb_t divisor)
+{
+  return refmpn_divmod_1c (rp, sp, size, divisor, CNST_LIMB(0));
+}
+
+
+mp_limb_t
+refmpn_mod_1c (mp_srcptr sp, mp_size_t size, mp_limb_t divisor,
+	       mp_limb_t carry)
+{
+  mp_ptr  p = refmpn_malloc_limbs (size);
+  carry = refmpn_divmod_1c (p, sp, size, divisor, carry);
+  free (p);
+  return carry;
+}
+
+mp_limb_t
+refmpn_mod_1 (mp_srcptr sp, mp_size_t size, mp_limb_t divisor)
+{
+  return refmpn_mod_1c (sp, size, divisor, CNST_LIMB(0));
+}
+
+mp_limb_t
+refmpn_preinv_mod_1 (mp_srcptr sp, mp_size_t size, mp_limb_t divisor,
+		     mp_limb_t inverse)
+{
+  ASSERT (divisor & GMP_NUMB_HIGHBIT);
+  ASSERT (inverse == refmpn_invert_limb (divisor));
+  return refmpn_mod_1 (sp, size, divisor);
+}
+
+/* This implementation will be rather slow, but has the advantage of being
+   in a different style than the libgmp versions.  */
+mp_limb_t
+refmpn_mod_34lsub1 (mp_srcptr p, mp_size_t n)
+{
+  ASSERT ((GMP_NUMB_BITS % 4) == 0);
+  return mpn_mod_1 (p, n, (CNST_LIMB(1) << (3 * GMP_NUMB_BITS / 4)) - 1);
+}
+
+
+mp_limb_t
+refmpn_divrem_1c (mp_ptr rp, mp_size_t xsize,
+		  mp_srcptr sp, mp_size_t size, mp_limb_t divisor,
+		  mp_limb_t carry)
+{
+  mp_ptr  z;
+
+  z = refmpn_malloc_limbs (xsize);
+  refmpn_fill (z, xsize, CNST_LIMB(0));
+
+  carry = refmpn_divmod_1c (rp+xsize, sp, size, divisor, carry);
+  carry = refmpn_divmod_1c (rp, z, xsize, divisor, carry);
+
+  free (z);
+  return carry;
+}
+
+mp_limb_t
+refmpn_divrem_1 (mp_ptr rp, mp_size_t xsize,
+		 mp_srcptr sp, mp_size_t size, mp_limb_t divisor)
+{
+  return refmpn_divrem_1c (rp, xsize, sp, size, divisor, CNST_LIMB(0));
+}
+
+mp_limb_t
+refmpn_preinv_divrem_1 (mp_ptr rp, mp_size_t xsize,
+			mp_srcptr sp, mp_size_t size,
+			mp_limb_t divisor, mp_limb_t inverse, unsigned shift)
+{
+  ASSERT (size >= 0);
+  ASSERT (shift == refmpn_count_leading_zeros (divisor));
+  ASSERT (inverse == refmpn_invert_limb (divisor << shift));
+
+  return refmpn_divrem_1 (rp, xsize, sp, size, divisor);
+}
+
+mp_limb_t
+refmpn_divrem_2 (mp_ptr qp, mp_size_t qxn,
+		 mp_ptr np, mp_size_t nn,
+		 mp_srcptr dp)
+{
+  mp_ptr tp;
+  mp_limb_t qh;
+
+  tp = refmpn_malloc_limbs (nn + qxn);
+  refmpn_zero (tp, qxn);
+  refmpn_copyi (tp + qxn, np, nn);
+  qh = refmpn_sb_div_qr (qp, tp, nn + qxn, dp, 2);
+  refmpn_copyi (np, tp, 2);
+  free (tp);
+  return qh;
+}
+
+/* Inverse is floor((b*(b-d)-1) / d), per division by invariant integers
+   paper, figure 8.1 m', where b=2^GMP_LIMB_BITS.  Note that -d-1 < d
+   since d has the high bit set. */
+
+mp_limb_t
+refmpn_invert_limb (mp_limb_t d)
+{
+  mp_limb_t r;
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+  return refmpn_udiv_qrnnd (&r, -d-1, MP_LIMB_T_MAX, d);
+}
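+
+/* The udiv_qrnnd call above divides the two-limb value (b-d-1)*b + (b-1)
+   by d, and (b-d-1)*b + (b-1) = b*b - d*b - 1 = b*(b-d) - 1, which is
+   exactly the numerator in the formula quoted above, so the quotient is
+   the desired inverse.  */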
+
+void
+refmpn_invert (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr scratch)
+{
+  mp_ptr qp, tp;
+  TMP_DECL;
+  TMP_MARK;
+
+  tp = TMP_ALLOC_LIMBS (2 * n);
+  qp = TMP_ALLOC_LIMBS (n + 1);
+
+  MPN_ZERO (tp, 2 * n);  mpn_sub_1 (tp, tp, 2 * n, 1);
+
+  refmpn_tdiv_qr (qp, rp, 0, tp, 2 * n, up, n);
+  refmpn_copyi (rp, qp, n);
+
+  TMP_FREE;
+}
+
+void
+refmpn_binvert (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr scratch)
+{
+  mp_ptr tp;
+  mp_limb_t binv;
+  TMP_DECL;
+  TMP_MARK;
+
+  /* We use the library mpn_sbpi1_bdiv_q here, which isn't kosher in testing
+     code.  To make up for it, we check that the inverse is correct using a
+     multiply.  */
+
+  tp = TMP_ALLOC_LIMBS (2 * n);
+
+  MPN_ZERO (tp, n);
+  tp[0] = 1;
+  binvert_limb (binv, up[0]);
+  mpn_sbpi1_bdiv_q (rp, tp, n, up, n, -binv);
+
+  refmpn_mul_n (tp, rp, up, n);
+  ASSERT_ALWAYS (tp[0] == 1 && mpn_zero_p (tp + 1, n - 1));
+
+  TMP_FREE;
+}
+
+/* The aim is to produce a dst quotient and return a remainder c, satisfying
+   c*b^n + src-i == 3*dst, where i is the incoming carry.
+
+   Some value c==0, c==1 or c==2 will satisfy, so just try each.
+
+   If GMP_NUMB_BITS is even then 2^GMP_NUMB_BITS == 1 mod 3 and a non-zero
+   remainder from the first division attempt determines the correct
+   remainder (3-c), but don't bother with that, since we can't guarantee
+   anything about GMP_NUMB_BITS when using nails.
+
+   If the initial src-i produces a borrow then refmpn_sub_1 leaves a twos
+   complement negative, i.e. b^n+src-i, and the calculation produces c1
+   satisfying c1*b^n + b^n+src-i == 3*dst, from which clearly c=c1+1.  This
+   means it's enough to just add any borrow back at the end.
+
+   A borrow only occurs when src==0 or src==1, and, by the same reasoning as
+   mpn/generic/diveby3.c, the c1 that results in those cases will only be 0
+   or 1 respectively, so with 1 added the final return value is still in the
+   prescribed range 0 to 2. */
+
+mp_limb_t
+refmpn_divexact_by3c (mp_ptr rp, mp_srcptr sp, mp_size_t size, mp_limb_t carry)
+{
+  mp_ptr     spcopy;
+  mp_limb_t  c, cs;
+
+  ASSERT (refmpn_overlap_fullonly_p (rp, sp, size));
+  ASSERT (size >= 1);
+  ASSERT (carry <= 2);
+  ASSERT_MPN (sp, size);
+
+  spcopy = refmpn_malloc_limbs (size);
+  cs = refmpn_sub_1 (spcopy, sp, size, carry);
+
+  for (c = 0; c <= 2; c++)
+    if (refmpn_divmod_1c (rp, spcopy, size, CNST_LIMB(3), c) == 0)
+      goto done;
+  ASSERT_FAIL (no value of c satisfies);
+
+ done:
+  c += cs;
+  ASSERT (c <= 2);
+
+  free (spcopy);
+  return c;
+}
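+
+/* Small worked example of the relation above, assuming GMP_NUMB_BITS == 64
+   so b == 2^64 == 1 mod 3: for src == {21}, size == 1 and incoming carry
+   i == 1, spcopy becomes 20 with no borrow; c == 0 fails since
+   20 == 2 mod 3, but c == 1 divides exactly, giving dst == (2^64 + 20)/3
+   and return value 1, which satisfies 1*b^n + 21 - 1 == 3*dst.  */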
+
+mp_limb_t
+refmpn_divexact_by3 (mp_ptr rp, mp_srcptr sp, mp_size_t size)
+{
+  return refmpn_divexact_by3c (rp, sp, size, CNST_LIMB(0));
+}
+
+
+/* The same as mpn/generic/mul_basecase.c, but using refmpn functions. */
+void
+refmpn_mul_basecase (mp_ptr prodp,
+		     mp_srcptr up, mp_size_t usize,
+		     mp_srcptr vp, mp_size_t vsize)
+{
+  mp_size_t i;
+
+  ASSERT (! refmpn_overlap_p (prodp, usize+vsize, up, usize));
+  ASSERT (! refmpn_overlap_p (prodp, usize+vsize, vp, vsize));
+  ASSERT (usize >= vsize);
+  ASSERT (vsize >= 1);
+  ASSERT_MPN (up, usize);
+  ASSERT_MPN (vp, vsize);
+
+  prodp[usize] = refmpn_mul_1 (prodp, up, usize, vp[0]);
+  for (i = 1; i < vsize; i++)
+    prodp[usize+i] = refmpn_addmul_1 (prodp+i, up, usize, vp[i]);
+}
+
+
+/* The same as mpn/generic/mulmid_basecase.c, but using refmpn functions. */
+void
+refmpn_mulmid_basecase (mp_ptr rp,
+			mp_srcptr up, mp_size_t un,
+			mp_srcptr vp, mp_size_t vn)
+{
+  mp_limb_t cy;
+  mp_size_t i;
+
+  ASSERT (un >= vn);
+  ASSERT (vn >= 1);
+  ASSERT (! refmpn_overlap_p (rp, un - vn + 3, up, un));
+  ASSERT (! refmpn_overlap_p (rp, un - vn + 3, vp, vn));
+  ASSERT_MPN (up, un);
+  ASSERT_MPN (vp, vn);
+
+  rp[un - vn + 1] = refmpn_mul_1 (rp, up + vn - 1, un - vn + 1, vp[0]);
+  rp[un - vn + 2] = CNST_LIMB (0);
+  for (i = 1; i < vn; i++)
+    {
+      cy = refmpn_addmul_1 (rp, up + vn - i - 1, un - vn + 1, vp[i]);
+      cy = ref_addc_limb (&rp[un - vn + 1], rp[un - vn + 1], cy);
+      cy = ref_addc_limb (&rp[un - vn + 2], rp[un - vn + 2], cy);
+      ASSERT (cy == 0);
+    }
+}
+
+void
+refmpn_toom42_mulmid (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n,
+		      mp_ptr scratch)
+{
+  refmpn_mulmid_basecase (rp, up, 2*n - 1, vp, n);
+}
+
+void
+refmpn_mulmid_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  /* FIXME: this could be made faster by using refmpn_mul and then subtracting
+     off products near the middle product region boundary */
+  refmpn_mulmid_basecase (rp, up, 2*n - 1, vp, n);
+}
+
+void
+refmpn_mulmid (mp_ptr rp, mp_srcptr up, mp_size_t un,
+	       mp_srcptr vp, mp_size_t vn)
+{
+  /* FIXME: this could be made faster by using refmpn_mul and then subtracting
+     off products near the middle product region boundary */
+  refmpn_mulmid_basecase (rp, up, un, vp, vn);
+}
+
+
+
+#define TOOM3_THRESHOLD (MAX (MUL_TOOM33_THRESHOLD, SQR_TOOM3_THRESHOLD))
+#define TOOM4_THRESHOLD (MAX (MUL_TOOM44_THRESHOLD, SQR_TOOM4_THRESHOLD))
+#define TOOM6_THRESHOLD (MAX (MUL_TOOM6H_THRESHOLD, SQR_TOOM6_THRESHOLD))
+#if WANT_FFT
+#define FFT_THRESHOLD (MAX (MUL_FFT_THRESHOLD, SQR_FFT_THRESHOLD))
+#else
+#define FFT_THRESHOLD MP_SIZE_T_MAX /* don't use toom44 here */
+#endif
+
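+/* Compute {wp, un+vn} = {up,un} * {vp,vn}.  Small products go straight to
+   refmpn_mul_basecase; larger ones slice u into vn-limb chunks, multiply
+   each chunk by v with a balanced toom routine chosen from the thresholds
+   above, and accumulate the partial products at the proper offset, with a
+   final short chunk handled by a recursive call with the operands
+   swapped.  */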
+void
+refmpn_mul (mp_ptr wp, mp_srcptr up, mp_size_t un, mp_srcptr vp, mp_size_t vn)
+{
+  mp_ptr tp, rp;
+  mp_size_t tn;
+
+  if (vn < TOOM3_THRESHOLD)
+    {
+      /* In the mpn_mul_basecase and toom2 ranges, use our own mul_basecase. */
+      if (vn != 0)
+	refmpn_mul_basecase (wp, up, un, vp, vn);
+      else
+	MPN_ZERO (wp, un);
+      return;
+    }
+
+  MPN_ZERO (wp, vn);
+  rp = refmpn_malloc_limbs (2 * vn);
+
+  if (vn < TOOM4_THRESHOLD)
+    tn = mpn_toom22_mul_itch (vn, vn);
+  else if (vn < TOOM6_THRESHOLD)
+    tn = mpn_toom33_mul_itch (vn, vn);
+  else if (vn < FFT_THRESHOLD)
+    tn = mpn_toom44_mul_itch (vn, vn);
+  else
+    tn = mpn_toom6h_mul_itch (vn, vn);
+  tp = refmpn_malloc_limbs (tn);
+
+  while (un >= vn)
+    {
+      if (vn < TOOM4_THRESHOLD)
+	/* In the toom3 range, use mpn_toom22_mul.  */
+	mpn_toom22_mul (rp, up, vn, vp, vn, tp);
+      else if (vn < TOOM6_THRESHOLD)
+	/* In the toom4 range, use mpn_toom33_mul.  */
+	mpn_toom33_mul (rp, up, vn, vp, vn, tp);
+      else if (vn < FFT_THRESHOLD)
+	/* In the toom6 range, use mpn_toom44_mul.  */
+	mpn_toom44_mul (rp, up, vn, vp, vn, tp);
+      else
+	/* For the largest operands, use mpn_toom6h_mul.  */
+	mpn_toom6h_mul (rp, up, vn, vp, vn, tp);
+
+      ASSERT_NOCARRY (refmpn_add (wp, rp, 2 * vn, wp, vn));
+      wp += vn;
+
+      up += vn;
+      un -= vn;
+    }
+
+  free (tp);
+
+  if (un != 0)
+    {
+      refmpn_mul (rp, vp, vn, up, un);
+      ASSERT_NOCARRY (refmpn_add (wp, rp, un + vn, wp, vn));
+    }
+  free (rp);
+}
+
+void
+refmpn_mul_n (mp_ptr prodp, mp_srcptr up, mp_srcptr vp, mp_size_t size)
+{
+  refmpn_mul (prodp, up, size, vp, size);
+}
+
+void
+refmpn_mullo_n (mp_ptr prodp, mp_srcptr up, mp_srcptr vp, mp_size_t size)
+{
+  mp_ptr tp = refmpn_malloc_limbs (2*size);
+  refmpn_mul (tp, up, size, vp, size);
+  refmpn_copyi (prodp, tp, size);
+  free (tp);
+}
+
+void
+refmpn_sqr (mp_ptr dst, mp_srcptr src, mp_size_t size)
+{
+  refmpn_mul (dst, src, size, src, size);
+}
+
+void
+refmpn_sqrlo (mp_ptr dst, mp_srcptr src, mp_size_t size)
+{
+  refmpn_mullo_n (dst, src, src, size);
+}
+
+/* Allowing usize<vsize, usize==0 or vsize==0. */
+void
+refmpn_mul_any (mp_ptr prodp,
+		     mp_srcptr up, mp_size_t usize,
+		     mp_srcptr vp, mp_size_t vsize)
+{
+  ASSERT (! refmpn_overlap_p (prodp, usize+vsize, up, usize));
+  ASSERT (! refmpn_overlap_p (prodp, usize+vsize, vp, vsize));
+  ASSERT (usize >= 0);
+  ASSERT (vsize >= 0);
+  ASSERT_MPN (up, usize);
+  ASSERT_MPN (vp, vsize);
+
+  if (usize == 0)
+    {
+      refmpn_fill (prodp, vsize, CNST_LIMB(0));
+      return;
+    }
+
+  if (vsize == 0)
+    {
+      refmpn_fill (prodp, usize, CNST_LIMB(0));
+      return;
+    }
+
+  if (usize >= vsize)
+    refmpn_mul (prodp, up, usize, vp, vsize);
+  else
+    refmpn_mul (prodp, vp, vsize, up, usize);
+}
+
+
+mp_limb_t
+refmpn_gcd_11 (mp_limb_t x, mp_limb_t y)
+{
+  /* The non-ref function also requires its input operands to be odd, but
+     refmpn_gcd_1 below doesn't guarantee that. */
+  ASSERT (x > 0);
+  ASSERT (y > 0);
+  do
+    {
+      while ((x & 1) == 0)  x >>= 1;
+      while ((y & 1) == 0)  y >>= 1;
+
+      if (x < y)
+	MP_LIMB_T_SWAP (x, y);
+
+      x -= y;
+    }
+  while (x != 0);
+
+  return y;
+}
+
+mp_double_limb_t
+refmpn_gcd_22 (mp_limb_t x1, mp_limb_t x0, mp_limb_t y1, mp_limb_t y0)
+{
+  ASSERT ((x0 & 1) != 0);
+  ASSERT ((y0 & 1) != 0);
+  mp_double_limb_t g;
+  mp_limb_t cy;
+
+  do
+    {
+      while ((x0 & 1) == 0)
+	{
+	  x0 = (x1 << (GMP_NUMB_BITS - 1)) | (x0 >> 1);
+	  x1 >>= 1;
+	}
+      while ((y0 & 1) == 0)
+	{
+	  y0 = (y1 << (GMP_NUMB_BITS - 1)) | (y0 >> 1);
+	  y1 >>= 1;
+	}
+
+
+      if (x1 < y1 || (x1 == y1 && x0 < y0))
+	{
+	  mp_limb_t t;
+	  t = x1; x1 = y1; y1 = t;
+	  t = x0; x0 = y0; y0 = t;
+	}
+
+      cy = (x0 < y0);
+      x0 -= y0;
+      x1 -= y1 + cy;
+    }
+  while ((x1 | x0) != 0);
+
+  g.d1 = y1;
+  g.d0 = y0;
+  return g;
+}
+
+mp_limb_t
+refmpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y)
+{
+  mp_limb_t  x;
+  int  twos;
+
+  ASSERT (y != 0);
+  ASSERT (! refmpn_zero_p (xp, xsize));
+  ASSERT_MPN (xp, xsize);
+  ASSERT_LIMB (y);
+
+  x = refmpn_mod_1 (xp, xsize, y);
+  if (x == 0)
+    return y;
+
+  twos = 0;
+  while ((x & 1) == 0 && (y & 1) == 0)
+    {
+      x >>= 1;
+      y >>= 1;
+      twos++;
+    }
+
+  return refmpn_gcd_11 (x, y) << twos;
+}
+
+
+/* Based on the full limb x, not nails. */
+unsigned
+refmpn_count_leading_zeros (mp_limb_t x)
+{
+  unsigned  n = 0;
+
+  ASSERT (x != 0);
+
+  while ((x & GMP_LIMB_HIGHBIT) == 0)
+    {
+      x <<= 1;
+      n++;
+    }
+  return n;
+}
+
+/* Full limbs allowed, not limited to nails. */
+unsigned
+refmpn_count_trailing_zeros (mp_limb_t x)
+{
+  unsigned  n = 0;
+
+  ASSERT (x != 0);
+  ASSERT_LIMB (x);
+
+  while ((x & 1) == 0)
+    {
+      x >>= 1;
+      n++;
+    }
+  return n;
+}
+
+/* Strip factors of two (low zero bits) from {p,size} by right shifting.
+   The return value is the number of twos stripped.  */
+mp_size_t
+refmpn_strip_twos (mp_ptr p, mp_size_t size)
+{
+  mp_size_t  limbs;
+  unsigned   shift;
+
+  ASSERT (size >= 1);
+  ASSERT (! refmpn_zero_p (p, size));
+  ASSERT_MPN (p, size);
+
+  for (limbs = 0; p[0] == 0; limbs++)
+    {
+      refmpn_copyi (p, p+1, size-1);
+      p[size-1] = 0;
+    }
+
+  shift = refmpn_count_trailing_zeros (p[0]);
+  if (shift)
+    refmpn_rshift (p, p, size, shift);
+
+  return limbs*GMP_NUMB_BITS + shift;
+}
+
+mp_limb_t
+refmpn_gcd (mp_ptr gp, mp_ptr xp, mp_size_t xsize, mp_ptr yp, mp_size_t ysize)
+{
+  int       cmp;
+
+  ASSERT (ysize >= 1);
+  ASSERT (xsize >= ysize);
+  ASSERT ((xp[0] & 1) != 0);
+  ASSERT ((yp[0] & 1) != 0);
+  /* ASSERT (xp[xsize-1] != 0); */  /* don't think x needs to be odd */
+  ASSERT (yp[ysize-1] != 0);
+  ASSERT (refmpn_overlap_fullonly_p (gp, xp, xsize));
+  ASSERT (refmpn_overlap_fullonly_p (gp, yp, ysize));
+  ASSERT (! refmpn_overlap_p (xp, xsize, yp, ysize));
+  if (xsize == ysize)
+    ASSERT (refmpn_msbone (xp[xsize-1]) >= refmpn_msbone (yp[ysize-1]));
+  ASSERT_MPN (xp, xsize);
+  ASSERT_MPN (yp, ysize);
+
+  refmpn_strip_twos (xp, xsize);
+  MPN_NORMALIZE (xp, xsize);
+  MPN_NORMALIZE (yp, ysize);
+
+  for (;;)
+    {
+      cmp = refmpn_cmp_twosizes (xp, xsize, yp, ysize);
+      if (cmp == 0)
+	break;
+      if (cmp < 0)
+	MPN_PTR_SWAP (xp,xsize, yp,ysize);
+
+      ASSERT_NOCARRY (refmpn_sub (xp, xp, xsize, yp, ysize));
+
+      refmpn_strip_twos (xp, xsize);
+      MPN_NORMALIZE (xp, xsize);
+    }
+
+  refmpn_copyi (gp, xp, xsize);
+  return xsize;
+}
+
+unsigned long
+ref_popc_limb (mp_limb_t src)
+{
+  unsigned long  count;
+  int  i;
+
+  count = 0;
+  for (i = 0; i < GMP_LIMB_BITS; i++)
+    {
+      count += (src & 1);
+      src >>= 1;
+    }
+  return count;
+}
+
+unsigned long
+refmpn_popcount (mp_srcptr sp, mp_size_t size)
+{
+  unsigned long  count = 0;
+  mp_size_t  i;
+
+  ASSERT (size >= 0);
+  ASSERT_MPN (sp, size);
+
+  for (i = 0; i < size; i++)
+    count += ref_popc_limb (sp[i]);
+  return count;
+}
+
+unsigned long
+refmpn_hamdist (mp_srcptr s1p, mp_srcptr s2p, mp_size_t size)
+{
+  mp_ptr  d;
+  unsigned long  count;
+
+  ASSERT (size >= 0);
+  ASSERT_MPN (s1p, size);
+  ASSERT_MPN (s2p, size);
+
+  if (size == 0)
+    return 0;
+
+  d = refmpn_malloc_limbs (size);
+  refmpn_xor_n (d, s1p, s2p, size);
+  count = refmpn_popcount (d, size);
+  free (d);
+  return count;
+}
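+
+/* Quick illustrative values for popcount and hamming distance (not
+   compiled): the distance is just the popcount of the xor.  */
+#if 0
+static void
+example_popcount (void)
+{
+  mp_limb_t  s[2] = { 0xF0, 1 };
+  mp_limb_t  a[1] = { 0xFF };
+  mp_limb_t  b[1] = { 0x0F };
+
+  ASSERT_ALWAYS (refmpn_popcount (s, 2) == 5);
+  ASSERT_ALWAYS (refmpn_hamdist (a, b, 1) == 4);
+}
+#endif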
+
+
+/* Set r to a mod d, where r, a and d are each 2 limbs, using plain
+   shift-and-subtract division.  */
+void
+refmpn_mod2 (mp_limb_t r[2], const mp_limb_t a[2], const mp_limb_t d[2])
+{
+  mp_limb_t  D[2];
+  int        n;
+
+  ASSERT (! refmpn_overlap_p (r, (mp_size_t) 2, d, (mp_size_t) 2));
+  ASSERT_MPN (a, 2);
+  ASSERT_MPN (d, 2);
+
+  D[1] = d[1], D[0] = d[0];
+  r[1] = a[1], r[0] = a[0];
+  n = 0;
+
+  for (;;)
+    {
+      if (D[1] & GMP_NUMB_HIGHBIT)
+	break;
+      if (refmpn_cmp (r, D, (mp_size_t) 2) <= 0)
+	break;
+      refmpn_lshift (D, D, (mp_size_t) 2, 1);
+      n++;
+      ASSERT (n <= GMP_NUMB_BITS);
+    }
+
+  while (n >= 0)
+    {
+      if (refmpn_cmp (r, D, (mp_size_t) 2) >= 0)
+	ASSERT_NOCARRY (refmpn_sub_n (r, r, D, (mp_size_t) 2));
+      refmpn_rshift (D, D, (mp_size_t) 2, 1);
+      n--;
+    }
+
+  ASSERT (refmpn_cmp (r, d, (mp_size_t) 2) < 0);
+}
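+
+/* Sketch for refmpn_mod2 (illustrative only, not compiled): the divisor is
+   shifted up until it reaches the dividend, then trial-subtracted while
+   shifting back down one bit at a time.  */
+#if 0
+static void
+example_mod2 (void)
+{
+  mp_limb_t  a[2] = { 7, 0 };
+  mp_limb_t  d[2] = { 3, 0 };
+  mp_limb_t  r[2];
+
+  refmpn_mod2 (r, a, d);
+  ASSERT_ALWAYS (r[0] == 1 && r[1] == 0);   /* 7 mod 3 == 1 */
+}
+#endif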
+
+
+
+/* Similar to the old mpn/generic/sb_divrem_mn.c, but somewhat simplified, in
+   particular the trial quotient is allowed to be 2 too big. */
+mp_limb_t
+refmpn_sb_div_qr (mp_ptr qp,
+		  mp_ptr np, mp_size_t nsize,
+		  mp_srcptr dp, mp_size_t dsize)
+{
+  mp_limb_t  retval = 0;
+  mp_size_t  i;
+  mp_limb_t  d1 = dp[dsize-1];
+  mp_ptr     np_orig = refmpn_memdup_limbs (np, nsize);
+
+  ASSERT (nsize >= dsize);
+  /* ASSERT (dsize > 2); */
+  ASSERT (dsize >= 2);
+  ASSERT (dp[dsize-1] & GMP_NUMB_HIGHBIT);
+  ASSERT (! refmpn_overlap_p (qp, nsize-dsize, np, nsize) || qp+dsize >= np);
+  ASSERT_MPN (np, nsize);
+  ASSERT_MPN (dp, dsize);
+
+  i = nsize-dsize;
+  if (refmpn_cmp (np+i, dp, dsize) >= 0)
+    {
+      ASSERT_NOCARRY (refmpn_sub_n (np+i, np+i, dp, dsize));
+      retval = 1;
+    }
+
+  for (i--; i >= 0; i--)
+    {
+      mp_limb_t  n0 = np[i+dsize];
+      mp_limb_t  n1 = np[i+dsize-1];
+      mp_limb_t  q, dummy_r;
+
+      ASSERT (n0 <= d1);
+      if (n0 == d1)
+	q = GMP_NUMB_MAX;
+      else
+	q = refmpn_udiv_qrnnd (&dummy_r, n0, n1 << GMP_NAIL_BITS,
+			       d1 << GMP_NAIL_BITS);
+
+      n0 -= refmpn_submul_1 (np+i, dp, dsize, q);
+      ASSERT (n0 == 0 || n0 == MP_LIMB_T_MAX);
+      if (n0)
+	{
+	  q--;
+	  if (! refmpn_add_n (np+i, np+i, dp, dsize))
+	    {
+	      q--;
+	      ASSERT_CARRY (refmpn_add_n (np+i, np+i, dp, dsize));
+	    }
+	}
+      np[i+dsize] = 0;
+
+      qp[i] = q;
+    }
+
+  /* remainder < divisor */
+#if 0		/* ASSERT triggers gcc 4.2.1 bug */
+  ASSERT (refmpn_cmp (np, dp, dsize) < 0);
+#endif
+
+  /* multiply back to original */
+  {
+    mp_ptr  mp = refmpn_malloc_limbs (nsize);
+
+    refmpn_mul_any (mp, qp, nsize-dsize, dp, dsize);
+    if (retval)
+      ASSERT_NOCARRY (refmpn_add_n (mp+nsize-dsize,mp+nsize-dsize, dp, dsize));
+    ASSERT_NOCARRY (refmpn_add (mp, mp, nsize, np, dsize));
+    ASSERT (refmpn_cmp (mp, np_orig, nsize) == 0);
+
+    free (mp);
+  }
+
+  free (np_orig);
+  return retval;
+}
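+
+/* A limb-size independent usage sketch for the schoolbook division above
+   (illustrative only, not compiled), with B = 2^GMP_NUMB_BITS: dividing
+   B^2 by (B/2)*B gives quotient 2 and remainder 0, and the remainder is
+   left in the low dsize limbs of np.  */
+#if 0
+static void
+example_sb_div_qr (void)
+{
+  mp_limb_t  n[3] = { 0, 0, 1 };                /* B^2 */
+  mp_limb_t  d[2] = { 0, GMP_NUMB_HIGHBIT };    /* (B/2) * B, normalized */
+  mp_limb_t  q[1];
+  mp_limb_t  qh;
+
+  qh = refmpn_sb_div_qr (q, n, 3, d, 2);
+  ASSERT_ALWAYS (qh == 0 && q[0] == 2);
+  ASSERT_ALWAYS (n[0] == 0 && n[1] == 0);
+}
+#endif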
+
+/* For a single-limb divisor use refmpn_divmod_1.  Otherwise normalize the
+   divisor so its high limb has the top bit set, divide with
+   refmpn_sb_div_qr above, then shift the remainder back down. */
+void
+refmpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
+		mp_ptr np, mp_size_t nsize,
+		mp_srcptr dp, mp_size_t dsize)
+{
+  ASSERT (qxn == 0);
+  ASSERT_MPN (np, nsize);
+  ASSERT_MPN (dp, dsize);
+  ASSERT (dsize > 0);
+  ASSERT (dp[dsize-1] != 0);
+
+  if (dsize == 1)
+    {
+      rp[0] = refmpn_divmod_1 (qp, np, nsize, dp[0]);
+      return;
+    }
+  else
+    {
+      mp_ptr  n2p = refmpn_malloc_limbs (nsize+1);
+      mp_ptr  d2p = refmpn_malloc_limbs (dsize);
+      int     norm = refmpn_count_leading_zeros (dp[dsize-1]) - GMP_NAIL_BITS;
+
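+      /* Shifting numerator and divisor left by the same "norm" amount
+	 leaves the quotient unchanged and scales the remainder by 2^norm,
+	 which the right shift below undoes.  */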
+      n2p[nsize] = refmpn_lshift_or_copy (n2p, np, nsize, norm);
+      ASSERT_NOCARRY (refmpn_lshift_or_copy (d2p, dp, dsize, norm));
+
+      refmpn_sb_div_qr (qp, n2p, nsize+1, d2p, dsize);
+      refmpn_rshift_or_copy (rp, n2p, dsize, norm);
+
+      /* ASSERT (refmpn_zero_p (n2p+dsize, nsize+1-dsize)); */
+      free (n2p);
+      free (d2p);
+    }
+}
+
+mp_limb_t
+refmpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
+{
+  mp_size_t j;
+  mp_limb_t cy;
+
+  ASSERT_MPN (up, 2*n);
+  /* ASSERT about directed overlap rp, up */
+  /* ASSERT about overlap rp, mp */
+  /* ASSERT about overlap up, mp */
+
+  for (j = n - 1; j >= 0; j--)
+    {
+      up[0] = refmpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK);
+      up++;
+    }
+  cy = mpn_add_n (rp, up, up - n, n);
+  return cy;
+}
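+
+/* A sketch of the arithmetic above (Montgomery's REDC), assuming invm is
+   the negated inverse of mp[0] modulo B = 2^GMP_NUMB_BITS: each pass picks
+   q = u0 * invm mod B so that u + q*m gets a zero low limb, and the addmul
+   carry stored into that freed limb is what the final add_n folds back in.
+   The result satisfies
+
+       cy * B^n + {rp,n}  ==  {up,2n} * B^(-n)   (mod {mp,n})
+
+   so a caller wanting a fully reduced value still subtracts the modulus as
+   needed.  */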
+
+size_t
+refmpn_get_str (unsigned char *dst, int base, mp_ptr src, mp_size_t size)
+{
+  unsigned char  *d;
+  size_t  dsize;
+
+  ASSERT (size >= 0);
+  ASSERT (base >= 2);
+  ASSERT (base < numberof (mp_bases));
+  ASSERT (size == 0 || src[size-1] != 0);
+  ASSERT_MPN (src, size);
+
+  MPN_SIZEINBASE (dsize, src, size, base);
+  ASSERT (dsize >= 1);
+  ASSERT (! byte_overlap_p (dst, (mp_size_t) dsize, src, size * GMP_LIMB_BYTES));
+
+  if (size == 0)
+    {
+      dst[0] = 0;
+      return 1;
+    }
+
+  /* don't clobber input for power of 2 bases */
+  if (POW2_P (base))
+    src = refmpn_memdup_limbs (src, size);
+
+  d = dst + dsize;
+  do
+    {
+      d--;
+      ASSERT (d >= dst);
+      *d = refmpn_divrem_1 (src, (mp_size_t) 0, src, size, (mp_limb_t) base);
+      size -= (src[size-1] == 0);
+    }
+  while (size != 0);
+
+  /* Move result back and decrement dsize if we didn't generate
+     the maximum possible digits.  */
+  if (d != dst)
+    {
+      size_t i;
+      dsize -= d - dst;
+      for (i = 0; i < dsize; i++)
+	dst[i] = d[i];
+    }
+
+  if (POW2_P (base))
+    free (src);
+
+  return dsize;
+}
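+
+/* Usage sketch for refmpn_get_str (illustrative only, not compiled): the
+   output is most significant digit first, as digit values rather than
+   ASCII, and for non-power-of-2 bases the source operand is clobbered.  */
+#if 0
+static void
+example_get_str (void)
+{
+  mp_limb_t      n[1] = { 1000 };
+  unsigned char  digits[8];
+  size_t         len;
+
+  len = refmpn_get_str (digits, 10, n, 1);
+  ASSERT_ALWAYS (len == 4);
+  ASSERT_ALWAYS (digits[0] == 1 && digits[1] == 0
+		 && digits[2] == 0 && digits[3] == 0);
+}
+#endif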
+
+
+mp_limb_t
+ref_bswap_limb (mp_limb_t src)
+{
+  mp_limb_t  dst;
+  int        i;
+
+  dst = 0;
+  for (i = 0; i < GMP_LIMB_BYTES; i++)
+    {
+      dst = (dst << 8) + (src & 0xFF);
+      src >>= 8;
+    }
+  return dst;
+}
+
+
+/* These random functions are mostly for transitional purposes while adding
+   nail support, since they're independent of the normal mpn routines.  They
+   can probably be removed when those normal routines are reliable, though
+   perhaps something independent would still be useful at times.  */
+
+#if GMP_LIMB_BITS == 32
+#define RAND_A  CNST_LIMB(0x29CF535)
+#endif
+#if GMP_LIMB_BITS == 64
+#define RAND_A  CNST_LIMB(0xBAECD515DAF0B49D)
+#endif
+
+mp_limb_t  refmpn_random_seed;
+
+mp_limb_t
+refmpn_random_half (void)
+{
+  refmpn_random_seed = refmpn_random_seed * RAND_A + 1;
+  return (refmpn_random_seed >> GMP_LIMB_BITS/2);
+}
+
+mp_limb_t
+refmpn_random_limb (void)
+{
+  return ((refmpn_random_half () << (GMP_LIMB_BITS/2))
+	   | refmpn_random_half ()) & GMP_NUMB_MASK;
+}
+
+void
+refmpn_random (mp_ptr ptr, mp_size_t size)
+{
+  mp_size_t  i;
+  if (GMP_NAIL_BITS == 0)
+    {
+      mpn_random (ptr, size);
+      return;
+    }
+
+  for (i = 0; i < size; i++)
+    ptr[i] = refmpn_random_limb ();
+}
+
+void
+refmpn_random2 (mp_ptr ptr, mp_size_t size)
+{
+  mp_size_t  i;
+  mp_limb_t  bit, mask, limb;
+  int        run;
+
+  if (GMP_NAIL_BITS == 0)
+    {
+      mpn_random2 (ptr, size);
+      return;
+    }
+
+#define RUN_MODULUS  32
+
+  /* start with ones at a random pos in the high limb */
+  bit = CNST_LIMB(1) << (refmpn_random_half () % GMP_NUMB_BITS);
+  mask = 0;
+  run = 0;
+
+  for (i = size-1; i >= 0; i--)
+    {
+      limb = 0;
+      do
+	{
+	  if (run == 0)
+	    {
+	      run = (refmpn_random_half () % RUN_MODULUS) + 1;
+	      mask = ~mask;
+	    }
+
+	  limb |= (bit & mask);
+	  bit >>= 1;
+	  run--;
+	}
+      while (bit != 0);
+
+      ptr[i] = limb;
+      bit = GMP_NUMB_HIGHBIT;
+    }
+}
+
+/* This is a simple bitwise algorithm working high to low across "s" and
+   testing each time whether setting the bit would make s^2 exceed n.  */
+mp_size_t
+refmpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nsize)
+{
+  mp_ptr     tp, dp;
+  mp_size_t  ssize, talloc, tsize, dsize, ret, ilimbs;
+  unsigned   ibit;
+  long       i;
+  mp_limb_t  c;
+
+  ASSERT (nsize >= 0);
+
+  /* If n==0, then s=0 and r=0.  */
+  if (nsize == 0)
+    return 0;
+
+  ASSERT (np[nsize - 1] != 0);
+  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nsize));
+  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nsize + 1) / 2, rp, nsize));
+  ASSERT (! MPN_OVERLAP_P (sp, (nsize + 1) / 2, np, nsize));
+
+  /* root */
+  ssize = (nsize+1)/2;
+  refmpn_zero (sp, ssize);
+
+  /* the remainder so far */
+  dp = refmpn_memdup_limbs (np, nsize);
+  dsize = nsize;
+
+  /* temporary */
+  talloc = 2*ssize + 1;
+  tp = refmpn_malloc_limbs (talloc);
+
+  for (i = GMP_NUMB_BITS * ssize - 1; i >= 0; i--)
+    {
+      /* t = 2*s*2^i + 2^(2*i), being the amount s^2 will increase by if 2^i
+	 is added to it */
+
+      ilimbs = (i+1) / GMP_NUMB_BITS;
+      ibit = (i+1) % GMP_NUMB_BITS;
+      refmpn_zero (tp, ilimbs);
+      c = refmpn_lshift_or_copy (tp+ilimbs, sp, ssize, ibit);
+      tsize = ilimbs + ssize;
+      tp[tsize] = c;
+      tsize += (c != 0);
+
+      ilimbs = (2*i) / GMP_NUMB_BITS;
+      ibit = (2*i) % GMP_NUMB_BITS;
+      if (ilimbs + 1 > tsize)
+	{
+	  refmpn_zero_extend (tp, tsize, ilimbs + 1);
+	  tsize = ilimbs + 1;
+	}
+      c = refmpn_add_1 (tp+ilimbs, tp+ilimbs, tsize-ilimbs,
+			CNST_LIMB(1) << ibit);
+      ASSERT (tsize < talloc);
+      tp[tsize] = c;
+      tsize += (c != 0);
+
+      if (refmpn_cmp_twosizes (dp, dsize, tp, tsize) >= 0)
+	{
+	  /* set this bit in s and subtract from the remainder */
+	  refmpn_setbit (sp, i);
+
+	  ASSERT_NOCARRY (refmpn_sub_n (dp, dp, tp, dsize));
+	  dsize = refmpn_normalize (dp, dsize);
+	}
+    }
+
+  if (rp == NULL)
+    {
+      ret = ! refmpn_zero_p (dp, dsize);
+    }
+  else
+    {
+      ASSERT (dsize == 0 || dp[dsize-1] != 0);
+      refmpn_copy (rp, dp, dsize);
+      ret = dsize;
+    }
+
+  free (dp);
+  free (tp);
+  return ret;
+}
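+
+/* Usage sketch for the bitwise square root above (illustrative only, not
+   compiled): the return value is the limb count of the remainder when rp
+   is non-NULL, so a perfect square returns 0.  */
+#if 0
+static void
+example_sqrtrem (void)
+{
+  mp_limb_t  n[1] = { 10 };
+  mp_limb_t  s[1], r[1];
+
+  ASSERT_ALWAYS (refmpn_sqrtrem (s, r, n, 1) == 1);
+  ASSERT_ALWAYS (s[0] == 3 && r[0] == 1);   /* 10 == 3^2 + 1 */
+}
+#endif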