third_party/gmp/mpn/generic/sqrtrem.c - RealtimeRoboticsGroup/test - Gitiles

 /* mpn_sqrtrem -- square root and remainder

    Contributed to the GNU project by Paul Zimmermann (most code),
    Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).

    THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
    INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
    IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
    FUTURE GMP RELEASE.

 Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
 Foundation, Inc.

 This file is part of the GNU MP Library.

 The GNU MP Library is free software; you can redistribute it and/or modify
 it under the terms of either:

   * the GNU Lesser General Public License as published by the Free
     Software Foundation; either version 3 of the License, or (at your
     option) any later version.

 or

   * the GNU General Public License as published by the Free Software
     Foundation; either version 2 of the License, or (at your option) any
     later version.

 or both in parallel, as here.

 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 for more details.

 You should have received copies of the GNU General Public License and the
 GNU Lesser General Public License along with the GNU MP Library.  If not,
 see https://www.gnu.org/licenses/.  */


 /* See "Karatsuba Square Root", reference in gmp.texi.  */


 #include <stdio.h>
 #include <stdlib.h>

 #include "gmp-impl.h"
 #include "longlong.h"
 #define USE_DIVAPPR_Q 1
 #define TRACE(x)

 static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
 {
   0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
   0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
   0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
   0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
   0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
   0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
   0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
   0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
   0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
   0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
   0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
   0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
   0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
   0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
   0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
   0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
   0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
   0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
   0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
   0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
   0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
   0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
   0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
   0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
   0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
   0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
   0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
   0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
   0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
   0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
   0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
   0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
   0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
   0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
   0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
   0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
   0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
   0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
   0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
   0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
   0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
   0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
   0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
   0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
   0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
   0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
   0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
   0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
 };

 /* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

 #if GMP_NUMB_BITS > 32
 #define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
 #else
 #define MAGIC CNST_LIMB(0x100000)		/* 0xfee6f < MAGIC < 0x29cbc8 */
 #endif

 static mp_limb_t
 mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
 {
 #if GMP_NUMB_BITS > 32
   mp_limb_t a1;
 #endif
   mp_limb_t x0, t2, t, x2;
   unsigned abits;

   ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
   ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
   ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

   /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
      since we can do the former without division.  As part of the last
      iteration convert from 1/sqrt(a) to sqrt(a).  */

   abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
   x0 = 0x100 | invsqrttab[abits - 0x80];	/* initial 1/sqrt(a) */

   /* x0 is now an 8 bits approximation of 1/sqrt(a0) */

 #if GMP_NUMB_BITS > 32
   a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
   t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
   x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

   /* x0 is now a 16 bits approximation of 1/sqrt(a0) */

   t2 = x0 * (a0 >> (32-8));
   t = t2 >> 25;
   t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
   x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
   x0 >>= 32;
 #else
   t2 = x0 * (a0 >> (16-8));
   t = t2 >> 13;
   t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
   x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
   x0 >>= 16;
 #endif

   /* x0 is now a full limb approximation of sqrt(a0) */

   x2 = x0 * x0;
   if (x2 + 2*x0 <= a0 - 1)
     {
       x2 += 2*x0 + 1;
       x0++;
     }

   *rp = a0 - x2;
   return x0;
 }


 #define Prec (GMP_NUMB_BITS >> 1)
 #if ! defined(SQRTREM2_INPLACE)
 #define SQRTREM2_INPLACE 0
 #endif

 /* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
    return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
 #if SQRTREM2_INPLACE
 #define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
 static mp_limb_t
 mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
 {
   mp_srcptr np = rp;
 #else
 #define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
 static mp_limb_t
 mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
 {
 #endif
   mp_limb_t q, u, np0, sp0, rp0, q2;
   int cc;

   ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

   np0 = np[0];
   sp0 = mpn_sqrtrem1 (rp, np[1]);
   rp0 = rp[0];
   /* rp0 <= 2*sp0 < 2^(Prec + 1) */
   rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
   q = rp0 / sp0;
   /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
   q -= q >> Prec;
   /* now we have q < 2^Prec */
   u = rp0 - q * sp0;
   /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
   sp0 = (sp0 << Prec) | q;
   cc = u >> (Prec - 1);
   rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
   /* subtract q * q from rp */
   q2 = q * q;
   cc -= rp0 < q2;
   rp0 -= q2;
   if (cc < 0)
     {
       rp0 += sp0;
       cc += rp0 < sp0;
       --sp0;
       rp0 += sp0;
       cc += rp0 < sp0;
     }

   rp[0] = rp0;
   sp[0] = sp0;
   return cc;
 }

 /* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
    and in {np, n} the low n limbs of the remainder, returns the high
    limb of the remainder (which is 0 or 1).
    Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
    where B=2^GMP_NUMB_BITS.
    Needs a scratch of n/2+1 limbs. */
 static mp_limb_t
 mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
 {
   mp_limb_t q;			/* carry out of {sp, n} */
   int c, b;			/* carry out of remainder */
   mp_size_t l, h;

   ASSERT (n > 1);
   ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

   l = n / 2;
   h = n - l;
   if (h == 1)
     q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
   else
     q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
   if (q != 0)
     ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
   TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
   mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
   q += scratch[l];
   c = scratch[0] & 1;
   mpn_rshift (sp, scratch, l, 1);
   sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
   if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
     return 1; /* Remainder is non-zero */
   q >>= 1;
   if (c != 0)
     c = mpn_add_n (np + l, np + l, sp + l, h);
   TRACE(printf("sqr(,,%u)\n", (unsigned) l));
   mpn_sqr (np + n, sp, l);
   b = q + mpn_sub_n (np, np, np + n, 2 * l);
   c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

   if (c < 0)
     {
       q = mpn_add_1 (sp + l, sp + l, h, q);
 #if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
       c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
 #else
       c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
 #endif
       c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
       q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
     }

   return c;
 }

 #if USE_DIVAPPR_Q
 static void
 mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
 {
   gmp_pi1_t inv;
   mp_limb_t qh;
   ASSERT (dn > 2);
   ASSERT (nn >= dn);
   ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

   MPN_COPY (scratch, np, nn);
   invert_pi1 (inv, dp[dn-1], dp[dn-2]);
   if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
     qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
   else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
     qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
   else
     {
       mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
       TMP_DECL;
       TMP_MARK;
       /* Sadly, scratch is too small. */
       qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
       TMP_FREE;
     }
   qp [nn - dn] = qh;
 }
 #endif

 /* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
    returns zero if the operand was a perfect square, one otherwise.
    Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
    where B=2^GMP_NUMB_BITS.
    THINK: In the odd case, three more (dummy) limbs are taken into account,
    when nsh is maximal, two limbs are discarded from the result of the
    division. Too much? Is a single dummy limb enough? */
 static int
 mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
 {
   mp_limb_t q;			/* carry out of {sp, n} */
   int c;			/* carry out of remainder */
   mp_size_t l, h;
   mp_ptr qp, tp, scratch;
   TMP_DECL;
   TMP_MARK;

   ASSERT (np[2 * n - 1 - odd] != 0);
   ASSERT (n > 4);
   ASSERT (nsh < GMP_NUMB_BITS / 2);

   l = (n - 1) / 2;
   h = n - l;
   ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
   scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
   tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
   if (nsh != 0)
     {
       /* o is used to exactly set the lowest bits of the dividend, is it needed? */
       int o = l > (1 + odd);
       ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
     }
   else
     MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
   q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
   if (q != 0)
     ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
   qp = tp + n + 1; /* l + 2 */
   TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
 #if USE_DIVAPPR_Q
   mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
 #else
   mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
 #endif
   q += qp [l + 1];
   c = 1;
   if (q > 1)
     {
       /* FIXME: if s!=0 we will shift later, a noop on this area. */
       MPN_FILL (sp, l, GMP_NUMB_MAX);
     }
   else
     {
       /* FIXME: if s!=0 we will shift again later, shift just once. */
       mpn_rshift (sp, qp + 1, l, 1);
       sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
       if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
 	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0)
 	{
 	  mp_limb_t cy;
 	  /* Approximation is not good enough, the extra limb(+ nsh bits)
 	     is smaller than needed to absorb the possible error. */
 	  /* {qp + 1, l + 1} equals 2*{sp, l} */
 	  /* FIXME: use mullo or wrap-around, or directly evaluate
 	     remainder with a single sqrmod_bnm1. */
 	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
 	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
 	  /* Compute the remainder of the previous mpn_div(appr)_q. */
 	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
 #if USE_DIVAPPR_Q || WANT_ASSERT
 	  MPN_DECR_U (tp + 1 + h, l, cy);
 #if USE_DIVAPPR_Q
 	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
 	  if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
 	    {
 	      /* May happen only if div result was not exact. */
 #if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
 	      cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
 #else
 	      cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
 #endif
 	      ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
 	      MPN_DECR_U (sp, l, 1);
 	    }
 	  /* Can the root be exact when a correction was needed? We
 	     did not find an example, but it depends on divappr
 	     internals, and we can not assume it true in general...*/
 	  /* else */
 #else /* WANT_ASSERT */
 	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
 #endif
 #endif
 	  if (mpn_zero_p (tp + l + 1, h - l))
 	    {
 	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
 	      mpn_sqr (scratch, sp, l);
 	      c = mpn_cmp (tp + 1, scratch + l, l);
 	      if (c == 0)
 		{
 		  if (nsh != 0)
 		    {
 		      mpn_lshift (tp, np, l, 2 * nsh);
 		      np = tp;
 		    }
 		  c = mpn_cmp (np, scratch + odd, l - odd);
 		}
 	      if (c < 0)
 		{
 		  MPN_DECR_U (sp, l, 1);
 		  c = 1;
 		}
 	    }
 	}
     }
   TMP_FREE;

   if ((odd | nsh) != 0)
     mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
   return c;
 }


 mp_size_t
 mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
 {
   mp_limb_t cc, high, rl;
   int c;
   mp_size_t rn, tn;
   TMP_DECL;

   ASSERT (nn > 0);
   ASSERT_MPN (np, nn);

   ASSERT (np[nn - 1] != 0);
   ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
   ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
   ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

   high = np[nn - 1];
   if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
     c = 0;
   else
     {
       count_leading_zeros (c, high);
       c -= GMP_NAIL_BITS;

       c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
     }
   if (nn == 1) {
     if (c == 0)
       {
 	sp[0] = mpn_sqrtrem1 (&rl, high);
 	if (rp != NULL)
 	  rp[0] = rl;
       }
     else
       {
 	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
 	sp[0] = cc;
 	if (rp != NULL)
 	  rp[0] = rl = high - cc*cc;
       }
     return rl != 0;
   }
   if (nn == 2) {
     mp_limb_t tp [2];
     if (rp == NULL) rp = tp;
     if (c == 0)
       {
 #if SQRTREM2_INPLACE
 	rp[1] = high;
 	rp[0] = np[0];
 	cc = CALL_SQRTREM2_INPLACE (sp, rp);
 #else
 	cc = mpn_sqrtrem2 (sp, rp, np);
 #endif
 	rp[1] = cc;
 	return ((rp[0] | cc) != 0) + cc;
       }
     else
       {
 	rl = np[0];
 	rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
 	rp[0] = rl << (2*c);
 	CALL_SQRTREM2_INPLACE (sp, rp);
 	cc = sp[0] >>= c;	/* c != 0, the highest bit of the root cc is 0. */
 	rp[0] = rl -= cc*cc;	/* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
 	return rl != 0;
       }
   }
   tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

   if ((rp == NULL) && (nn > 8))
     return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
   TMP_MARK;
   if (((nn & 1) | c) != 0)
     {
       mp_limb_t s0[1], mask;
       mp_ptr tp, scratch;
       TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
       tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
       if (c != 0)
 	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
       else
 	MPN_COPY (tp + (nn & 1), np, nn);
       c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;		/* c now represents k */
       mask = (CNST_LIMB (1) << c) - 1;
       rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
       /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
 	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
       s0[0] = sp[0] & mask;	/* S mod 2^k */
       rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
       cc = mpn_submul_1 (tp, s0, 1, s0[0]);
       rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
       mpn_rshift (sp, sp, tn, c);
       tp[tn] = rl;
       if (rp == NULL)
 	rp = tp;
       c = c << 1;
       if (c < GMP_NUMB_BITS)
 	tn++;
       else
 	{
 	  tp++;
 	  c -= GMP_NUMB_BITS;
 	}
       if (c != 0)
 	mpn_rshift (rp, tp, tn, c);
       else
 	MPN_COPY_INCR (rp, tp, tn);
       rn = tn;
     }
   else
     {
       if (rp != np)
 	{
 	  if (rp == NULL) /* nn <= 8 */
 	    rp = TMP_SALLOC_LIMBS (nn);
 	  MPN_COPY (rp, np, nn);
 	}
       rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
     }

   MPN_NORMALIZE (rp, rn);

   TMP_FREE;
   return rn;
 }
	/* mpn_sqrtrem -- square root and remainder

	Contributed to the GNU project by Paul Zimmermann (most code),
	Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).

	THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
	INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
	IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
	FUTURE GMP RELEASE.

	Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
	Foundation, Inc.

	This file is part of the GNU MP Library.

	The GNU MP Library is free software; you can redistribute it and/or modify
	it under the terms of either:

	* the GNU Lesser General Public License as published by the Free
	Software Foundation; either version 3 of the License, or (at your
	option) any later version.

	or

	* the GNU General Public License as published by the Free Software
	Foundation; either version 2 of the License, or (at your option) any
	later version.

	or both in parallel, as here.

	The GNU MP Library is distributed in the hope that it will be useful, but
	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	for more details.

	You should have received copies of the GNU General Public License and the
	GNU Lesser General Public License along with the GNU MP Library. If not,
	see https://www.gnu.org/licenses/. */


	/* See "Karatsuba Square Root", reference in gmp.texi. */


	#include <stdio.h>
	#include <stdlib.h>

	#include "gmp-impl.h"
	#include "longlong.h"
	#define USE_DIVAPPR_Q 1
	#define TRACE(x)

	static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
	{
	0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
	0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
	0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
	0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
	0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
	0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
	0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
	0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
	0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
	0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
	0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
	0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
	0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
	0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
	0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
	0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
	0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
	0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
	0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
	0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
	0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
	0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
	0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
	0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
	0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
	0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
	0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
	0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
	0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
	0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
	0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
	0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
	0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
	0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
	0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
	0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
	0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
	0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
	0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
	0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
	0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
	0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
	0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
	0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
	0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
	0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
	0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
	0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00 /* sqrt(1/1f8)..sqrt(1/1ff) */
	};

	/* Compute s = floor(sqrt(a0)), and rp = a0 - s^2. /

	#if GMP_NUMB_BITS > 32
	#define MAGIC CNST_LIMB(0x10000000000) /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
	#else
	#define MAGIC CNST_LIMB(0x100000) /* 0xfee6f < MAGIC < 0x29cbc8 */
	#endif

	static mp_limb_t
	mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
	{
	#if GMP_NUMB_BITS > 32
	mp_limb_t a1;
	#endif
	mp_limb_t x0, t2, t, x2;
	unsigned abits;

	ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
	ASSERT_ALWAYS (GMP_LIMB_BITS == 32 \|\| GMP_LIMB_BITS == 64);
	ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

	/* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
	since we can do the former without division. As part of the last
	iteration convert from 1/sqrt(a) to sqrt(a). */

	abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */
	x0 = 0x100 \| invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */

	/* x0 is now an 8 bits approximation of 1/sqrt(a0) */

	#if GMP_NUMB_BITS > 32
	a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
	t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
	x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

	/* x0 is now a 16 bits approximation of 1/sqrt(a0) */

	t2 = x0 * (a0 >> (32-8));
	t = t2 >> 25;
	t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
	x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
	x0 >>= 32;
	#else
	t2 = x0 * (a0 >> (16-8));
	t = t2 >> 13;
	t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
	x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
	x0 >>= 16;
	#endif

	/* x0 is now a full limb approximation of sqrt(a0) */

	x2 = x0 * x0;
	if (x2 + 2*x0 <= a0 - 1)
	{
	x2 += 2*x0 + 1;
	x0++;
	}

	*rp = a0 - x2;
	return x0;
	}


	#define Prec (GMP_NUMB_BITS >> 1)
	#if ! defined(SQRTREM2_INPLACE)
	#define SQRTREM2_INPLACE 0
	#endif

	/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
	return cc such that {np, 2} = sp[0]^2 + cc2^GMP_NUMB_BITS + rp[0] /
	#if SQRTREM2_INPLACE
	#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
	static mp_limb_t
	mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
	{
	mp_srcptr np = rp;
	#else
	#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
	static mp_limb_t
	mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
	{
	#endif
	mp_limb_t q, u, np0, sp0, rp0, q2;
	int cc;

	ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

	np0 = np[0];
	sp0 = mpn_sqrtrem1 (rp, np[1]);
	rp0 = rp[0];
	/* rp0 <= 2sp0 < 2^(Prec + 1) /
	rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
	q = rp0 / sp0;
	/* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
	q -= q >> Prec;
	/* now we have q < 2^Prec */
	u = rp0 - q * sp0;
	/* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
	sp0 = (sp0 << Prec) \| q;
	cc = u >> (Prec - 1);
	rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
	/* subtract q * q from rp */
	q2 = q * q;
	cc -= rp0 < q2;
	rp0 -= q2;
	if (cc < 0)
	{
	rp0 += sp0;
	cc += rp0 < sp0;
	--sp0;
	rp0 += sp0;
	cc += rp0 < sp0;
	}

	rp[0] = rp0;
	sp[0] = sp0;
	return cc;
	}

	/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
	and in {np, n} the low n limbs of the remainder, returns the high
	limb of the remainder (which is 0 or 1).
	Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
	where B=2^GMP_NUMB_BITS.
	Needs a scratch of n/2+1 limbs. */
	static mp_limb_t
	mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
	{
	mp_limb_t q; /* carry out of {sp, n} */
	int c, b; /* carry out of remainder */
	mp_size_t l, h;

	ASSERT (n > 1);
	ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

	l = n / 2;
	h = n - l;
	if (h == 1)
	q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
	else
	q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
	if (q != 0)
	ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
	TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
	mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
	q += scratch[l];
	c = scratch[0] & 1;
	mpn_rshift (sp, scratch, l, 1);
	sp[l - 1] \|= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
	if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
	return 1; /* Remainder is non-zero */
	q >>= 1;
	if (c != 0)
	c = mpn_add_n (np + l, np + l, sp + l, h);
	TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	mpn_sqr (np + n, sp, l);
	b = q + mpn_sub_n (np, np, np + n, 2 * l);
	c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

	if (c < 0)
	{
	q = mpn_add_1 (sp + l, sp + l, h, q);
	#if HAVE_NATIVE_mpn_addlsh1_n_ip1 \|\| HAVE_NATIVE_mpn_addlsh1_n
	c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
	#else
	c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
	#endif
	c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
	q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
	}

	return c;
	}

	#if USE_DIVAPPR_Q
	static void
	mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
	{
	gmp_pi1_t inv;
	mp_limb_t qh;
	ASSERT (dn > 2);
	ASSERT (nn >= dn);
	ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

	MPN_COPY (scratch, np, nn);
	invert_pi1 (inv, dp[dn-1], dp[dn-2]);
	if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
	qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
	else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
	qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
	else
	{
	mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
	TMP_DECL;
	TMP_MARK;
	/* Sadly, scratch is too small. */
	qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
	TMP_FREE;
	}
	qp [nn - dn] = qh;
	}
	#endif

	/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
	returns zero if the operand was a perfect square, one otherwise.
	Assumes {np, 2n-odd}4^nsh is normalized, i.e. B > np[2n-1-odd]4^nsh >= B/4
	where B=2^GMP_NUMB_BITS.
	THINK: In the odd case, three more (dummy) limbs are taken into account,
	when nsh is maximal, two limbs are discarded from the result of the
	division. Too much? Is a single dummy limb enough? */
	static int
	mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
	{
	mp_limb_t q; /* carry out of {sp, n} */
	int c; /* carry out of remainder */
	mp_size_t l, h;
	mp_ptr qp, tp, scratch;
	TMP_DECL;
	TMP_MARK;

	ASSERT (np[2 * n - 1 - odd] != 0);
	ASSERT (n > 4);
	ASSERT (nsh < GMP_NUMB_BITS / 2);

	l = (n - 1) / 2;
	h = n - l;
	ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
	scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
	tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
	if (nsh != 0)
	{
	/* o is used to exactly set the lowest bits of the dividend, is it needed? */
	int o = l > (1 + odd);
	ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
	}
	else
	MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
	q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
	if (q != 0)
	ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
	qp = tp + n + 1; /* l + 2 */
	TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
	#if USE_DIVAPPR_Q
	mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
	#else
	mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
	#endif
	q += qp [l + 1];
	c = 1;
	if (q > 1)
	{
	/* FIXME: if s!=0 we will shift later, a noop on this area. */
	MPN_FILL (sp, l, GMP_NUMB_MAX);
	}
	else
	{
	/* FIXME: if s!=0 we will shift again later, shift just once. */
	mpn_rshift (sp, qp + 1, l, 1);
	sp[l - 1] \|= q << (GMP_NUMB_BITS - 1);
	if (((qp[0] >> (2 + USE_DIVAPPR_Q)) \| /* < 3 + 4USE_DIVAPPR_Q /
	(qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0)
	{
	mp_limb_t cy;
	/* Approximation is not good enough, the extra limb(+ nsh bits)
	is smaller than needed to absorb the possible error. */
	/* {qp + 1, l + 1} equals 2{sp, l} /
	/* FIXME: use mullo or wrap-around, or directly evaluate
	remainder with a single sqrmod_bnm1. */
	TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
	ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
	/* Compute the remainder of the previous mpn_div(appr)_q. */
	cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
	#if USE_DIVAPPR_Q \|\| WANT_ASSERT
	MPN_DECR_U (tp + 1 + h, l, cy);
	#if USE_DIVAPPR_Q
	ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
	if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	{
	/* May happen only if div result was not exact. */
	#if HAVE_NATIVE_mpn_addlsh1_n_ip1 \|\| HAVE_NATIVE_mpn_addlsh1_n
	cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
	#else
	cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
	#endif
	ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	MPN_DECR_U (sp, l, 1);
	}
	/* Can the root be exact when a correction was needed? We
	did not find an example, but it depends on divappr
	internals, and we can not assume it true in general...*/
	/* else */
	#else /* WANT_ASSERT */
	ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
	#endif
	#endif
	if (mpn_zero_p (tp + l + 1, h - l))
	{
	TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	mpn_sqr (scratch, sp, l);
	c = mpn_cmp (tp + 1, scratch + l, l);
	if (c == 0)
	{
	if (nsh != 0)
	{
	mpn_lshift (tp, np, l, 2 * nsh);
	np = tp;
	}
	c = mpn_cmp (np, scratch + odd, l - odd);
	}
	if (c < 0)
	{
	MPN_DECR_U (sp, l, 1);
	c = 1;
	}
	}
	}
	}
	TMP_FREE;

	if ((odd \| nsh) != 0)
	mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
	return c;
	}


	mp_size_t
	mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
	{
	mp_limb_t cc, high, rl;
	int c;
	mp_size_t rn, tn;
	TMP_DECL;

	ASSERT (nn > 0);
	ASSERT_MPN (np, nn);

	ASSERT (np[nn - 1] != 0);
	ASSERT (rp == NULL \|\| MPN_SAME_OR_SEPARATE_P (np, rp, nn));
	ASSERT (rp == NULL \|\| ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
	ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

	high = np[nn - 1];
	if (high & (GMP_NUMB_HIGHBIT \| (GMP_NUMB_HIGHBIT / 2)))
	c = 0;
	else
	{
	count_leading_zeros (c, high);
	c -= GMP_NAIL_BITS;

	c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
	}
	if (nn == 1) {
	if (c == 0)
	{
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	rp[0] = rl;
	}
	else
	{
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	rp[0] = rl = high - cc*cc;
	}
	return rl != 0;
	}
	if (nn == 2) {
	mp_limb_t tp [2];
	if (rp == NULL) rp = tp;
	if (c == 0)
	{
	#if SQRTREM2_INPLACE
	rp[1] = high;
	rp[0] = np[0];
	cc = CALL_SQRTREM2_INPLACE (sp, rp);
	#else
	cc = mpn_sqrtrem2 (sp, rp, np);
	#endif
	rp[1] = cc;
	return ((rp[0] \| cc) != 0) + cc;
	}
	else
	{
	rl = np[0];
	rp[1] = (high << (2c)) \| (rl >> (GMP_NUMB_BITS - 2c));
	rp[0] = rl << (2*c);
	CALL_SQRTREM2_INPLACE (sp, rp);
	cc = sp[0] >>= c; /* c != 0, the highest bit of the root cc is 0. */
	rp[0] = rl -= cccc; / Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
	return rl != 0;
	}
	}
	tn = (nn + 1) / 2; /* 2tn is the smallest even integer >= nn /

	if ((rp == NULL) && (nn > 8))
	return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
	TMP_MARK;
	if (((nn & 1) \| c) != 0)
	{
	mp_limb_t s0[1], mask;
	mp_ptr tp, scratch;
	TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
	tp[0] = 0; /* needed only when 2tn > nn, but saves a test /
	if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
	else
	MPN_COPY (tp + (nn & 1), np, nn);
	c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0; /* c now represents k */
	mask = (CNST_LIMB (1) << c) - 1;
	rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
	/* We have 2^(2k)N = S^2 + R where k = c + (2tn-nn)GMP_NUMB_BITS/2,
	thus 2^(2k)N = (S-s0)^2 + 2Ss0 - s0^2 + R where s0=S mod 2^k /
	s0[0] = sp[0] & mask; /* S mod 2^k */
	rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2s0S */
	cc = mpn_submul_1 (tp, s0, 1, s0[0]);
	rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
	mpn_rshift (sp, sp, tn, c);
	tp[tn] = rl;
	if (rp == NULL)
	rp = tp;
	c = c << 1;
	if (c < GMP_NUMB_BITS)
	tn++;
	else
	{
	tp++;
	c -= GMP_NUMB_BITS;
	}
	if (c != 0)
	mpn_rshift (rp, tp, tn, c);
	else
	MPN_COPY_INCR (rp, tp, tn);
	rn = tn;
	}
	else
	{
	if (rp != np)
	{
	if (rp == NULL) /* nn <= 8 */
	rp = TMP_SALLOC_LIMBS (nn);
	MPN_COPY (rp, np, nn);
	}
	rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
	}

	MPN_NORMALIZE (rp, rn);

	TMP_FREE;
	return rn;
	}