Blame - third_party/gmp/mpn/generic/sqrtrem.c - RealtimeRoboticsGroup/test

blob: cc6dd9c88d72ed9585c3dfa8675973bdccf098fb [file] [log] [blame]

Austin Schuh	dace2a6	2020-08-18 10:56:48 -0700	[diff] [blame]	1	/* mpn_sqrtrem -- square root and remainder
				2
				3	Contributed to the GNU project by Paul Zimmermann (most code),
				4	Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).
				5
				6	THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
				7	INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
				8	IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
				9	FUTURE GMP RELEASE.
				10
				11	Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
				12	Foundation, Inc.
				13
				14	This file is part of the GNU MP Library.
				15
				16	The GNU MP Library is free software; you can redistribute it and/or modify
				17	it under the terms of either:
				18
				19	* the GNU Lesser General Public License as published by the Free
				20	Software Foundation; either version 3 of the License, or (at your
				21	option) any later version.
				22
				23	or
				24
				25	* the GNU General Public License as published by the Free Software
				26	Foundation; either version 2 of the License, or (at your option) any
				27	later version.
				28
				29	or both in parallel, as here.
				30
				31	The GNU MP Library is distributed in the hope that it will be useful, but
				32	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
				33	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				34	for more details.
				35
				36	You should have received copies of the GNU General Public License and the
				37	GNU Lesser General Public License along with the GNU MP Library. If not,
				38	see https://www.gnu.org/licenses/. */
				39
				40
				41	/* See "Karatsuba Square Root", reference in gmp.texi. */
				42
				43
				44	#include <stdio.h>
				45	#include <stdlib.h>
				46
				47	#include "gmp-impl.h"
				48	#include "longlong.h"
				49	#define USE_DIVAPPR_Q 1
				50	#define TRACE(x)
				51
				52	static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
				53	{
				54	0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
				55	0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
				56	0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
				57	0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
				58	0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
				59	0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
				60	0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
				61	0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
				62	0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
				63	0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
				64	0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
				65	0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
				66	0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
				67	0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
				68	0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
				69	0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
				70	0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
				71	0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
				72	0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
				73	0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
				74	0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
				75	0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
				76	0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
				77	0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
				78	0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
				79	0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
				80	0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
				81	0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
				82	0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
				83	0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
				84	0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
				85	0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
				86	0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
				87	0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
				88	0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
				89	0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
				90	0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
				91	0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
				92	0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
				93	0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
				94	0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
				95	0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
				96	0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
				97	0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
				98	0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
				99	0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
				100	0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
				101	0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00 /* sqrt(1/1f8)..sqrt(1/1ff) */
				102	};
				103
				104	/* Compute s = floor(sqrt(a0)), and rp = a0 - s^2. /
				105
				106	#if GMP_NUMB_BITS > 32
				107	#define MAGIC CNST_LIMB(0x10000000000) /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
				108	#else
				109	#define MAGIC CNST_LIMB(0x100000) /* 0xfee6f < MAGIC < 0x29cbc8 */
				110	#endif
				111
				112	static mp_limb_t
				113	mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
				114	{
				115	#if GMP_NUMB_BITS > 32
				116	mp_limb_t a1;
				117	#endif
				118	mp_limb_t x0, t2, t, x2;
				119	unsigned abits;
				120
				121	ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
				122	ASSERT_ALWAYS (GMP_LIMB_BITS == 32 \|\| GMP_LIMB_BITS == 64);
				123	ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);
				124
				125	/* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
				126	since we can do the former without division. As part of the last
				127	iteration convert from 1/sqrt(a) to sqrt(a). */
				128
				129	abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */
				130	x0 = 0x100 \| invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */
				131
				132	/* x0 is now an 8 bits approximation of 1/sqrt(a0) */
				133
				134	#if GMP_NUMB_BITS > 32
				135	a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
				136	t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
				137	x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));
				138
				139	/* x0 is now a 16 bits approximation of 1/sqrt(a0) */
				140
				141	t2 = x0 * (a0 >> (32-8));
				142	t = t2 >> 25;
				143	t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
				144	x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
				145	x0 >>= 32;
				146	#else
				147	t2 = x0 * (a0 >> (16-8));
				148	t = t2 >> 13;
				149	t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
				150	x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
				151	x0 >>= 16;
				152	#endif
				153
				154	/* x0 is now a full limb approximation of sqrt(a0) */
				155
				156	x2 = x0 * x0;
				157	if (x2 + 2*x0 <= a0 - 1)
				158	{
				159	x2 += 2*x0 + 1;
				160	x0++;
				161	}
				162
				163	*rp = a0 - x2;
				164	return x0;
				165	}
				166
				167
				168	#define Prec (GMP_NUMB_BITS >> 1)
				169	#if ! defined(SQRTREM2_INPLACE)
				170	#define SQRTREM2_INPLACE 0
				171	#endif
				172
				173	/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
				174	return cc such that {np, 2} = sp[0]^2 + cc2^GMP_NUMB_BITS + rp[0] /
				175	#if SQRTREM2_INPLACE
				176	#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
				177	static mp_limb_t
				178	mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
				179	{
				180	mp_srcptr np = rp;
				181	#else
				182	#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
				183	static mp_limb_t
				184	mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
				185	{
				186	#endif
				187	mp_limb_t q, u, np0, sp0, rp0, q2;
				188	int cc;
				189
				190	ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);
				191
				192	np0 = np[0];
				193	sp0 = mpn_sqrtrem1 (rp, np[1]);
				194	rp0 = rp[0];
				195	/* rp0 <= 2sp0 < 2^(Prec + 1) /
				196	rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
				197	q = rp0 / sp0;
				198	/* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
				199	q -= q >> Prec;
				200	/* now we have q < 2^Prec */
				201	u = rp0 - q * sp0;
				202	/* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
				203	sp0 = (sp0 << Prec) \| q;
				204	cc = u >> (Prec - 1);
				205	rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
				206	/* subtract q * q from rp */
				207	q2 = q * q;
				208	cc -= rp0 < q2;
				209	rp0 -= q2;
				210	if (cc < 0)
				211	{
				212	rp0 += sp0;
				213	cc += rp0 < sp0;
				214	--sp0;
				215	rp0 += sp0;
				216	cc += rp0 < sp0;
				217	}
				218
				219	rp[0] = rp0;
				220	sp[0] = sp0;
				221	return cc;
				222	}
				223
				224	/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
				225	and in {np, n} the low n limbs of the remainder, returns the high
				226	limb of the remainder (which is 0 or 1).
				227	Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
				228	where B=2^GMP_NUMB_BITS.
				229	Needs a scratch of n/2+1 limbs. */
				230	static mp_limb_t
				231	mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
				232	{
				233	mp_limb_t q; /* carry out of {sp, n} */
				234	int c, b; /* carry out of remainder */
				235	mp_size_t l, h;
				236
				237	ASSERT (n > 1);
				238	ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);
				239
				240	l = n / 2;
				241	h = n - l;
				242	if (h == 1)
				243	q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
				244	else
				245	q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
				246	if (q != 0)
				247	ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
				248	TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
				249	mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
				250	q += scratch[l];
				251	c = scratch[0] & 1;
				252	mpn_rshift (sp, scratch, l, 1);
				253	sp[l - 1] \|= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
				254	if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
				255	return 1; /* Remainder is non-zero */
				256	q >>= 1;
				257	if (c != 0)
				258	c = mpn_add_n (np + l, np + l, sp + l, h);
				259	TRACE(printf("sqr(,,%u)\n", (unsigned) l));
				260	mpn_sqr (np + n, sp, l);
				261	b = q + mpn_sub_n (np, np, np + n, 2 * l);
				262	c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
				263
				264	if (c < 0)
				265	{
				266	q = mpn_add_1 (sp + l, sp + l, h, q);
				267	#if HAVE_NATIVE_mpn_addlsh1_n_ip1 \|\| HAVE_NATIVE_mpn_addlsh1_n
				268	c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
				269	#else
				270	c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
				271	#endif
				272	c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
				273	q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
				274	}
				275
				276	return c;
				277	}
				278
				279	#if USE_DIVAPPR_Q
				280	static void
				281	mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
				282	{
				283	gmp_pi1_t inv;
				284	mp_limb_t qh;
				285	ASSERT (dn > 2);
				286	ASSERT (nn >= dn);
				287	ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
				288
				289	MPN_COPY (scratch, np, nn);
				290	invert_pi1 (inv, dp[dn-1], dp[dn-2]);
				291	if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
				292	qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
				293	else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
				294	qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
				295	else
				296	{
				297	mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
				298	TMP_DECL;
				299	TMP_MARK;
				300	/* Sadly, scratch is too small. */
				301	qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
				302	TMP_FREE;
				303	}
				304	qp [nn - dn] = qh;
				305	}
				306	#endif
				307
				308	/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
				309	returns zero if the operand was a perfect square, one otherwise.
				310	Assumes {np, 2n-odd}4^nsh is normalized, i.e. B > np[2n-1-odd]4^nsh >= B/4
				311	where B=2^GMP_NUMB_BITS.
				312	THINK: In the odd case, three more (dummy) limbs are taken into account,
				313	when nsh is maximal, two limbs are discarded from the result of the
				314	division. Too much? Is a single dummy limb enough? */
				315	static int
				316	mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
				317	{
				318	mp_limb_t q; /* carry out of {sp, n} */
				319	int c; /* carry out of remainder */
				320	mp_size_t l, h;
				321	mp_ptr qp, tp, scratch;
				322	TMP_DECL;
				323	TMP_MARK;
				324
				325	ASSERT (np[2 * n - 1 - odd] != 0);
				326	ASSERT (n > 4);
				327	ASSERT (nsh < GMP_NUMB_BITS / 2);
				328
				329	l = (n - 1) / 2;
				330	h = n - l;
				331	ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
				332	scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
				333	tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
				334	if (nsh != 0)
				335	{
				336	/* o is used to exactly set the lowest bits of the dividend, is it needed? */
				337	int o = l > (1 + odd);
				338	ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
				339	}
				340	else
				341	MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
				342	q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
				343	if (q != 0)
				344	ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
				345	qp = tp + n + 1; /* l + 2 */
				346	TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
				347	#if USE_DIVAPPR_Q
				348	mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
				349	#else
				350	mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
				351	#endif
				352	q += qp [l + 1];
				353	c = 1;
				354	if (q > 1)
				355	{
				356	/* FIXME: if s!=0 we will shift later, a noop on this area. */
				357	MPN_FILL (sp, l, GMP_NUMB_MAX);
				358	}
				359	else
				360	{
				361	/* FIXME: if s!=0 we will shift again later, shift just once. */
				362	mpn_rshift (sp, qp + 1, l, 1);
				363	sp[l - 1] \|= q << (GMP_NUMB_BITS - 1);
				364	if (((qp[0] >> (2 + USE_DIVAPPR_Q)) \| /* < 3 + 4USE_DIVAPPR_Q /
				365	(qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0)
				366	{
				367	mp_limb_t cy;
				368	/* Approximation is not good enough, the extra limb(+ nsh bits)
				369	is smaller than needed to absorb the possible error. */
				370	/* {qp + 1, l + 1} equals 2{sp, l} /
				371	/* FIXME: use mullo or wrap-around, or directly evaluate
				372	remainder with a single sqrmod_bnm1. */
				373	TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
				374	ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
				375	/* Compute the remainder of the previous mpn_div(appr)_q. */
				376	cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
				377	#if USE_DIVAPPR_Q \|\| WANT_ASSERT
				378	MPN_DECR_U (tp + 1 + h, l, cy);
				379	#if USE_DIVAPPR_Q
				380	ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
				381	if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
				382	{
				383	/* May happen only if div result was not exact. */
				384	#if HAVE_NATIVE_mpn_addlsh1_n_ip1 \|\| HAVE_NATIVE_mpn_addlsh1_n
				385	cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
				386	#else
				387	cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
				388	#endif
				389	ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
				390	MPN_DECR_U (sp, l, 1);
				391	}
				392	/* Can the root be exact when a correction was needed? We
				393	did not find an example, but it depends on divappr
				394	internals, and we can not assume it true in general...*/
				395	/* else */
				396	#else /* WANT_ASSERT */
				397	ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
				398	#endif
				399	#endif
				400	if (mpn_zero_p (tp + l + 1, h - l))
				401	{
				402	TRACE(printf("sqr(,,%u)\n", (unsigned) l));
				403	mpn_sqr (scratch, sp, l);
				404	c = mpn_cmp (tp + 1, scratch + l, l);
				405	if (c == 0)
				406	{
				407	if (nsh != 0)
				408	{
				409	mpn_lshift (tp, np, l, 2 * nsh);
				410	np = tp;
				411	}
				412	c = mpn_cmp (np, scratch + odd, l - odd);
				413	}
				414	if (c < 0)
				415	{
				416	MPN_DECR_U (sp, l, 1);
				417	c = 1;
				418	}
				419	}
				420	}
				421	}
				422	TMP_FREE;
				423
				424	if ((odd \| nsh) != 0)
				425	mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
				426	return c;
				427	}
				428
				429
				430	mp_size_t
				431	mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
				432	{
				433	mp_limb_t cc, high, rl;
				434	int c;
				435	mp_size_t rn, tn;
				436	TMP_DECL;
				437
				438	ASSERT (nn > 0);
				439	ASSERT_MPN (np, nn);
				440
				441	ASSERT (np[nn - 1] != 0);
				442	ASSERT (rp == NULL \|\| MPN_SAME_OR_SEPARATE_P (np, rp, nn));
				443	ASSERT (rp == NULL \|\| ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
				444	ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
				445
				446	high = np[nn - 1];
				447	if (high & (GMP_NUMB_HIGHBIT \| (GMP_NUMB_HIGHBIT / 2)))
				448	c = 0;
				449	else
				450	{
				451	count_leading_zeros (c, high);
				452	c -= GMP_NAIL_BITS;
				453
				454	c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
				455	}
				456	if (nn == 1) {
				457	if (c == 0)
				458	{
				459	sp[0] = mpn_sqrtrem1 (&rl, high);
				460	if (rp != NULL)
				461	rp[0] = rl;
				462	}
				463	else
				464	{
				465	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
				466	sp[0] = cc;
				467	if (rp != NULL)
				468	rp[0] = rl = high - cc*cc;
				469	}
				470	return rl != 0;
				471	}
				472	if (nn == 2) {
				473	mp_limb_t tp [2];
				474	if (rp == NULL) rp = tp;
				475	if (c == 0)
				476	{
				477	#if SQRTREM2_INPLACE
				478	rp[1] = high;
				479	rp[0] = np[0];
				480	cc = CALL_SQRTREM2_INPLACE (sp, rp);
				481	#else
				482	cc = mpn_sqrtrem2 (sp, rp, np);
				483	#endif
				484	rp[1] = cc;
				485	return ((rp[0] \| cc) != 0) + cc;
				486	}
				487	else
				488	{
				489	rl = np[0];
				490	rp[1] = (high << (2c)) \| (rl >> (GMP_NUMB_BITS - 2c));
				491	rp[0] = rl << (2*c);
				492	CALL_SQRTREM2_INPLACE (sp, rp);
				493	cc = sp[0] >>= c; /* c != 0, the highest bit of the root cc is 0. */
				494	rp[0] = rl -= cccc; / Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
				495	return rl != 0;
				496	}
				497	}
				498	tn = (nn + 1) / 2; /* 2tn is the smallest even integer >= nn /
				499
				500	if ((rp == NULL) && (nn > 8))
				501	return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
				502	TMP_MARK;
				503	if (((nn & 1) \| c) != 0)
				504	{
				505	mp_limb_t s0[1], mask;
				506	mp_ptr tp, scratch;
				507	TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
				508	tp[0] = 0; /* needed only when 2tn > nn, but saves a test /
				509	if (c != 0)
				510	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
				511	else
				512	MPN_COPY (tp + (nn & 1), np, nn);
				513	c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0; /* c now represents k */
				514	mask = (CNST_LIMB (1) << c) - 1;
				515	rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
				516	/* We have 2^(2k)N = S^2 + R where k = c + (2tn-nn)GMP_NUMB_BITS/2,
				517	thus 2^(2k)N = (S-s0)^2 + 2Ss0 - s0^2 + R where s0=S mod 2^k /
				518	s0[0] = sp[0] & mask; /* S mod 2^k */
				519	rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2s0S */
				520	cc = mpn_submul_1 (tp, s0, 1, s0[0]);
				521	rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
				522	mpn_rshift (sp, sp, tn, c);
				523	tp[tn] = rl;
				524	if (rp == NULL)
				525	rp = tp;
				526	c = c << 1;
				527	if (c < GMP_NUMB_BITS)
				528	tn++;
				529	else
				530	{
				531	tp++;
				532	c -= GMP_NUMB_BITS;
				533	}
				534	if (c != 0)
				535	mpn_rshift (rp, tp, tn, c);
				536	else
				537	MPN_COPY_INCR (rp, tp, tn);
				538	rn = tn;
				539	}
				540	else
				541	{
				542	if (rp != np)
				543	{
				544	if (rp == NULL) /* nn <= 8 */
				545	rp = TMP_SALLOC_LIMBS (nn);
				546	MPN_COPY (rp, np, nn);
				547	}
				548	rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
				549	}
				550
				551	MPN_NORMALIZE (rp, rn);
				552
				553	TMP_FREE;
				554	return rn;
				555	}