/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code),
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
   INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
   IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
   FUTURE GMP RELEASE.

Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/* See "Karatsuba Square Root", reference in gmp.texi.  */


#include <stdio.h>
#include <stdlib.h>

#include "gmp-impl.h"
#include "longlong.h"
#define USE_DIVAPPR_Q 1
#define TRACE(x)

static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
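
/* Editorial note: each entry above appears to be
   round(0x100 * sqrt(0x200/x)) - 0x100 with x sampled at the midpoint of its
   9-bit input slice.  This rule was only spot-checked against a dozen
   entries, not taken from GMP's actual table generator; the checker below is
   a sketch under that assumption. */
#if 0 /* editorial demo, needs <math.h>; not part of the normal build */
#include <math.h>
static void
check_invsqrttab (void)
{
  int i;
  for (i = 0; i < 384; i++)
    {
      /* midpoint of the 9-bit slice [i + 0x80, i + 0x81) */
      int e = (int) (256.0 * sqrt (512.0 / (i + 128.5)) + 0.5) - 256;
      ASSERT_ALWAYS (e == invsqrttab[i]);
    }
}
#endif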

/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)  /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)       /* 0xfee6f < MAGIC < 0x29cbc8 */
#endif

static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */
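
  /* Editorial note: the underlying update is the Newton step for
     f(x) = 1/x^2 - a,
       x' = x*(3 - a*x^2)/2 = x + x*(1 - a*x^2)/2,
     evaluated in fixed point; the shift counts below fold in the scalings of
     x and a, and MAGIC biases the residual so that the final one-sided
     correction (after the #endif) suffices.  This is a reading of the code,
     not the authors' own derivation. */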

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);  /* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80];  /* initial 1/sqrt(a) */

  /* x0 is now an 8-bit approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16-bit approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full-limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}
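
/* Editorial self-check sketch for mpn_sqrtrem1: the invariant is
   a0 = s^2 + r with 0 <= r <= 2*s.  SQRTREM1_SELFTEST is a hypothetical
   guard, not defined by GMP's build. */
#ifdef SQRTREM1_SELFTEST
static void
mpn_sqrtrem1_selftest (void)
{
  static const mp_limb_t a[] =
    { GMP_NUMB_HIGHBIT / 2, GMP_NUMB_HIGHBIT, GMP_NUMB_MAX };
  size_t i;

  for (i = 0; i < sizeof (a) / sizeof (a[0]); i++)
    {
      mp_limb_t r, s;

      s = mpn_sqrtrem1 (&r, a[i]);
      ASSERT_ALWAYS (s * s + r == a[i]);  /* exact split, no wrap-around */
      ASSERT_ALWAYS (r <= 2 * s);         /* s is the floor square root */
    }
}
#endif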


#define Prec (GMP_NUMB_BITS >> 1)
#if ! defined(SQRTREM2_INPLACE)
#define SQRTREM2_INPLACE 0
#endif

/* Same as mpn_sqrtrem, but for size=2 and {np, 2} normalized;
   returns cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0].  */
#if SQRTREM2_INPLACE
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
{
  mp_srcptr np = rp;
#else
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
#endif
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec; if q = 2^Prec, reduce the overestimate.  */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}
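
/* Editorial self-check sketch for the contract above: verify that
   {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0].  SQRTREM2_SELFTEST is a
   hypothetical guard, not defined by GMP's build; nails are assumed to be
   0 here (umul_ppmm comes from longlong.h). */
#ifdef SQRTREM2_SELFTEST
static void
mpn_sqrtrem2_selftest (mp_limb_t n1, mp_limb_t n0)
{
  mp_limb_t sp[1], rp[2], s2hi, s2lo, cc;

  ASSERT_ALWAYS (n1 >= GMP_NUMB_HIGHBIT / 2);  /* normalized operand */
  rp[1] = n1;
  rp[0] = n0;
  cc = CALL_SQRTREM2_INPLACE (sp, rp);
  umul_ppmm (s2hi, s2lo, sp[0], sp[0]);        /* s2 = sp[0]^2 */
  s2hi += cc + ((s2lo + rp[0]) < s2lo);        /* add cc*B and the carry */
  s2lo += rp[0];                               /* add the low remainder */
  ASSERT_ALWAYS (s2lo == n0 && s2hi == n1);
}
#endif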

/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs. */
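/* Editorial sketch of the recursion, following the "Karatsuba Square Root"
   paper cited above (not the authors' own summary).  With l = n/2,
   h = n - l and B = beta^l (beta the limb base), view the operand as
   N = A*B^2 + a1*B + a0, where A has 2h limbs and a1, a0 have l limbs each:
     1. (s1, r1) = sqrtrem (A)               [the recursive call below]
     2. (q, u)   = divrem (r1*B + a1, 2*s1)  [mpn_tdiv_qr; the halving is
                                              folded into the mpn_rshift]
     3. s = s1*B + q
     4. r = u*B + a0 - q^2                   [if negative, the final branch
                                              adjusts s and r once]
   The normalization np[2n-1] >= beta/4 guarantees that the adjustment is
   needed at most once. */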
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;       /* carry out of {sp, n} */
  int c, b;          /* carry out of remainder */
  mp_size_t l, h;

  ASSERT (n > 1);
  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  l = n / 2;
  h = n - l;
  if (h == 1)
    q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
  else
    q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
  TRACE (printf ("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
  mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
  q += scratch[l];
  c = scratch[0] & 1;
  mpn_rshift (sp, scratch, l, 1);
  sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
  if (UNLIKELY ((sp[0] & approx) != 0))  /* (sp[0] & mask) > 1 */
    return 1;  /* Remainder is non-zero */
  q >>= 1;
  if (c != 0)
    c = mpn_add_n (np + l, np + l, sp + l, h);
  TRACE (printf ("sqr(,,%u)\n", (unsigned) l));
  mpn_sqr (np + n, sp, l);
  b = q + mpn_sub_n (np, np, np + n, 2 * l);
  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

  if (c < 0)
    {
      q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
      c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
      c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
      c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
      q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
    }

  return c;
}

#if USE_DIVAPPR_Q
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t inv;
  mp_limb_t qh;
  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  MPN_COPY (scratch, np, nn);
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
  else
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, the scratch area passed in is too small for this path,
         so allocate temporary space.  */
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  qp[nn - dn] = qh;
}
#endif

/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account,
   when nsh is maximal, two limbs are discarded from the result of the
   division. Too much? Is a single dummy limb enough? */
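/* Editorial sketch of the strategy: compute the high half of the root with
   mpn_dc_sqrtrem, obtain the low half as a single quotient (mpn_divappr_q
   or mpn_div_q below), and fall back to an explicit remainder computation
   only when the extra fractional bits of the quotient cannot absorb the
   possible approximation error. */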
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;       /* carry out of {sp, n} */
  int c;             /* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q);  /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q;  /* n + h + 1, but tp[-1] is writable */
  if (nsh != 0)
    {
      /* o is used to exactly set the lowest bits of the dividend; is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1;  /* l + 2 */
  TRACE (printf ("div(appr)_q(,,%u,,%u) -> %u\n", (unsigned) (n + 1), (unsigned) h, (unsigned) (n + 1 - h + 1)));
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp[l + 1];
  c = 1;
  if (q > 1)
    {
      /* FIXME: if s!=0 we will shift later, a noop on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) |  /* < 3 + 4*USE_DIVAPPR_Q */
           (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd) - nsh - 1)))) == 0)
        {
          mp_limb_t cy;
          /* The approximation is not good enough: the extra limb (+ nsh bits)
             is smaller than needed to absorb the possible error.  */
          /* {qp + 1, l + 1} equals 2*{sp, l} */
          /* FIXME: use mullo or wrap-around, or directly evaluate
             remainder with a single sqrmod_bnm1. */
          TRACE (printf ("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l + 1)));
          ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
          /* Compute the remainder of the previous mpn_div(appr)_q. */
          cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
          MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
          ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
          if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
            {
              /* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
              cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
              cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
              ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
              MPN_DECR_U (sp, l, 1);
            }
          /* Can the root be exact when a correction was needed?  We did not
             find an example, but it depends on divappr internals, and we
             cannot assume it true in general...  */
          /* else */
#else /* WANT_ASSERT */
          ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
          if (mpn_zero_p (tp + l + 1, h - l))
            {
              TRACE (printf ("sqr(,,%u)\n", (unsigned) l));
              mpn_sqr (scratch, sp, l);
              c = mpn_cmp (tp + 1, scratch + l, l);
              if (c == 0)
                {
                  if (nsh != 0)
                    {
                      mpn_lshift (tp, np, l, 2 * nsh);
                      np = tp;
                    }
                  c = mpn_cmp (np, scratch + odd, l - odd);
                }
              if (c < 0)
                {
                  MPN_DECR_U (sp, l, 1);
                  c = 1;
                }
            }
        }
    }
  TMP_FREE;

  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}


mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2;  /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  if (nn == 1) {
    if (c == 0)
      {
        sp[0] = mpn_sqrtrem1 (&rl, high);
        if (rp != NULL)
          rp[0] = rl;
      }
    else
      {
        cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
        sp[0] = cc;
        if (rp != NULL)
          rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  if (nn == 2) {
    mp_limb_t tp[2];
    if (rp == NULL) rp = tp;
    if (c == 0)
      {
#if SQRTREM2_INPLACE
        rp[1] = high;
        rp[0] = np[0];
        cc = CALL_SQRTREM2_INPLACE (sp, rp);
#else
        cc = mpn_sqrtrem2 (sp, rp, np);
#endif
        rp[1] = cc;
        return ((rp[0] | cc) != 0) + cc;
      }
    else
      {
        rl = np[0];
        rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
        rp[0] = rl << (2*c);
        CALL_SQRTREM2_INPLACE (sp, rp);
        cc = sp[0] >>= c;  /* c != 0, the highest bit of the root cc is 0. */
        rp[0] = rl -= cc*cc;  /* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
        return rl != 0;
      }
  }
  tn = (nn + 1) / 2;  /* 2*tn is the smallest even integer >= nn */

  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      mp_limb_t s0[1], mask;
      mp_ptr tp, scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;  /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
        mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
        MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;  /* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
         thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask;  /* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);  /* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
        rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
        tn++;
      else
        {
          tp++;
          c -= GMP_NUMB_BITS;
        }
      if (c != 0)
        mpn_rshift (rp, tp, tn, c);
      else
        MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp != np)
        {
          if (rp == NULL)  /* nn <= 8 */
            rp = TMP_SALLOC_LIMBS (nn);
          MPN_COPY (rp, np, nn);
        }
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS (tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}
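
/* Editorial usage sketch for the public entry point above: compute
   s = floor(sqrt(N)) and r = N - s^2 for a 4-limb operand N.  The guard
   SQRTREM_USAGE_DEMO, the operand values and the printf casts are
   illustrative only (mp_limb_t may be wider than unsigned long on some
   ABIs). */
#ifdef SQRTREM_USAGE_DEMO
int
main (void)
{
  mp_limb_t np[4] = { 123, 456, 789, 1 };  /* operand; np[nn-1] must be != 0 */
  mp_limb_t sp[2];                         /* root: ceil(nn/2) limbs */
  mp_limb_t rp[4];                         /* remainder area: nn limbs */
  mp_size_t rn = mpn_sqrtrem (sp, rp, np, 4);

  printf ("root = {%lu, %lu}, remainder limb count = %ld\n",
          (unsigned long) sp[0], (unsigned long) sp[1], (long) rn);
  return 0;
}
#endif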