/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library. If not,
see https://www.gnu.org/licenses/. */

#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
                        mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t = u00 * a
     r = u10 * b
     r += t;

     t = u11 * b
     b = u01 * a
     b += t;
  */

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
        n--;
    }

  return n;
}

#define COMPUTE_V_ITCH(n) (2*(n))

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. max(a, b) is of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
compute_v (mp_ptr vp,
           mp_srcptr ap, mp_srcptr bp, mp_size_t n,
           mp_srcptr gp, mp_size_t gn,
           mp_srcptr up, mp_size_t usize,
           mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);
  ASSERT (up[size-1] > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  ASSERT (gn <= an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
        return 0;
    }
  else
    { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
         (g + |u| a) always fits in (|usize| + an) limbs. */

      ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
      size -= (tp[size - 1] == 0);
    }

  /* Now divide t / b. There must be no remainder, since g = u a + v b
     makes t = |g - u a| an exact multiple of b. */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}

/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
   sufficient for both operations.

*/
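/* For instance, with a hypothetical tuned value of T = 300 limbs, that
   final bound is 7*300 + 3 = 2103 limbs of scratch. */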

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)

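/* For example, with n = 3000 limbs the first call below uses
   p = CHOOSE_P_1 (n) = 1500, so mpn_hgcd works on the high
   n - p = 1500 limbs of a and b; the iterations in the main loop use
   p = CHOOSE_P_2 (n) = n/3 instead. */

/* Computes g = gcd (A, B), where A = {ap, an} and B = {bp, n}, together
   with a cofactor u such that g = u A + v B for some v (v itself is not
   returned). Requires an >= n > 0 and bp[n-1] > 0. The gcd is stored at
   {gp, gn} and its size gn is returned; the cofactor is stored at up,
   with its size and sign in *usizep. Both input operands are
   destroyed. */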
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
            mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  struct gcdext_ctx ctx;
  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);
  ASSERT (bp[n-1] > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
        talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
         copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
        + 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
        talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

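  /* If a is longer than b, reduce it modulo b so that both operands
     have n limbs. A zero remainder means b divides a, so b is the gcd
     and the cofactor of a is 0. */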
  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
        {
          MPN_COPY (gp, bp, n);
          *usizep = 0;
          TMP_FREE;
          return n;
        }
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  ctx.gp = gp;
  ctx.up = up;
  ctx.usize = usizep;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
        ASSERT (M.n <= (n - p - 1)/2);
        ASSERT (M.n + p <= (p + n - 1) / 2);

        /* Temporary storage 2 (p + M->n) <= p + n - 1 */
        n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

        MPN_COPY (u0, M.p[1][0], M.n);
        MPN_COPY (u1, M.p[1][1], M.n);
        un = M.n;
        while ( (u0[un-1] | u1[un-1] ) == 0)
          un--;
      }
    else
      {
        /* mpn_hgcd has failed. Then either one of a or b is very
           small, or the difference is very small. Perform one
           subtraction followed by one division. */
        u1[0] = 1;

        ctx.u0 = u0;
        ctx.u1 = u1;
        ctx.tp = tp + n; /* ualloc */
        ctx.un = 1;

        /* Temporary storage n */
        n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
        if (n == 0)
          {
            TMP_FREE;
            return ctx.gn;
          }

        un = ctx.un;
        ASSERT (un < ualloc);
      }
  }

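  /* Main loop: while the operands stay above the threshold, reduce them
     with mpn_hgcd and fold the resulting matrix into the cofactors u0
     and u1, falling back to a subtraction/division step whenever hgcd
     makes no progress. */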
  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
        {
          mp_ptr t0;

          t0 = tp + matrix_scratch;
          ASSERT (M.n <= (n - p - 1)/2);
          ASSERT (M.n + p <= (p + n - 1) / 2);

          /* Temporary storage 2 (p + M->n) <= p + n - 1 */
          n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

          /* By the same analysis as for mpn_hgcd_matrix_mul */
          ASSERT (M.n + un <= ualloc);

          /* FIXME: This copying could be avoided by some swapping of
           * pointers. May need more temporary storage, though. */
          MPN_COPY (t0, u0, un);

          /* Temporary storage ualloc */
          un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

          ASSERT (un < ualloc);
          ASSERT ( (u0[un-1] | u1[un-1]) > 0);
        }
      else
        {
          /* mpn_hgcd has failed. Then either one of a or b is very
             small, or the difference is very small. Perform one
             subtraction followed by one division. */
          ctx.u0 = u0;
          ctx.u1 = u1;
          ctx.tp = tp + n; /* ualloc */
          ctx.un = un;

          /* Temporary storage n */
          n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
          if (n == 0)
            {
              TMP_FREE;
              return ctx.gn;
            }

          un = ctx.un;
          ASSERT (un < ualloc);
        }
    }
  /* We have A = ... a + ... b
             B = u0 a + u1 b

             a = u1 A + ... B
             b = -u0 A + ... B

     with bounds

     |u0|, |u1| <= B / min(a, b)

     We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
     in which case the only reduction done so far is a = A - k B for
     some k.

     Compute g = u a + v b = (u u1 - v u0) A + (...) B
     Here, u, v are bounded by

     |u| <= b,
     |v| <= a
  */
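  /* Three cases follow: a == b, where the gcd is a and the smaller of
     the cofactors +u1 and -u0 is returned; u0 == 0 (hence u1 == 1),
     where the cofactor computed by mpn_gcdext_lehmer_n for a is already
     the cofactor of A; and the general case, which combines the lehmer
     cofactors u, v with u0 and u1 as described above. */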

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
         this case we choose the cofactor + 1, corresponding to G = A
         - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
        {
          MPN_NORMALIZE (u0, un);
          MPN_COPY (up, u0, un);
          *usizep = -un;
        }
      else
        {
          MPN_NORMALIZE_NOT_ZERO (u1, un);
          MPN_COPY (up, u1, un);
          *usizep = un;
        }

      TMP_FREE;
      return n;
    }
  else if (UNLIKELY (u0[0] == 0) && un == 1)
    {
      mp_size_t gn;
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      ASSERT (u0n > 0);

      if (lehmer_un == 0)
        {
          /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
          MPN_COPY (up, u0, u0n);
          *usizep = -u0n;

          TMP_FREE;
          return gn;
        }

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
                             ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
        negate = 0;
      else
        {
          lehmer_un = -lehmer_un;
          negate = 1;
        }

      u1n = un;
      MPN_NORMALIZE (u1, u1n);
      ASSERT (u1n > 0);

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* We may still have v == 0 */

      /* Compute u u1 */
      if (lehmer_un <= u1n)
        /* Should be the common case */
        mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
        mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
        {
          mp_limb_t cy;

          /* Overwrites old u1 value */
          if (lehmer_vn <= u0n)
            /* Should be the common case */
            mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
          else
            mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

          u1n = u0n + lehmer_vn;
          u1n -= (u1[u1n - 1] == 0);

          if (u1n <= un)
            {
              cy = mpn_add (up, up, un, u1, u1n);
            }
          else
            {
              cy = mpn_add (up, u1, u1n, up, un);
              un = u1n;
            }
          up[un] = cy;
          un += (cy != 0);

          ASSERT (un < ualloc);
        }
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}