Austin Schuh | bb1338c | 2024-06-15 19:31:16 -0700 | [diff] [blame] | 1 | /* mpn_gcdext -- Extended Greatest Common Divisor. |
| 2 | |
| 3 | Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation, |
| 4 | Inc. |
| 5 | |
| 6 | This file is part of the GNU MP Library. |
| 7 | |
| 8 | The GNU MP Library is free software; you can redistribute it and/or modify |
| 9 | it under the terms of either: |
| 10 | |
| 11 | * the GNU Lesser General Public License as published by the Free |
| 12 | Software Foundation; either version 3 of the License, or (at your |
| 13 | option) any later version. |
| 14 | |
| 15 | or |
| 16 | |
| 17 | * the GNU General Public License as published by the Free Software |
| 18 | Foundation; either version 2 of the License, or (at your option) any |
| 19 | later version. |
| 20 | |
| 21 | or both in parallel, as here. |
| 22 | |
| 23 | The GNU MP Library is distributed in the hope that it will be useful, but |
| 24 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| 25 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 26 | for more details. |
| 27 | |
| 28 | You should have received copies of the GNU General Public License and the |
| 29 | GNU Lesser General Public License along with the GNU MP Library. If not, |
| 30 | see https://www.gnu.org/licenses/. */ |
| 31 | |
| 32 | #include "gmp-impl.h" |
| 33 | #include "longlong.h" |
| 34 | |
| 35 | /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and |
| 36 | the size is returned (if inputs are non-normalized, result may be |
| 37 | non-normalized too). Temporary space needed is M->n + n. |
| 38 | */ |
| 39 | static size_t |
| 40 | hgcd_mul_matrix_vector (struct hgcd_matrix *M, |
| 41 | mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp) |
| 42 | { |
| 43 | mp_limb_t ah, bh; |
| 44 | |
| 45 | /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as |
| 46 | |
| 47 | t = u00 * a |
| 48 | r = u10 * b |
| 49 | r += t; |
| 50 | |
| 51 | t = u11 * b |
| 52 | b = u01 * a |
| 53 | b += t; |
| 54 | */ |
| 55 | |
| 56 | if (M->n >= n) |
| 57 | { |
| 58 | mpn_mul (tp, M->p[0][0], M->n, ap, n); |
| 59 | mpn_mul (rp, M->p[1][0], M->n, bp, n); |
| 60 | } |
| 61 | else |
| 62 | { |
| 63 | mpn_mul (tp, ap, n, M->p[0][0], M->n); |
| 64 | mpn_mul (rp, bp, n, M->p[1][0], M->n); |
| 65 | } |
| 66 | |
| 67 | ah = mpn_add_n (rp, rp, tp, n + M->n); |
| 68 | |
| 69 | if (M->n >= n) |
| 70 | { |
| 71 | mpn_mul (tp, M->p[1][1], M->n, bp, n); |
| 72 | mpn_mul (bp, M->p[0][1], M->n, ap, n); |
| 73 | } |
| 74 | else |
| 75 | { |
| 76 | mpn_mul (tp, bp, n, M->p[1][1], M->n); |
| 77 | mpn_mul (bp, ap, n, M->p[0][1], M->n); |
| 78 | } |
| 79 | bh = mpn_add_n (bp, bp, tp, n + M->n); |
| 80 | |
| 81 | n += M->n; |
| 82 | if ( (ah | bh) > 0) |
| 83 | { |
| 84 | rp[n] = ah; |
| 85 | bp[n] = bh; |
| 86 | n++; |
| 87 | } |
| 88 | else |
| 89 | { |
| 90 | /* Normalize */ |
| 91 | while ( (rp[n-1] | bp[n-1]) == 0) |
| 92 | n--; |
| 93 | } |
| 94 | |
| 95 | return n; |
| 96 | } |
| 97 | |
| 98 | #define COMPUTE_V_ITCH(n) (2*(n)) |
| 99 | |
| 100 | /* Computes |v| = |(g - u a)| / b, where u may be positive or |
| 101 | negative, and v is of the opposite sign. max(a, b) is of size n, u and |
| 102 | v at most size n, and v must have space for n+1 limbs. */ |
| 103 | static mp_size_t |
| 104 | compute_v (mp_ptr vp, |
| 105 | mp_srcptr ap, mp_srcptr bp, mp_size_t n, |
| 106 | mp_srcptr gp, mp_size_t gn, |
| 107 | mp_srcptr up, mp_size_t usize, |
| 108 | mp_ptr tp) |
| 109 | { |
| 110 | mp_size_t size; |
| 111 | mp_size_t an; |
| 112 | mp_size_t bn; |
| 113 | mp_size_t vn; |
| 114 | |
| 115 | ASSERT (n > 0); |
| 116 | ASSERT (gn > 0); |
| 117 | ASSERT (usize != 0); |
| 118 | |
| 119 | size = ABS (usize); |
| 120 | ASSERT (size <= n); |
| 121 | ASSERT (up[size-1] > 0); |
| 122 | |
| 123 | an = n; |
| 124 | MPN_NORMALIZE (ap, an); |
| 125 | ASSERT (gn <= an); |
| 126 | |
| 127 | if (an >= size) |
| 128 | mpn_mul (tp, ap, an, up, size); |
| 129 | else |
| 130 | mpn_mul (tp, up, size, ap, an); |
| 131 | |
| 132 | size += an; |
| 133 | |
| 134 | if (usize > 0) |
| 135 | { |
| 136 | /* |v| = -v = (u a - g) / b */ |
| 137 | |
| 138 | ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn)); |
| 139 | MPN_NORMALIZE (tp, size); |
| 140 | if (size == 0) |
| 141 | return 0; |
| 142 | } |
| 143 | else |
| 144 | { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a, |
| 145 | (g + |u| a) always fits in (|usize| + an) limbs. */ |
| 146 | |
| 147 | ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn)); |
| 148 | size -= (tp[size - 1] == 0); |
| 149 | } |
| 150 | |
| 151 | /* Now divide t / b. There must be no remainder */ |
| 152 | bn = n; |
| 153 | MPN_NORMALIZE (bp, bn); |
| 154 | ASSERT (size >= bn); |
| 155 | |
| 156 | vn = size + 1 - bn; |
| 157 | ASSERT (vn <= n + 1); |
| 158 | |
| 159 | mpn_divexact (vp, tp, size, bp, bn); |
| 160 | vn -= (vp[vn-1] == 0); |
| 161 | |
| 162 | return vn; |
| 163 | } |
| 164 | |
| 165 | /* Temporary storage: |
| 166 | |
| 167 | Initial division: Quotient of at most an - n + 1 <= an limbs. |
| 168 | |
| 169 | Storage for u0 and u1: 2(n+1). |
| 170 | |
| 171 | Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4) |
| 172 | |
| 173 | Storage for hgcd, input (n + 1)/2: 9 n/4 plus some. |
| 174 | |
| 175 | When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors. |
| 176 | |
| 177 | When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less. |
| 178 | |
| 179 | For the lehmer call after the loop, Let T denote |
| 180 | GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for |
| 181 | u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T |
| 182 | for u, T+1 for v and 2T scratch space. In all, 7T + 3 is |
| 183 | sufficient for both operations. |
| 184 | |
| 185 | */ |
| 186 | |
| 187 | /* Optimal choice of p seems difficult. In each iteration the division |
| 188 | * of work between hgcd and the updates of u0 and u1 depends on the |
| 189 | * current size of the u. It may be desirable to use a different |
| 190 | * choice of p in each iteration. Also the input size seems to matter; |
| 191 | * choosing p = n / 3 in the first iteration seems to improve |
| 192 | * performance slightly for input size just above the threshold, but |
| 193 | * degrade performance for larger inputs. */ |
| 194 | #define CHOOSE_P_1(n) ((n) / 2) |
| 195 | #define CHOOSE_P_2(n) ((n) / 3) |
| 196 | |
| 197 | mp_size_t |
| 198 | mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep, |
| 199 | mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n) |
| 200 | { |
| 201 | mp_size_t talloc; |
| 202 | mp_size_t scratch; |
| 203 | mp_size_t matrix_scratch; |
| 204 | mp_size_t ualloc = n + 1; |
| 205 | |
| 206 | struct gcdext_ctx ctx; |
| 207 | mp_size_t un; |
| 208 | mp_ptr u0; |
| 209 | mp_ptr u1; |
| 210 | |
| 211 | mp_ptr tp; |
| 212 | |
| 213 | TMP_DECL; |
| 214 | |
| 215 | ASSERT (an >= n); |
| 216 | ASSERT (n > 0); |
| 217 | ASSERT (bp[n-1] > 0); |
| 218 | |
| 219 | TMP_MARK; |
| 220 | |
| 221 | /* FIXME: Check for small sizes first, before setting up temporary |
| 222 | storage etc. */ |
| 223 | talloc = MPN_GCDEXT_LEHMER_N_ITCH(n); |
| 224 | |
| 225 | /* For initial division */ |
| 226 | scratch = an - n + 1; |
| 227 | if (scratch > talloc) |
| 228 | talloc = scratch; |
| 229 | |
| 230 | if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) |
| 231 | { |
| 232 | /* For hgcd loop. */ |
| 233 | mp_size_t hgcd_scratch; |
| 234 | mp_size_t update_scratch; |
| 235 | mp_size_t p1 = CHOOSE_P_1 (n); |
| 236 | mp_size_t p2 = CHOOSE_P_2 (n); |
| 237 | mp_size_t min_p = MIN(p1, p2); |
| 238 | mp_size_t max_p = MAX(p1, p2); |
| 239 | matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p); |
| 240 | hgcd_scratch = mpn_hgcd_itch (n - min_p); |
| 241 | update_scratch = max_p + n - 1; |
| 242 | |
| 243 | scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch); |
| 244 | if (scratch > talloc) |
| 245 | talloc = scratch; |
| 246 | |
| 247 | /* Final mpn_gcdext_lehmer_n call. Need space for u and for |
| 248 | copies of a and b. */ |
| 249 | scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD) |
| 250 | + 3*GCDEXT_DC_THRESHOLD; |
| 251 | |
| 252 | if (scratch > talloc) |
| 253 | talloc = scratch; |
| 254 | |
| 255 | /* Cofactors u0 and u1 */ |
| 256 | talloc += 2*(n+1); |
| 257 | } |
| 258 | |
| 259 | tp = TMP_ALLOC_LIMBS(talloc); |
| 260 | |
| 261 | if (an > n) |
| 262 | { |
| 263 | mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n); |
| 264 | |
| 265 | if (mpn_zero_p (ap, n)) |
| 266 | { |
| 267 | MPN_COPY (gp, bp, n); |
| 268 | *usizep = 0; |
| 269 | TMP_FREE; |
| 270 | return n; |
| 271 | } |
| 272 | } |
| 273 | |
| 274 | if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) |
| 275 | { |
| 276 | mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp); |
| 277 | |
| 278 | TMP_FREE; |
| 279 | return gn; |
| 280 | } |
| 281 | |
| 282 | MPN_ZERO (tp, 2*ualloc); |
| 283 | u0 = tp; tp += ualloc; |
| 284 | u1 = tp; tp += ualloc; |
| 285 | |
| 286 | ctx.gp = gp; |
| 287 | ctx.up = up; |
| 288 | ctx.usize = usizep; |
| 289 | |
| 290 | { |
| 291 | /* For the first hgcd call, there are no u updates, and it makes |
| 292 | some sense to use a different choice for p. */ |
| 293 | |
| 294 | /* FIXME: We could trim use of temporary storage, since u0 and u1 |
| 295 | are not used yet. For the hgcd call, we could swap in the u0 |
| 296 | and u1 pointers for the relevant matrix elements. */ |
| 297 | |
| 298 | struct hgcd_matrix M; |
| 299 | mp_size_t p = CHOOSE_P_1 (n); |
| 300 | mp_size_t nn; |
| 301 | |
| 302 | mpn_hgcd_matrix_init (&M, n - p, tp); |
| 303 | nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); |
| 304 | if (nn > 0) |
| 305 | { |
| 306 | ASSERT (M.n <= (n - p - 1)/2); |
| 307 | ASSERT (M.n + p <= (p + n - 1) / 2); |
| 308 | |
| 309 | /* Temporary storage 2 (p + M->n) <= p + n - 1 */ |
| 310 | n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch); |
| 311 | |
| 312 | MPN_COPY (u0, M.p[1][0], M.n); |
| 313 | MPN_COPY (u1, M.p[1][1], M.n); |
| 314 | un = M.n; |
| 315 | while ( (u0[un-1] | u1[un-1] ) == 0) |
| 316 | un--; |
| 317 | } |
| 318 | else |
| 319 | { |
| 320 | /* mpn_hgcd has failed. Then either one of a or b is very |
| 321 | small, or the difference is very small. Perform one |
| 322 | subtraction followed by one division. */ |
| 323 | u1[0] = 1; |
| 324 | |
| 325 | ctx.u0 = u0; |
| 326 | ctx.u1 = u1; |
| 327 | ctx.tp = tp + n; /* ualloc */ |
| 328 | ctx.un = 1; |
| 329 | |
| 330 | /* Temporary storage n */ |
| 331 | n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp); |
| 332 | if (n == 0) |
| 333 | { |
| 334 | TMP_FREE; |
| 335 | return ctx.gn; |
| 336 | } |
| 337 | |
| 338 | un = ctx.un; |
| 339 | ASSERT (un < ualloc); |
| 340 | } |
| 341 | } |
| 342 | |
| 343 | while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) |
| 344 | { |
| 345 | struct hgcd_matrix M; |
| 346 | mp_size_t p = CHOOSE_P_2 (n); |
| 347 | mp_size_t nn; |
| 348 | |
| 349 | mpn_hgcd_matrix_init (&M, n - p, tp); |
| 350 | nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); |
| 351 | if (nn > 0) |
| 352 | { |
| 353 | mp_ptr t0; |
| 354 | |
| 355 | t0 = tp + matrix_scratch; |
| 356 | ASSERT (M.n <= (n - p - 1)/2); |
| 357 | ASSERT (M.n + p <= (p + n - 1) / 2); |
| 358 | |
| 359 | /* Temporary storage 2 (p + M->n) <= p + n - 1 */ |
| 360 | n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0); |
| 361 | |
| 362 | /* By the same analysis as for mpn_hgcd_matrix_mul */ |
| 363 | ASSERT (M.n + un <= ualloc); |
| 364 | |
| 365 | /* FIXME: This copying could be avoided by some swapping of |
| 366 | * pointers. May need more temporary storage, though. */ |
| 367 | MPN_COPY (t0, u0, un); |
| 368 | |
| 369 | /* Temporary storage ualloc */ |
| 370 | un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un); |
| 371 | |
| 372 | ASSERT (un < ualloc); |
| 373 | ASSERT ( (u0[un-1] | u1[un-1]) > 0); |
| 374 | } |
| 375 | else |
| 376 | { |
| 377 | /* mpn_hgcd has failed. Then either one of a or b is very |
| 378 | small, or the difference is very small. Perform one |
| 379 | subtraction followed by one division. */ |
| 380 | ctx.u0 = u0; |
| 381 | ctx.u1 = u1; |
| 382 | ctx.tp = tp + n; /* ualloc */ |
| 383 | ctx.un = un; |
| 384 | |
| 385 | /* Temporary storage n */ |
| 386 | n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp); |
| 387 | if (n == 0) |
| 388 | { |
| 389 | TMP_FREE; |
| 390 | return ctx.gn; |
| 391 | } |
| 392 | |
| 393 | un = ctx.un; |
| 394 | ASSERT (un < ualloc); |
| 395 | } |
| 396 | } |
| 397 | /* We have A = ... a + ... b |
| 398 | B = u0 a + u1 b |
| 399 | |
| 400 | a = u1 A + ... B |
| 401 | b = -u0 A + ... B |
| 402 | |
| 403 | with bounds |
| 404 | |
| 405 | |u0|, |u1| <= B / min(a, b) |
| 406 | |
| 407 | We always have u1 > 0, and u0 == 0 is possible only if u1 == 1, |
| 408 | in which case the only reduction done so far is a = A - k B for |
| 409 | some k. |
| 410 | |
| 411 | Compute g = u a + v b = (u u1 - v u0) A + (...) B |
| 412 | Here, u, v are bounded by |
| 413 | |
| 414 | |u| <= b, |
| 415 | |v| <= a |
| 416 | */ |
| 417 | |
| 418 | ASSERT ( (ap[n-1] | bp[n-1]) > 0); |
| 419 | |
| 420 | if (UNLIKELY (mpn_cmp (ap, bp, n) == 0)) |
| 421 | { |
| 422 | /* Must return the smallest cofactor, +u1 or -u0 */ |
| 423 | int c; |
| 424 | |
| 425 | MPN_COPY (gp, ap, n); |
| 426 | |
| 427 | MPN_CMP (c, u0, u1, un); |
| 428 | /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in |
| 429 | this case we choose the cofactor + 1, corresponding to G = A |
| 430 | - k B, rather than -1, corresponding to G = - A + (k+1) B. */ |
| 431 | ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1)); |
| 432 | if (c < 0) |
| 433 | { |
| 434 | MPN_NORMALIZE (u0, un); |
| 435 | MPN_COPY (up, u0, un); |
| 436 | *usizep = -un; |
| 437 | } |
| 438 | else |
| 439 | { |
| 440 | MPN_NORMALIZE_NOT_ZERO (u1, un); |
| 441 | MPN_COPY (up, u1, un); |
| 442 | *usizep = un; |
| 443 | } |
| 444 | |
| 445 | TMP_FREE; |
| 446 | return n; |
| 447 | } |
| 448 | else if (UNLIKELY (u0[0] == 0) && un == 1) |
| 449 | { |
| 450 | mp_size_t gn; |
| 451 | ASSERT (u1[0] == 1); |
| 452 | |
| 453 | /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */ |
| 454 | gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp); |
| 455 | |
| 456 | TMP_FREE; |
| 457 | return gn; |
| 458 | } |
| 459 | else |
| 460 | { |
| 461 | mp_size_t u0n; |
| 462 | mp_size_t u1n; |
| 463 | mp_size_t lehmer_un; |
| 464 | mp_size_t lehmer_vn; |
| 465 | mp_size_t gn; |
| 466 | |
| 467 | mp_ptr lehmer_up; |
| 468 | mp_ptr lehmer_vp; |
| 469 | int negate; |
| 470 | |
| 471 | lehmer_up = tp; tp += n; |
| 472 | |
| 473 | /* Call mpn_gcdext_lehmer_n with copies of a and b. */ |
| 474 | MPN_COPY (tp, ap, n); |
| 475 | MPN_COPY (tp + n, bp, n); |
| 476 | gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n); |
| 477 | |
| 478 | u0n = un; |
| 479 | MPN_NORMALIZE (u0, u0n); |
| 480 | ASSERT (u0n > 0); |
| 481 | |
| 482 | if (lehmer_un == 0) |
| 483 | { |
| 484 | /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */ |
| 485 | MPN_COPY (up, u0, u0n); |
| 486 | *usizep = -u0n; |
| 487 | |
| 488 | TMP_FREE; |
| 489 | return gn; |
| 490 | } |
| 491 | |
| 492 | lehmer_vp = tp; |
| 493 | /* Compute v = (g - u a) / b */ |
| 494 | lehmer_vn = compute_v (lehmer_vp, |
| 495 | ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1); |
| 496 | |
| 497 | if (lehmer_un > 0) |
| 498 | negate = 0; |
| 499 | else |
| 500 | { |
| 501 | lehmer_un = -lehmer_un; |
| 502 | negate = 1; |
| 503 | } |
| 504 | |
| 505 | u1n = un; |
| 506 | MPN_NORMALIZE (u1, u1n); |
| 507 | ASSERT (u1n > 0); |
| 508 | |
| 509 | ASSERT (lehmer_un + u1n <= ualloc); |
| 510 | ASSERT (lehmer_vn + u0n <= ualloc); |
| 511 | |
| 512 | /* We may still have v == 0 */ |
| 513 | |
| 514 | /* Compute u u0 */ |
| 515 | if (lehmer_un <= u1n) |
| 516 | /* Should be the common case */ |
| 517 | mpn_mul (up, u1, u1n, lehmer_up, lehmer_un); |
| 518 | else |
| 519 | mpn_mul (up, lehmer_up, lehmer_un, u1, u1n); |
| 520 | |
| 521 | un = u1n + lehmer_un; |
| 522 | un -= (up[un - 1] == 0); |
| 523 | |
| 524 | if (lehmer_vn > 0) |
| 525 | { |
| 526 | mp_limb_t cy; |
| 527 | |
| 528 | /* Overwrites old u1 value */ |
| 529 | if (lehmer_vn <= u0n) |
| 530 | /* Should be the common case */ |
| 531 | mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn); |
| 532 | else |
| 533 | mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n); |
| 534 | |
| 535 | u1n = u0n + lehmer_vn; |
| 536 | u1n -= (u1[u1n - 1] == 0); |
| 537 | |
| 538 | if (u1n <= un) |
| 539 | { |
| 540 | cy = mpn_add (up, up, un, u1, u1n); |
| 541 | } |
| 542 | else |
| 543 | { |
| 544 | cy = mpn_add (up, u1, u1n, up, un); |
| 545 | un = u1n; |
| 546 | } |
| 547 | up[un] = cy; |
| 548 | un += (cy != 0); |
| 549 | |
| 550 | ASSERT (un < ualloc); |
| 551 | } |
| 552 | *usizep = negate ? -un : un; |
| 553 | |
| 554 | TMP_FREE; |
| 555 | return gn; |
| 556 | } |
| 557 | } |