Austin Schuh | bb1338c | 2024-06-15 19:31:16 -0700 | [diff] [blame] | 1 | /* mpz_bin_uiui - compute n over k. |
| 2 | |
| 3 | Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. |
| 4 | |
| 5 | Copyright 2010-2012, 2015-2018 Free Software Foundation, Inc. |
| 6 | |
| 7 | This file is part of the GNU MP Library. |
| 8 | |
| 9 | The GNU MP Library is free software; you can redistribute it and/or modify |
| 10 | it under the terms of either: |
| 11 | |
| 12 | * the GNU Lesser General Public License as published by the Free |
| 13 | Software Foundation; either version 3 of the License, or (at your |
| 14 | option) any later version. |
| 15 | |
| 16 | or |
| 17 | |
| 18 | * the GNU General Public License as published by the Free Software |
| 19 | Foundation; either version 2 of the License, or (at your option) any |
| 20 | later version. |
| 21 | |
| 22 | or both in parallel, as here. |
| 23 | |
| 24 | The GNU MP Library is distributed in the hope that it will be useful, but |
| 25 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| 26 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 27 | for more details. |
| 28 | |
| 29 | You should have received copies of the GNU General Public License and the |
| 30 | GNU Lesser General Public License along with the GNU MP Library. If not, |
| 31 | see https://www.gnu.org/licenses/. */ |
| 32 | |
| 33 | #include "gmp-impl.h" |
| 34 | #include "longlong.h" |
| 35 | |
| 36 | #ifndef BIN_GOETGHELUCK_THRESHOLD |
| 37 | #define BIN_GOETGHELUCK_THRESHOLD 512 |
| 38 | #endif |
| 39 | #ifndef BIN_UIUI_ENABLE_SMALLDC |
| 40 | #define BIN_UIUI_ENABLE_SMALLDC 1 |
| 41 | #endif |
| 42 | #ifndef BIN_UIUI_RECURSIVE_SMALLDC |
| 43 | #define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32) |
| 44 | #endif |
| 45 | |
| 46 | /* Algorithm: |
| 47 | |
| 48 | Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8) |
| 49 | which are then accumulated into mpn numbers. The first inner loop |
| 50 | accumulates divisor factors, the 2nd inner loop accumulates exactly the same |
| 51 | number of dividend factors. We avoid accumulating more for the divisor, |
| 52 | even with its smaller factors, since we else cannot guarantee divisibility. |
| 53 | |
| 54 | Since we know each division will yield an integer, we compute the quotient |
| 55 | using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod |
| 56 | 2^t. |
| 57 | |
| 58 | Improvements: |
| 59 | |
| 60 | (1) An obvious improvement to this code would be to compute mod 2^t |
| 61 | everywhere. Unfortunately, we cannot determine t beforehand, unless we |
| 62 | invoke some approximation, such as Stirling's formula. Of course, we don't |
| 63 | need t to be tight. However, it is not clear that this would help much, |
| 64 | our numbers are kept reasonably small already. |
| 65 | |
| 66 | (2) Compute nmax/kmax semi-accurately, without scalar division or a loop. |
| 67 | Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index, |
| 68 | would make it both reasonably accurate and fast. (We could use a table |
| 69 | stored into a limb, perhaps.) The table should take the removed factors of |
| 70 | 2 into account (those done on-the-fly in mulN). |
| 71 | |
| 72 | (3) The first time in the loop we compute the odd part of a |
| 73 | factorial in kp, we might use oddfac_1 for this task. |
| 74 | */ |
| 75 | |
| 76 | /* This threshold determines how large divisor to accumulate before we call |
| 77 | bdiv. Perhaps we should never call bdiv, and accumulate all we are told, |
| 78 | since we are just basecase code anyway? Presumably, this depends on the |
| 79 | relative speed of the asymptotically fast code and this code. */ |
| 80 | #define SOME_THRESHOLD 20 |
| 81 | |
| 82 | /* Multiply-into-limb functions. These remove factors of 2 on-the-fly. FIXME: |
| 83 | All versions of MAXFACS don't take this 2 removal into account now, meaning |
| 84 | that then, shifting just adds some overhead. (We remove factors from the |
| 85 | completed limb anyway.) */ |
| 86 | |
| 87 | static mp_limb_t |
| 88 | mul1 (mp_limb_t m) |
| 89 | { |
| 90 | return m; |
| 91 | } |
| 92 | |
| 93 | static mp_limb_t |
| 94 | mul2 (mp_limb_t m) |
| 95 | { |
| 96 | /* We need to shift before multiplying, to avoid an overflow. */ |
| 97 | mp_limb_t m01 = (m | 1) * ((m + 1) >> 1); |
| 98 | return m01; |
| 99 | } |
| 100 | |
| 101 | static mp_limb_t |
| 102 | mul3 (mp_limb_t m) |
| 103 | { |
| 104 | mp_limb_t m01 = (m + 0) * (m + 1) >> 1; |
| 105 | mp_limb_t m2 = (m + 2); |
| 106 | return m01 * m2; |
| 107 | } |
| 108 | |
| 109 | static mp_limb_t |
| 110 | mul4 (mp_limb_t m) |
| 111 | { |
| 112 | mp_limb_t m03 = (m + 0) * (m + 3) >> 1; |
| 113 | return m03 * (m03 + 1); /* mul2 (m03) ? */ |
| 114 | } |
| 115 | |
| 116 | static mp_limb_t |
| 117 | mul5 (mp_limb_t m) |
| 118 | { |
| 119 | mp_limb_t m03 = (m + 0) * (m + 3) >> 1; |
| 120 | mp_limb_t m034 = m03 * (m + 4); |
| 121 | return (m03 + 1) * m034; |
| 122 | } |
| 123 | |
| 124 | static mp_limb_t |
| 125 | mul6 (mp_limb_t m) |
| 126 | { |
| 127 | mp_limb_t m05 = (m + 0) * (m + 5); |
| 128 | mp_limb_t m1234 = (m05 + 5) * (m05 + 5) >> 3; |
| 129 | return m1234 * (m05 >> 1); |
| 130 | } |
| 131 | |
| 132 | static mp_limb_t |
| 133 | mul7 (mp_limb_t m) |
| 134 | { |
| 135 | mp_limb_t m05 = (m + 0) * (m + 5); |
| 136 | mp_limb_t m1234 = (m05 + 5) * (m05 + 5) >> 3; |
| 137 | mp_limb_t m056 = m05 * (m + 6) >> 1; |
| 138 | return m1234 * m056; |
| 139 | } |
| 140 | |
| 141 | static mp_limb_t |
| 142 | mul8 (mp_limb_t m) |
| 143 | { |
| 144 | mp_limb_t m07 = (m + 0) * (m + 7); |
| 145 | mp_limb_t m0257 = m07 * (m07 + 10) >> 3; |
| 146 | mp_limb_t m1346 = m07 + 9 + m0257; |
| 147 | return m0257 * m1346; |
| 148 | } |
| 149 | |
| 150 | /* |
| 151 | static mp_limb_t |
| 152 | mul9 (mp_limb_t m) |
| 153 | { |
| 154 | return (m + 8) * mul8 (m) ; |
| 155 | } |
| 156 | |
| 157 | static mp_limb_t |
| 158 | mul10 (mp_limb_t m) |
| 159 | { |
| 160 | mp_limb_t m09 = (m + 0) * (m + 9); |
| 161 | mp_limb_t m18 = (m09 >> 1) + 4; |
| 162 | mp_limb_t m0369 = m09 * (m09 + 18) >> 3; |
| 163 | mp_limb_t m2457 = m09 * 2 + 35 + m0369; |
| 164 | return ((m0369 * m2457) >> 1) * m18; |
| 165 | } |
| 166 | */ |
| 167 | |
| 168 | typedef mp_limb_t (* mulfunc_t) (mp_limb_t); |
| 169 | |
| 170 | static const mulfunc_t mulfunc[] = {mul1,mul2,mul3,mul4,mul5,mul6,mul7,mul8 /* ,mul9,mul10 */}; |
| 171 | #define M (numberof(mulfunc)) |
| 172 | |
| 173 | /* Number of factors-of-2 removed by the corresponding mulN function. */ |
| 174 | static const unsigned char tcnttab[] = {0, 1, 1, 2, 2, 4, 4, 6 /*,6,8*/}; |
| 175 | |
| 176 | #if 1 |
| 177 | /* This variant is inaccurate but share the code with other functions. */ |
| 178 | #define MAXFACS(max,l) \ |
| 179 | do { \ |
| 180 | (max) = log_n_max (l); \ |
| 181 | } while (0) |
| 182 | #else |
| 183 | |
| 184 | /* This variant is exact(?) but uses a loop. It takes the 2 removal |
| 185 | of mulN into account. */ |
| 186 | static const unsigned long ftab[] = |
| 187 | #if GMP_NUMB_BITS == 64 |
| 188 | /* 1 to 8 factors per iteration */ |
| 189 | {CNST_LIMB(0xffffffffffffffff),CNST_LIMB(0x16a09e667),0x32cbfc,0x16a08,0x24c0,0xa11,0x345,0x1ab /*,0xe9,0x8e */}; |
| 190 | #endif |
| 191 | #if GMP_NUMB_BITS == 32 |
| 192 | /* 1 to 7 factors per iteration */ |
| 193 | {0xffffffff,0x16a09,0x7ff,0x168,0x6f,0x3d,0x20 /* ,0x17 */}; |
| 194 | #endif |
| 195 | |
| 196 | #define MAXFACS(max,l) \ |
| 197 | do { \ |
| 198 | int __i; \ |
| 199 | for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--) \ |
| 200 | ; \ |
| 201 | (max) = __i + 1; \ |
| 202 | } while (0) |
| 203 | #endif |
| 204 | |
| 205 | /* Entry i contains (i!/2^t)^(-1) where t is chosen such that the parenthesis |
| 206 | is an odd integer. */ |
| 207 | static const mp_limb_t facinv[] = { ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE }; |
| 208 | |
| 209 | static void |
| 210 | mpz_bdiv_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k) |
| 211 | { |
| 212 | unsigned nmax, kmax, nmaxnow, numfac; |
| 213 | mp_ptr np, kp; |
| 214 | mp_size_t nn, kn, alloc; |
| 215 | mp_limb_t i, j, t, iii, jjj, cy, dinv; |
| 216 | int cnt; |
| 217 | mp_size_t maxn; |
| 218 | TMP_DECL; |
| 219 | |
| 220 | ASSERT (k > ODD_FACTORIAL_TABLE_LIMIT); |
| 221 | TMP_MARK; |
| 222 | |
| 223 | maxn = 1 + n / GMP_NUMB_BITS; /* absolutely largest result size (limbs) */ |
| 224 | |
| 225 | /* FIXME: This allocation might be insufficient, but is usually way too |
| 226 | large. */ |
| 227 | alloc = SOME_THRESHOLD - 1 + MAX (3 * maxn / 2, SOME_THRESHOLD); |
| 228 | alloc = MIN (alloc, (mp_size_t) k) + 1; |
| 229 | TMP_ALLOC_LIMBS_2 (np, alloc, kp, SOME_THRESHOLD + 1); |
| 230 | |
| 231 | MAXFACS (nmax, n); |
| 232 | ASSERT (nmax <= M); |
| 233 | MAXFACS (kmax, k); |
| 234 | ASSERT (kmax <= M); |
| 235 | ASSERT (k >= M); |
| 236 | |
| 237 | i = n - k + 1; |
| 238 | |
| 239 | np[0] = 1; nn = 1; |
| 240 | |
| 241 | numfac = 1; |
| 242 | j = ODD_FACTORIAL_TABLE_LIMIT + 1; |
| 243 | jjj = ODD_FACTORIAL_TABLE_MAX; |
| 244 | ASSERT (__gmp_oddfac_table[ODD_FACTORIAL_TABLE_LIMIT] == ODD_FACTORIAL_TABLE_MAX); |
| 245 | |
| 246 | while (1) |
| 247 | { |
| 248 | kp[0] = jjj; /* store new factors */ |
| 249 | kn = 1; |
| 250 | t = k - j + 1; |
| 251 | kmax = MIN (kmax, t); |
| 252 | |
| 253 | while (kmax != 0 && kn < SOME_THRESHOLD) |
| 254 | { |
| 255 | jjj = mulfunc[kmax - 1] (j); |
| 256 | j += kmax; /* number of factors used */ |
| 257 | count_trailing_zeros (cnt, jjj); /* count low zeros */ |
| 258 | jjj >>= cnt; /* remove remaining low zeros */ |
| 259 | cy = mpn_mul_1 (kp, kp, kn, jjj); /* accumulate new factors */ |
| 260 | kp[kn] = cy; |
| 261 | kn += cy != 0; |
| 262 | t = k - j + 1; |
| 263 | kmax = MIN (kmax, t); |
| 264 | } |
| 265 | numfac = j - numfac; |
| 266 | |
| 267 | while (numfac != 0) |
| 268 | { |
| 269 | nmaxnow = MIN (nmax, numfac); |
| 270 | iii = mulfunc[nmaxnow - 1] (i); |
| 271 | i += nmaxnow; /* number of factors used */ |
| 272 | count_trailing_zeros (cnt, iii); /* count low zeros */ |
| 273 | iii >>= cnt; /* remove remaining low zeros */ |
| 274 | cy = mpn_mul_1 (np, np, nn, iii); /* accumulate new factors */ |
| 275 | np[nn] = cy; |
| 276 | nn += cy != 0; |
| 277 | numfac -= nmaxnow; |
| 278 | } |
| 279 | |
| 280 | ASSERT (nn < alloc); |
| 281 | |
| 282 | binvert_limb (dinv, kp[0]); |
| 283 | nn += (np[nn - 1] >= kp[kn - 1]); |
| 284 | nn -= kn; |
| 285 | mpn_sbpi1_bdiv_q (np, np, nn, kp, MIN(kn,nn), -dinv); |
| 286 | mpn_neg (np, np, nn); |
| 287 | |
| 288 | if (kmax == 0) |
| 289 | break; |
| 290 | numfac = j; |
| 291 | |
| 292 | jjj = mulfunc[kmax - 1] (j); |
| 293 | j += kmax; /* number of factors used */ |
| 294 | count_trailing_zeros (cnt, jjj); /* count low zeros */ |
| 295 | jjj >>= cnt; /* remove remaining low zeros */ |
| 296 | } |
| 297 | |
| 298 | /* Put back the right number of factors of 2. */ |
| 299 | popc_limb (cnt, n - k); |
| 300 | popc_limb (j, k); |
| 301 | cnt += j; |
| 302 | popc_limb (j, n); |
| 303 | cnt -= j; |
| 304 | if (cnt != 0) |
| 305 | { |
| 306 | ASSERT (cnt < GMP_NUMB_BITS); /* can happen, but not for intended use */ |
| 307 | cy = mpn_lshift (np, np, nn, cnt); |
| 308 | np[nn] = cy; |
| 309 | nn += cy != 0; |
| 310 | } |
| 311 | |
| 312 | nn -= np[nn - 1] == 0; /* normalisation */ |
| 313 | |
| 314 | kp = MPZ_NEWALLOC (r, nn); |
| 315 | SIZ(r) = nn; |
| 316 | MPN_COPY (kp, np, nn); |
| 317 | TMP_FREE; |
| 318 | } |
| 319 | |
| 320 | static void |
| 321 | mpz_smallk_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k) |
| 322 | { |
| 323 | unsigned nmax, numfac; |
| 324 | mp_ptr rp; |
| 325 | mp_size_t rn, alloc; |
| 326 | mp_limb_t i, iii, cy; |
| 327 | unsigned i2cnt, cnt; |
| 328 | |
| 329 | MAXFACS (nmax, n); |
| 330 | nmax = MIN (nmax, M); |
| 331 | |
| 332 | i = n - k + 1; |
| 333 | |
| 334 | i2cnt = __gmp_fac2cnt_table[k / 2 - 1]; /* low zeros count */ |
| 335 | if (nmax >= k) |
| 336 | { |
| 337 | MPZ_NEWALLOC (r, 1) [0] = mulfunc[k - 1] (i) * facinv[k - 2] >> |
| 338 | (i2cnt - tcnttab[k - 1]); |
| 339 | SIZ(r) = 1; |
| 340 | return; |
| 341 | } |
| 342 | |
| 343 | count_leading_zeros (cnt, (mp_limb_t) n); |
| 344 | cnt = GMP_LIMB_BITS - cnt; |
| 345 | alloc = cnt * k / GMP_NUMB_BITS + 3; /* FIXME: ensure rounding is enough. */ |
| 346 | rp = MPZ_NEWALLOC (r, alloc); |
| 347 | |
| 348 | rp[0] = mulfunc[nmax - 1] (i); |
| 349 | rn = 1; |
| 350 | i += nmax; /* number of factors used */ |
| 351 | i2cnt -= tcnttab[nmax - 1]; /* low zeros count */ |
| 352 | numfac = k - nmax; |
| 353 | do |
| 354 | { |
| 355 | nmax = MIN (nmax, numfac); |
| 356 | iii = mulfunc[nmax - 1] (i); |
| 357 | i += nmax; /* number of factors used */ |
| 358 | i2cnt -= tcnttab[nmax - 1]; /* update low zeros count */ |
| 359 | cy = mpn_mul_1 (rp, rp, rn, iii); /* accumulate new factors */ |
| 360 | rp[rn] = cy; |
| 361 | rn += cy != 0; |
| 362 | numfac -= nmax; |
| 363 | } while (numfac != 0); |
| 364 | |
| 365 | ASSERT (rn < alloc); |
| 366 | |
| 367 | mpn_pi1_bdiv_q_1 (rp, rp, rn, __gmp_oddfac_table[k], facinv[k - 2], i2cnt); |
| 368 | /* A two-fold, branch-free normalisation is possible :*/ |
| 369 | /* rn -= rp[rn - 1] == 0; */ |
| 370 | /* rn -= rp[rn - 1] == 0; */ |
| 371 | MPN_NORMALIZE_NOT_ZERO (rp, rn); |
| 372 | |
| 373 | SIZ(r) = rn; |
| 374 | } |
| 375 | |
| 376 | /* Algorithm: |
| 377 | |
| 378 | Plain and simply multiply things together. |
| 379 | |
| 380 | We tabulate factorials (k!/2^t)^(-1) mod B (where t is chosen such |
| 381 | that k!/2^t is odd). |
| 382 | |
| 383 | */ |
| 384 | |
| 385 | static mp_limb_t |
| 386 | bc_bin_uiui (unsigned int n, unsigned int k) |
| 387 | { |
| 388 | return ((__gmp_oddfac_table[n] * facinv[k - 2] * facinv[n - k - 2]) |
| 389 | << (__gmp_fac2cnt_table[n / 2 - 1] - __gmp_fac2cnt_table[k / 2 - 1] - __gmp_fac2cnt_table[(n-k) / 2 - 1])) |
| 390 | & GMP_NUMB_MASK; |
| 391 | } |
| 392 | |
| 393 | /* Algorithm: |
| 394 | |
| 395 | Recursively exploit the relation |
| 396 | bin(n,k) = bin(n,k>>1)*bin(n-k>>1,k-k>>1)/bin(k,k>>1) . |
| 397 | |
| 398 | Values for binomial(k,k>>1) that fit in a limb are precomputed |
| 399 | (with inverses). |
| 400 | */ |
| 401 | |
| 402 | /* bin2kk[i - ODD_CENTRAL_BINOMIAL_OFFSET] = |
| 403 | binomial(i*2,i)/2^t (where t is chosen so that it is odd). */ |
| 404 | static const mp_limb_t bin2kk[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE }; |
| 405 | |
| 406 | /* bin2kkinv[i] = bin2kk[i]^-1 mod B */ |
| 407 | static const mp_limb_t bin2kkinv[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE }; |
| 408 | |
| 409 | /* bin2kk[i] = binomial((i+MIN_S)*2,i+MIN_S)/2^t. This table contains the t values. */ |
| 410 | static const unsigned char fac2bin[] = { CENTRAL_BINOMIAL_2FAC_TABLE }; |
| 411 | |
| 412 | static void |
| 413 | mpz_smallkdc_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k) |
| 414 | { |
| 415 | mp_ptr rp; |
| 416 | mp_size_t rn; |
| 417 | unsigned long int hk; |
| 418 | |
| 419 | hk = k >> 1; |
| 420 | |
| 421 | if ((! BIN_UIUI_RECURSIVE_SMALLDC) || hk <= ODD_FACTORIAL_TABLE_LIMIT) |
| 422 | mpz_smallk_bin_uiui (r, n, hk); |
| 423 | else |
| 424 | mpz_smallkdc_bin_uiui (r, n, hk); |
| 425 | k -= hk; |
| 426 | n -= hk; |
| 427 | if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { |
| 428 | mp_limb_t cy; |
| 429 | rn = SIZ (r); |
| 430 | rp = MPZ_REALLOC (r, rn + 1); |
| 431 | cy = mpn_mul_1 (rp, rp, rn, bc_bin_uiui (n, k)); |
| 432 | rp [rn] = cy; |
| 433 | rn += cy != 0; |
| 434 | } else { |
| 435 | mp_limb_t buffer[ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3]; |
| 436 | mpz_t t; |
| 437 | |
| 438 | ALLOC (t) = ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3; |
| 439 | PTR (t) = buffer; |
| 440 | if ((! BIN_UIUI_RECURSIVE_SMALLDC) || k <= ODD_FACTORIAL_TABLE_LIMIT) |
| 441 | mpz_smallk_bin_uiui (t, n, k); |
| 442 | else |
| 443 | mpz_smallkdc_bin_uiui (t, n, k); |
| 444 | mpz_mul (r, r, t); |
| 445 | rp = PTR (r); |
| 446 | rn = SIZ (r); |
| 447 | } |
| 448 | |
| 449 | mpn_pi1_bdiv_q_1 (rp, rp, rn, bin2kk[k - ODD_CENTRAL_BINOMIAL_OFFSET], |
| 450 | bin2kkinv[k - ODD_CENTRAL_BINOMIAL_OFFSET], |
| 451 | fac2bin[k - ODD_CENTRAL_BINOMIAL_OFFSET] - (k != hk)); |
| 452 | /* A two-fold, branch-free normalisation is possible :*/ |
| 453 | /* rn -= rp[rn - 1] == 0; */ |
| 454 | /* rn -= rp[rn - 1] == 0; */ |
| 455 | MPN_NORMALIZE_NOT_ZERO (rp, rn); |
| 456 | |
| 457 | SIZ(r) = rn; |
| 458 | } |
| 459 | |
| 460 | /* mpz_goetgheluck_bin_uiui(RESULT, N, K) -- Set RESULT to binomial(N,K). |
| 461 | * |
| 462 | * Contributed to the GNU project by Marco Bodrato. |
| 463 | * |
| 464 | * Implementation of the algorithm by P. Goetgheluck, "Computing |
| 465 | * Binomial Coefficients", The American Mathematical Monthly, Vol. 94, |
| 466 | * No. 4 (April 1987), pp. 360-365. |
| 467 | * |
| 468 | * Acknowledgment: Peter Luschny did spot the slowness of the previous |
| 469 | * code and suggested the reference. |
| 470 | */ |
| 471 | |
| 472 | /* TODO: Remove duplicated constants / macros / static functions... |
| 473 | */ |
| 474 | |
| 475 | /*************************************************************/ |
| 476 | /* Section macros: common macros, for swing/fac/bin (&sieve) */ |
| 477 | /*************************************************************/ |
| 478 | |
| 479 | #define FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I) \ |
| 480 | if ((PR) > (MAX_PR)) { \ |
| 481 | (VEC)[(I)++] = (PR); \ |
| 482 | (PR) = 1; \ |
| 483 | } |
| 484 | |
| 485 | #define FACTOR_LIST_STORE(P, PR, MAX_PR, VEC, I) \ |
| 486 | do { \ |
| 487 | if ((PR) > (MAX_PR)) { \ |
| 488 | (VEC)[(I)++] = (PR); \ |
| 489 | (PR) = (P); \ |
| 490 | } else \ |
| 491 | (PR) *= (P); \ |
| 492 | } while (0) |
| 493 | |
| 494 | #define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve) \ |
| 495 | __max_i = (end); \ |
| 496 | \ |
| 497 | do { \ |
| 498 | ++__i; \ |
| 499 | if (((sieve)[__index] & __mask) == 0) \ |
| 500 | { \ |
| 501 | mp_limb_t prime; \ |
| 502 | prime = id_to_n(__i) |
| 503 | |
| 504 | #define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve) \ |
| 505 | do { \ |
| 506 | mp_limb_t __mask, __index, __max_i, __i; \ |
| 507 | \ |
| 508 | __i = (start)-(off); \ |
| 509 | __index = __i / GMP_LIMB_BITS; \ |
| 510 | __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS); \ |
| 511 | __i += (off); \ |
| 512 | \ |
| 513 | LOOP_ON_SIEVE_CONTINUE(prime,end,sieve) |
| 514 | |
| 515 | #define LOOP_ON_SIEVE_STOP \ |
| 516 | } \ |
| 517 | __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1); \ |
| 518 | __index += __mask & 1; \ |
| 519 | } while (__i <= __max_i) |
| 520 | |
| 521 | #define LOOP_ON_SIEVE_END \ |
| 522 | LOOP_ON_SIEVE_STOP; \ |
| 523 | } while (0) |
| 524 | |
| 525 | /*********************************************************/ |
| 526 | /* Section sieve: sieving functions and tools for primes */ |
| 527 | /*********************************************************/ |
| 528 | |
| 529 | #if WANT_ASSERT |
| 530 | static mp_limb_t |
| 531 | bit_to_n (mp_limb_t bit) { return (bit*3+4)|1; } |
| 532 | #endif |
| 533 | |
| 534 | /* id_to_n (x) = bit_to_n (x-1) = (id*3+1)|1*/ |
| 535 | static mp_limb_t |
| 536 | id_to_n (mp_limb_t id) { return id*3+1+(id&1); } |
| 537 | |
| 538 | /* n_to_bit (n) = ((n-1)&(-CNST_LIMB(2)))/3U-1 */ |
| 539 | static mp_limb_t |
| 540 | n_to_bit (mp_limb_t n) { return ((n-5)|1)/3U; } |
| 541 | |
| 542 | static mp_size_t |
| 543 | primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; } |
| 544 | |
| 545 | /*********************************************************/ |
| 546 | /* Section binomial: fast binomial implementation */ |
| 547 | /*********************************************************/ |
| 548 | |
| 549 | #define COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I) \ |
| 550 | do { \ |
| 551 | mp_limb_t __a, __b, __prime, __ma,__mb; \ |
| 552 | __prime = (P); \ |
| 553 | __a = (N); __b = (K); __mb = 0; \ |
| 554 | FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I); \ |
| 555 | do { \ |
| 556 | __mb += __b % __prime; __b /= __prime; \ |
| 557 | __ma = __a % __prime; __a /= __prime; \ |
| 558 | if (__ma < __mb) { \ |
| 559 | __mb = 1; (PR) *= __prime; \ |
| 560 | } else __mb = 0; \ |
| 561 | } while (__a >= __prime); \ |
| 562 | } while (0) |
| 563 | |
| 564 | #define SH_COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I) \ |
| 565 | do { \ |
| 566 | mp_limb_t __prime; \ |
| 567 | __prime = (P); \ |
| 568 | if (((N) % __prime) < ((K) % __prime)) { \ |
| 569 | FACTOR_LIST_STORE (__prime, PR, MAX_PR, VEC, I); \ |
| 570 | } \ |
| 571 | } while (0) |
| 572 | |
| 573 | /* Returns an approximation of the sqare root of x. |
| 574 | * It gives: |
| 575 | * limb_apprsqrt (x) ^ 2 <= x < (limb_apprsqrt (x)+1) ^ 2 |
| 576 | * or |
| 577 | * x <= limb_apprsqrt (x) ^ 2 <= x * 9/8 |
| 578 | */ |
| 579 | static mp_limb_t |
| 580 | limb_apprsqrt (mp_limb_t x) |
| 581 | { |
| 582 | int s; |
| 583 | |
| 584 | ASSERT (x > 2); |
| 585 | count_leading_zeros (s, x); |
| 586 | s = (GMP_LIMB_BITS - s) >> 1; |
| 587 | return ((CNST_LIMB(1) << s) + (x >> s)) >> 1; |
| 588 | } |
| 589 | |
| 590 | static void |
| 591 | mpz_goetgheluck_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k) |
| 592 | { |
| 593 | mp_limb_t *sieve, *factors, count; |
| 594 | mp_limb_t prod, max_prod; |
| 595 | mp_size_t j; |
| 596 | TMP_DECL; |
| 597 | |
| 598 | ASSERT (BIN_GOETGHELUCK_THRESHOLD >= 13); |
| 599 | ASSERT (n >= 25); |
| 600 | |
| 601 | TMP_MARK; |
| 602 | sieve = TMP_ALLOC_LIMBS (primesieve_size (n)); |
| 603 | |
| 604 | count = gmp_primesieve (sieve, n) + 1; |
| 605 | factors = TMP_ALLOC_LIMBS (count / log_n_max (n) + 1); |
| 606 | |
| 607 | max_prod = GMP_NUMB_MAX / n; |
| 608 | |
| 609 | /* Handle primes = 2, 3 separately. */ |
| 610 | popc_limb (count, n - k); |
| 611 | popc_limb (j, k); |
| 612 | count += j; |
| 613 | popc_limb (j, n); |
| 614 | count -= j; |
| 615 | prod = CNST_LIMB(1) << count; |
| 616 | |
| 617 | j = 0; |
| 618 | COUNT_A_PRIME (3, n, k, prod, max_prod, factors, j); |
| 619 | |
| 620 | /* Accumulate prime factors from 5 to n/2 */ |
| 621 | { |
| 622 | mp_limb_t s; |
| 623 | |
| 624 | s = limb_apprsqrt(n); |
| 625 | s = n_to_bit (s); |
| 626 | ASSERT (bit_to_n (s+1) * bit_to_n (s+1) > n); |
| 627 | ASSERT (s <= n_to_bit (n >> 1)); |
| 628 | LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (5), s, 0,sieve); |
| 629 | COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j); |
| 630 | LOOP_ON_SIEVE_STOP; |
| 631 | |
| 632 | ASSERT (max_prod <= GMP_NUMB_MAX / 2); |
| 633 | max_prod <<= 1; |
| 634 | |
| 635 | LOOP_ON_SIEVE_CONTINUE (prime, n_to_bit (n >> 1),sieve); |
| 636 | SH_COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j); |
| 637 | LOOP_ON_SIEVE_END; |
| 638 | |
| 639 | max_prod >>= 1; |
| 640 | } |
| 641 | |
| 642 | /* Store primes from (n-k)+1 to n */ |
| 643 | ASSERT (n_to_bit (n - k) < n_to_bit (n)); |
| 644 | |
| 645 | LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (n - k) + 1, n_to_bit (n), 0,sieve); |
| 646 | FACTOR_LIST_STORE (prime, prod, max_prod, factors, j); |
| 647 | LOOP_ON_SIEVE_END; |
| 648 | |
| 649 | if (LIKELY (j != 0)) |
| 650 | { |
| 651 | factors[j++] = prod; |
| 652 | mpz_prodlimbs (r, factors, j); |
| 653 | } |
| 654 | else |
| 655 | { |
| 656 | MPZ_NEWALLOC (r, 1)[0] = prod; |
| 657 | SIZ (r) = 1; |
| 658 | } |
| 659 | TMP_FREE; |
| 660 | } |
| 661 | |
| 662 | #undef COUNT_A_PRIME |
| 663 | #undef SH_COUNT_A_PRIME |
| 664 | #undef LOOP_ON_SIEVE_END |
| 665 | #undef LOOP_ON_SIEVE_STOP |
| 666 | #undef LOOP_ON_SIEVE_BEGIN |
| 667 | #undef LOOP_ON_SIEVE_CONTINUE |
| 668 | |
| 669 | /*********************************************************/ |
| 670 | /* End of implementation of Goetgheluck's algorithm */ |
| 671 | /*********************************************************/ |
| 672 | |
| 673 | void |
| 674 | mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k) |
| 675 | { |
| 676 | if (UNLIKELY (n < k)) { |
| 677 | SIZ (r) = 0; |
| 678 | #if BITS_PER_ULONG > GMP_NUMB_BITS |
| 679 | } else if (UNLIKELY (n > GMP_NUMB_MAX)) { |
| 680 | mpz_t tmp; |
| 681 | |
| 682 | mpz_init_set_ui (tmp, n); |
| 683 | mpz_bin_ui (r, tmp, k); |
| 684 | mpz_clear (tmp); |
| 685 | #endif |
| 686 | } else { |
| 687 | ASSERT (n <= GMP_NUMB_MAX); |
| 688 | /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */ |
| 689 | k = MIN (k, n - k); |
| 690 | if (k < 2) { |
| 691 | MPZ_NEWALLOC (r, 1)[0] = k ? n : 1; /* 1 + ((-k) & (n-1)); */ |
| 692 | SIZ(r) = 1; |
| 693 | } else if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { /* k >= 2, n >= 4 */ |
| 694 | MPZ_NEWALLOC (r, 1)[0] = bc_bin_uiui (n, k); |
| 695 | SIZ(r) = 1; |
| 696 | } else if (k <= ODD_FACTORIAL_TABLE_LIMIT) |
| 697 | mpz_smallk_bin_uiui (r, n, k); |
| 698 | else if (BIN_UIUI_ENABLE_SMALLDC && |
| 699 | k <= (BIN_UIUI_RECURSIVE_SMALLDC ? ODD_CENTRAL_BINOMIAL_TABLE_LIMIT : ODD_FACTORIAL_TABLE_LIMIT)* 2) |
| 700 | mpz_smallkdc_bin_uiui (r, n, k); |
| 701 | else if (ABOVE_THRESHOLD (k, BIN_GOETGHELUCK_THRESHOLD) && |
| 702 | k > (n >> 4)) /* k > ODD_FACTORIAL_TABLE_LIMIT */ |
| 703 | mpz_goetgheluck_bin_uiui (r, n, k); |
| 704 | else |
| 705 | mpz_bdiv_bin_uiui (r, n, k); |
| 706 | } |
| 707 | } |