/* mpn_invertappr and helper functions.  Compute I such that
   floor((B^{2n}-1)/U) - 1 <= I + B^n <= floor((B^{2n}-1)/U).

   Contributed to the GNU project by Marco Bodrato.

   The algorithm used here was inspired by ApproximateReciprocal from "Modern
   Computer Arithmetic", by Richard P. Brent and Paul Zimmermann.  Special
   thanks to Paul Zimmermann for his very valuable suggestions on all the
   theoretical aspects during the work on this code.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

   Copyright (C) 2007, 2009, 2010, 2012, 2015, 2016 Free Software
   Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free Software
       Foundation; either version 2 of the License, or (at your option) any
       later version.

   or both in parallel, as here.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received copies of the GNU General Public License and the
   GNU Lesser General Public License along with the GNU MP Library.  If not,
   see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

/* FIXME: The iterative version splits the operand into two slightly
   unbalanced parts; the use of log_2 (or counting the bits) underestimates
   the maximum number of iterations. */

#if TUNE_PROGRAM_BUILD
#define NPOWS \
 ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
#define MAYBE_dcpi1_divappr 1
#else
#define NPOWS \
 ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD))
#define MAYBE_dcpi1_divappr \
  (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD)
#if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \
    (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD)
#undef INV_MULMOD_BNM1_THRESHOLD
#define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */
#endif
#endif
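
/* Editorial note: NPOWS bounds the number of Newton steps, i.e. the depth of
   the sizes[] stack in mpn_ni_invertappr below.  Each step roughly halves the
   operand size, so about log_2 of the largest representable mp_size_t
   suffices (see the FIXME above on why this is only an approximate bound). */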

/* All three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch) take
   the strictly normalised value {dp,n} (i.e., the most significant bit must
   be set) as input, and compute {ip,n}: the approximate reciprocal of {dp,n}.

   Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the
   following conditions are satisfied by the output:
     0 <= e <= 1;
     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) .
   I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert.
        e=1 means that the result _may_ be one less than expected.

   The _bc version returns e=1 most of the time.
   The _ni version should return e=0 most of the time; only about 1% of
   possible random input should give e=1.

   When the strict result is needed, i.e., e=0 in the relation above:
     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1) ;
   the function mpn_invert (ip, dp, n, scratch) should be used instead.  */

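/* A minimal usage sketch (illustrative only, not part of this file's code):
   a caller allocates mpn_invertappr_itch (n) scratch limbs, for instance with
   the TMP_* allocator used below, and reads the result as the fraction
   1.{ip,n} ~ B^n/{dp,n}:

     mp_ptr scratch;
     mp_limb_t e;
     TMP_DECL;
     TMP_MARK;
     scratch = TMP_ALLOC_LIMBS (mpn_invertappr_itch (n));
     e = mpn_invertappr (ip, dp, n, scratch);
     TMP_FREE;

   On return 0 <= e <= 1, as described above.  */
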
/* Maximum scratch needed by this branch (at xp): 2*n */
static mp_limb_t
mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr xp)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, xp, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, xp, mpn_invertappr_itch(n)));
  /* Compute a base value of n limbs. */
  if (n == 1)
    invert_limb (*ip, *dp);
  else {
    /* n > 1 here */
    MPN_FILL (xp, n, GMP_NUMB_MAX);
    mpn_com (xp + n, dp, n);

    /* Now xp contains B^2n - {dp,n}*B^n - 1 */

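    /* Toy example, illustrative only, in base B = 10 with n = 2: for
       {dp,2} = 51 we get xp = B^4 - 51*B^2 - 1 = 4899, and the quotient
       below is floor (4899/51) = 96, so {ip,2} = 96; indeed
       51 * (100 + 96) = 9996 < 10000 <= 51 * (100 + 96 + 1).  */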
    /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */
    if (n == 2) {
      mpn_divrem_2 (ip, 0, xp, 4, dp);
    } else {
      gmp_pi1_t inv;
      invert_pi1 (inv, dp[n-1], dp[n-2]);
      if (! MAYBE_dcpi1_divappr
          || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD))
        mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32);
      else
        mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv);
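      /* The approximate quotient may exceed the exact one; the decrement
         below, together with the returned e=1, keeps the result within the
         contract stated above.  */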
      MPN_DECR_U(ip, n, CNST_LIMB (1));
      return 1;
    }
  }
  return 0;
}

/* mpn_ni_invertappr: computes the approximate reciprocal using Newton's
   iterations (at least one).

   Inspired by Algorithm "ApproximateReciprocal", published in "Modern Computer
   Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm 3.5, page 121
   in version 0.4 of the book.

   Some adaptations were introduced to allow the product mod B^m-1 and to
   return the value e.

   We introduced a correction in such a way that "the value of
   B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads
   "2B^n-1").

   Maximum scratch needed by this branch is <= 2*n, but we have to fit 3*rn
   limbs in the scratch, i.e. 3*rn <= 2*n: hence we require n > 4.

   We use a wrapped product modulo B^m-1.  NOTE: is there any normalisation
   problem for the [0] class?  There should not be: we compute
   2*|A*X_h - B^{n+h}| < B^m-1.  We may get [0] if and only if we get
   A*X_h = B^{n+h}.  This can happen only if A = B^n/2, but this implies
   X_h = B^h*2-1, i.e. A*X_h = B^{n+h} - A, and then we get into the
   "negative" branch, where X_h is not incremented (because A < B^n).

   FIXME: the scratch for mulmod_bnm1 does not currently fit in the scratch
   area; it is allocated separately.
*/
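
/* For orientation (illustrative only, not GMP code; x0 and steps are
   placeholders): the underlying Newton step for a reciprocal is
   x_{k+1} = x_k + x_k*(1 - d*x_k), which roughly doubles the number of
   correct digits per step.  In floating point the idea reads:

     double x = x0;                      // initial approximation of 1/d
     for (int i = 0; i < steps; i++)
       x = x + x * (1.0 - d * x);        // quadratic convergence to 1/d

   The loop below is the truncated fixed-point version of this, with the
   product d*x computed either plainly or mod B^m-1 (wraparound).  */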

mp_limb_t
mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  mp_limb_t cy;
  mp_size_t rn, mn;
  mp_size_t sizes[NPOWS], *sizp;
  mp_ptr tp;
  TMP_DECL;
#define xp scratch

  ASSERT (n > 4);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  /* Compute the computation precisions from highest to lowest, leaving the
     base case size in 'rn'. */
  sizp = sizes;
  rn = n;
  do {
    *sizp = rn;
    rn = (rn >> 1) + 1;
    ++sizp;
  } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD));
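  /* sizes[] now holds the decreasing chain of precisions n = s_0, s_1, ...
     with s_{k+1} = floor(s_k/2) + 1, stopping once the next value is not
     above INV_NEWTON_THRESHOLD; that value is the base case size left in
     rn, and the loop below walks the chain back from rn up to n.  */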

  /* We seek the inverse of 0.{dp,n}; we compute it as 1.{ip,n}. */
  dp += n;
  ip += n;

  /* Compute a base value of rn limbs. */
  mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch);

  TMP_MARK;

  if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD))
    {
      mn = mpn_mulmod_bnm1_next_size (n + 1);
      tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1));
    }
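  /* tp is sized once here, for the largest wrapped product (the full n), and
     is reused by every iteration of the loop below; cf. the FIXME above about
     this scratch being allocated separately from 'scratch'.  */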
  /* Use Newton's iterations to get the desired precision.  */

  while (1) {
    n = *--sizp;
    /*
       v    n  v
       +----+--+
       ^ rn ^
    */

    /* Compute i_jd. */
    if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)
        || ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) {
      /* FIXME: We only need {xp,n+1}.  */
      mpn_mul (xp, dp - n, n, ip - rn, rn);
      mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1);
      cy = CNST_LIMB(1); /* Remember we truncated, mod B^(n+1) */
      /* We computed (truncated) {xp,n+1} <- 1.{ip,rn} * 0.{dp,n} */
    } else { /* Use B^mn-1 wraparound */
      mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp);
      /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1) */
      /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1 */
      /* Add dp*B^rn mod (B^mn-1) */
      ASSERT (n >= mn - rn);
      cy = mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn);
      cy = mpn_add_nc (xp, xp, dp - (n - (mn - rn)), n - (mn - rn), cy);
      /* Subtract B^{rn+n}, maybe only compensate for the carry.  */
      xp[mn] = CNST_LIMB (1); /* set a limit for DECR_U */
      MPN_DECR_U (xp + rn + n - mn, 2 * mn + 1 - rn - n, CNST_LIMB (1) - cy);
      MPN_DECR_U (xp, mn, CNST_LIMB (1) - xp[mn]); /* if DECR_U eroded xp[mn] */
      cy = CNST_LIMB(0); /* Remember we are working mod B^mn-1 */
    }

    if (xp[n] < CNST_LIMB (2)) { /* "positive" residue class */
      cy = xp[n]; /* 0 <= cy <= 1 here. */
#if HAVE_NATIVE_mpn_sublsh1_n
      if (cy++) {
        if (mpn_cmp (xp, dp - n, n) > 0) {
          mp_limb_t chk;
          chk = mpn_sublsh1_n (xp, xp, dp - n, n);
          ASSERT (chk == xp[n]);
          ++ cy;
        } else
          ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
      }
#else /* no mpn_sublsh1_n */
      if (cy++ && !mpn_sub_n (xp, xp, dp - n, n)) {
        ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
        ++cy;
      }
#endif
      /* 1 <= cy <= 3 here. */
#if HAVE_NATIVE_mpn_rsblsh1_n
      if (mpn_cmp (xp, dp - n, n) > 0) {
        ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n));
        ++cy;
      } else
        ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#else /* no mpn_rsblsh1_n */
      if (mpn_cmp (xp, dp - n, n) > 0) {
        ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n));
        ++cy;
      }
      ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#endif
      MPN_DECR_U(ip - rn, rn, cy); /* 1 <= cy <= 4 here. */
    } else { /* "negative" residue class */
      ASSERT (xp[n] >= GMP_NUMB_MAX - CNST_LIMB(1));
      MPN_DECR_U(xp, n + 1, cy);
      if (xp[n] != GMP_NUMB_MAX) {
        MPN_INCR_U(ip - rn, rn, CNST_LIMB (1));
        ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n));
      }
      mpn_com (xp + 2 * n - rn, xp + n - rn, rn);
    }

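    /* At this point (in the notation of the comment above) {xp + 2*n - rn, rn}
       holds the most significant limbs of the error term B^{n+h} - T; below we
       multiply it by the current approximation 1.{ip-rn, rn} (the leading 1 is
       accounted for by the first mpn_add_n) and store the resulting new low
       limbs of the inverse at {ip - n, n - rn}.  */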
    /* Compute x_ju_j.  FIXME: We need {xp+rn,rn}, mulhi? */
    mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn);
    cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n);
    cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy);
    MPN_INCR_U (ip - rn, rn, cy);
    if (sizp == sizes) { /* Get out of the cycle */
      /* Check for possible carry propagation from below. */
      cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */
      /* cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */
      break;
    }
    rn = n;
  }
  TMP_FREE;

  return cy;
#undef xp
}

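/* mpn_invertappr: dispatch between the basecase and the Newton variant
   according to INV_NEWTON_THRESHOLD; scratch is mpn_invertappr_itch (n)
   limbs.  */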
mp_limb_t
mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD))
    return mpn_bc_invertappr (ip, dp, n, scratch);
  else
    return mpn_ni_invertappr (ip, dp, n, scratch);
}