/* mpn_invertappr and helper functions.  Compute I such that
   floor((B^{2n}-1)/U) - 1 <= I + B^n <= floor((B^{2n}-1)/U).

   Contributed to the GNU project by Marco Bodrato.

   The algorithm used here was inspired by ApproximateReciprocal from "Modern
   Computer Arithmetic", by Richard P. Brent and Paul Zimmermann.  Special
   thanks to Paul Zimmermann for his very valuable suggestions on all the
   theoretical aspects during the work on this code.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright (C) 2007, 2009, 2010, 2012, 2015, 2016 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

/* FIXME: The iterative version splits the operand in two slightly unbalanced
   parts; the use of log_2 (or counting the bits) underestimates the maximum
   number of iterations. */

#if TUNE_PROGRAM_BUILD
#define NPOWS \
  ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
#define MAYBE_dcpi1_divappr 1
#else
#define NPOWS \
  ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD))
#define MAYBE_dcpi1_divappr \
  (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD)
#if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \
    (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD)
#undef INV_MULMOD_BNM1_THRESHOLD
#define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */
#endif
#endif
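
/* Descriptive note: NPOWS bounds the length of the sizes[] array used by
   mpn_ni_invertappr below.  Each Newton pass roughly halves the working
   precision, so between any representable size and the basecase threshold
   at most about bits-of-mp_size_t (capped at 48) minus
   LOG2C (INV_NEWTON_THRESHOLD) distinct precisions can occur (see also the
   FIXME above about the slightly unbalanced split). */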

/* All three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch) take
   the strictly normalised value {dp,n} (i.e., most significant bit must be
   set) as an input, and compute {ip,n}: the approximate reciprocal of
   {dp,n}.

   Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the
   following conditions are satisfied by the output:
     0 <= e <= 1;
     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e).
   I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert;
        e=1 means that the result _may_ be one less than expected.

   The _bc version returns e=1 most of the time.
   The _ni version should return e=0 most of the time; only about 1% of
   possible random input should give e=1.

   When the strict result is needed, i.e., e=0 in the relation above:
     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1);
   the function mpn_invert (ip, dp, n, scratch) should be used instead.  */
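
/* A minimal usage sketch (illustrative only, kept out of the build by the
   "#if 0"; the wrapper name below is invented for this example): reserve
   mpn_invertappr_itch(n) limbs of scratch and obtain the approximate
   reciprocal of a strictly normalised {dp,n} together with the error
   indicator e described above.  */
#if 0
static mp_limb_t
example_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n)
{
  mp_limb_t e;
  mp_ptr scratch;
  TMP_DECL;

  ASSERT (dp[n - 1] & GMP_NUMB_HIGHBIT);   /* input must be normalised */
  TMP_MARK;
  scratch = TMP_ALLOC_LIMBS (mpn_invertappr_itch (n));
  e = mpn_invertappr (ip, dp, n, scratch); /* 0 <= e <= 1, see above */
  TMP_FREE;
  return e;
}
#endif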

/* Maximum scratch needed by this branch (at xp): 2*n */
static mp_limb_t
mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr xp)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, xp, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, xp, mpn_invertappr_itch(n)));

  /* Compute a base value of n limbs. */
  if (n == 1)
    invert_limb (*ip, *dp);
  else {
    /* n > 1 here */
    MPN_FILL (xp, n, GMP_NUMB_MAX);
    mpn_com (xp + n, dp, n);

    /* Now xp contains B^2n - {dp,n}*B^n - 1 */
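    /* ({xp,n} = B^n-1 from the fill, {xp+n,n} = B^n-1-{dp,n} from the
       complement, so the 2n-limb value is (B^n-1-{dp,n})*B^n + B^n-1.) */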

    /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */
    if (n == 2) {
      mpn_divrem_2 (ip, 0, xp, 4, dp);
    } else {
      gmp_pi1_t inv;
      invert_pi1 (inv, dp[n-1], dp[n-2]);
      if (! MAYBE_dcpi1_divappr
          || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD))
        mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32);
      else
        mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv);
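      /* The approximate quotient may be slightly larger than the true one;
         decrementing it and returning e=1 reflects the contract above: the
         result may be one less than the exact mpn_invert value. */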
      MPN_DECR_U (ip, n, CNST_LIMB (1));
      return 1;
    }
  }
  return 0;
}

/* mpn_ni_invertappr: computes the approximate reciprocal using Newton's
   iterations (at least one).

   Inspired by Algorithm "ApproximateReciprocal", published in "Modern
   Computer Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm
   3.5, page 121 in version 0.4 of the book.

   Some adaptations were introduced, to allow a product mod B^m-1 and to
   return the value e.

   We introduced a correction in such a way that "the value of
   B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads
   "2B^n-1").

   Maximum scratch needed by this branch <= 2*n, but we have to fit 3*rn
   limbs in the scratch, i.e. 3*rn <= 2*n: we require n > 4.

   We use a wrapped product modulo B^m-1.  NOTE: is there any normalisation
   problem for the [0] class?  It shouldn't: we compute 2*|A*X_h - B^{n+h}| <
   B^m-1.  We may get [0] if and only if we get A*X_h = B^{n+h}.  This can
   happen only if A = B^n/2, but this implies X_h = B^h*2-1, i.e., A*X_h =
   B^{n+h} - A, and then we get into the "negative" branch, where X_h is not
   incremented (because A < B^n).

   FIXME: the scratch for mulmod_bnm1 does not currently fit in the scratch
   area; it is allocated separately.  */
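
/* For orientation: writing D = 0.{dp,n} and X = 1.{ip,rn} for the current
   approximation, each pass below is a truncated/wrapped variant of the
   classical Newton step X' = X + X*(1 - D*X), roughly doubling the number
   of correct limbs.  The first block computes D*X ("i_jd"), the residue
   class code reduces it to its signed difference from B^{rn+n}, and the
   final block multiplies that difference by X ("x_ju_j") and folds the
   correction into {ip,n}. */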

mp_limb_t
mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  mp_limb_t cy;
  mp_size_t rn, mn;
  mp_size_t sizes[NPOWS], *sizp;
  mp_ptr tp;
  TMP_DECL;
#define xp scratch

  ASSERT (n > 4);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  /* Compute the computation precisions from highest to lowest, leaving the
     base case size in 'rn'. */
  sizp = sizes;
  rn = n;
  do {
    *sizp = rn;
    rn = (rn >> 1) + 1;
    ++sizp;
  } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD));
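  /* sizes[0] = n is the final precision; each further entry is about half
     the previous one, and the loop stops once the halved size drops below
     INV_NEWTON_THRESHOLD, leaving that basecase size in rn.  The while loop
     below walks sizes[] back from its last (smallest) entry up to n. */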

  /* We seek the inverse of 0.{dp,n}; we compute it as 1.{ip,n}. */
  dp += n;
  ip += n;

  /* Compute a base value of rn limbs. */
  mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch);

  TMP_MARK;

  if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD))
    {
      mn = mpn_mulmod_bnm1_next_size (n + 1);
      tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1));
    }
  /* Use Newton's iterations to get the desired precision. */

  while (1) {
    n = *--sizp;
    /*
       v    n  v
       +----+--+
       ^ rn ^
    */
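    /* (Picture above: the current pass extends the rn-limb approximation
       occupying the top rn limbs of the n-limb window {ip-n,n} to a full
       n-limb approximation.) */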

    /* Compute i_jd. */
    if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)
        || ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) {
      /* FIXME: We only need {xp,n+1}. */
      mpn_mul (xp, dp - n, n, ip - rn, rn);
      mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1);
      cy = CNST_LIMB(1); /* Remember we truncated, Mod B^(n+1) */
      /* We computed (truncated) {xp,n+1} <- 1.{ip,rn} * 0.{dp,n} */
    } else { /* Use B^mn-1 wraparound */
      mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp);
      /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1) */
      /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1 */
      /* Add dp*B^rn mod (B^mn-1) */
      ASSERT (n >= mn - rn);
      cy = mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn);
      cy = mpn_add_nc (xp, xp, dp - (n - (mn - rn)), n - (mn - rn), cy);
      /* Subtract B^{rn+n}, maybe only compensate the carry */
      xp[mn] = CNST_LIMB (1); /* set a limit for DECR_U */
      MPN_DECR_U (xp + rn + n - mn, 2 * mn + 1 - rn - n, CNST_LIMB (1) - cy);
      MPN_DECR_U (xp, mn, CNST_LIMB (1) - xp[mn]); /* if DECR_U eroded xp[mn] */
      cy = CNST_LIMB(0); /* Remember we are working Mod B^mn-1 */
    }

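    /* {xp,n+1} now represents ip*dp + dp*B^rn - B^{rn+n}, truncated mod
       B^(n+1) (or reduced mod B^mn-1): a top limb xp[n] of 0 or 1 means the
       product reached or exceeded B^{rn+n} ("positive" residue class); a top
       limb near GMP_NUMB_MAX means it fell short ("negative" class). */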
    if (xp[n] < CNST_LIMB (2)) { /* "positive" residue class */
      cy = xp[n]; /* 0 <= cy <= 1 here. */
#if HAVE_NATIVE_mpn_sublsh1_n
      if (cy++) {
        if (mpn_cmp (xp, dp - n, n) > 0) {
          mp_limb_t chk;
          chk = mpn_sublsh1_n (xp, xp, dp - n, n);
          ASSERT (chk == xp[n]);
          ++cy;
        } else
          ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
      }
#else /* no mpn_sublsh1_n */
      if (cy++ && ! mpn_sub_n (xp, xp, dp - n, n)) {
        ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
        ++cy;
      }
#endif
      /* 1 <= cy <= 3 here. */
#if HAVE_NATIVE_mpn_rsblsh1_n
      if (mpn_cmp (xp, dp - n, n) > 0) {
        ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n));
        ++cy;
      } else
        ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#else /* no mpn_rsblsh1_n */
      if (mpn_cmp (xp, dp - n, n) > 0) {
        ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n));
        ++cy;
      }
      ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#endif
      MPN_DECR_U (ip - rn, rn, cy); /* 1 <= cy <= 4 here. */
    } else { /* "negative" residue class */
      ASSERT (xp[n] >= GMP_NUMB_MAX - CNST_LIMB(1));
      MPN_DECR_U (xp, n + 1, cy);
      if (xp[n] != GMP_NUMB_MAX) {
        MPN_INCR_U (ip - rn, rn, CNST_LIMB (1));
        ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n));
      }
      mpn_com (xp + 2 * n - rn, xp + n - rn, rn);
    }

    /* Compute x_ju_j.  FIXME: We need {xp+rn,rn}, mulhi? */
    mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn);
    cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n);
    cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy);
    MPN_INCR_U (ip - rn, rn, cy);
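    /* {ip-n,n} now holds the refined n-limb approximation; unless this was
       the last pass, it becomes the rn-limb input of the next one. */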
    if (sizp == sizes) { /* Get out of the cycle */
      /* Check for possible carry propagation from below. */
      cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */
      /* cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */
      break;
    }
    rn = n;
  }
  TMP_FREE;

  return cy;
#undef xp
}

mp_limb_t
mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD))
    return mpn_bc_invertappr (ip, dp, n, scratch);
  else
    return mpn_ni_invertappr (ip, dp, n, scratch);
}