blob: 9ba0ae1ccf9219af763dd3b30785a47c8ddf5b94 [file] [log] [blame]
Austin Schuhdace2a62020-08-18 10:56:48 -07001/* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.
2
3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5 FUTURE GNU MP RELEASES.
6
7Copyright 2000-2004 Free Software Foundation, Inc.
8
9This file is part of the GNU MP Library.
10
11The GNU MP Library is free software; you can redistribute it and/or modify
12it under the terms of either:
13
14 * the GNU Lesser General Public License as published by the Free
15 Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
17
18or
19
20 * the GNU General Public License as published by the Free Software
21 Foundation; either version 2 of the License, or (at your option) any
22 later version.
23
24or both in parallel, as here.
25
26The GNU MP Library is distributed in the hope that it will be useful, but
27WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29for more details.
30
31You should have received copies of the GNU General Public License and the
32GNU Lesser General Public License along with the GNU MP Library. If not,
33see https://www.gnu.org/licenses/. */
34
35#include "gmp-impl.h"
36#include "longlong.h"
37
38
39/* Calculate an r satisfying
40
41 r*B^k + a - c == q*d
42
43 where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
44 (the caller won't know which), and q is the quotient (discarded). d must
45 be odd, c can be any limb value.
46
47 If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.
48
49 This slightly strange function suits the initial Nx1 reduction for GCDs
50 or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
51 -r == a mod d (by passing c=0). For a GCD the factor of -1 on r can be
52 ignored, or for the Jacobi symbol it can be accounted for. The function
53 also suits divisibility and congruence testing since if r=0 (or r=d) is
54 obtained then a==c mod d.
55
56
57 r is a bit like the remainder returned by mpn_divexact_by3c, and is the
58 sort of remainder mpn_divexact_1 might return. Like mpn_divexact_by3c, r
59 represents a borrow, since effectively quotient limbs are chosen so that
60 subtracting that multiple of d from src at each step will produce a zero
61 limb.
62
63 A long calculation can be done piece by piece from low to high by passing
64 the return value from one part as the carry parameter to the next part.
65 The effective final k becomes anything between size and size-n, if n
66 pieces are used.
67
68
69 A similar sort of routine could be constructed based on adding multiples
70 of d at each limb, much like redc in mpz_powm does. Subtracting however
71 has a small advantage that when subtracting to cancel out l there's never
72 a borrow into h, whereas using an addition would put a carry into h
73 depending whether l==0 or l!=0.
74
75
76 In terms of efficiency, this function is similar to a mul-by-inverse
77 mpn_mod_1. Both are essentially two multiplies and are best suited to
78 CPUs with low latency multipliers (in comparison to a divide instruction
79 at least.) But modexact has a few less supplementary operations, only
80 needs low part and high part multiplies, and has fewer working quantities
81 (helping CPUs with few registers).
82
83
84 In the main loop it will be noted that the new carry (call it r) is the
85 sum of the high product h and any borrow from l=s-c. If c<d then we will
86 have r<d too, for the following reasons. Let q=l*inverse be the quotient
87 limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS. Now if h=d-1 then
88
89 l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d
90
91 But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
92 never have h=d-1 and so r=h+borrow <= d-1.
93
94 When c>=d, on the other hand, h=d-1 can certainly occur together with a
95 borrow, thereby giving only r<=d, as per the function definition above.
96
97 As a design decision it's left to the caller to check for r=d if it might
98 be passing c>=d. Several applications have c<d initially so the extra
99 test is often unnecessary, for example the GCDs or a plain divisibility
100 d|a test will pass c=0.
101
102
103 The special case for size==1 is so that it can be assumed c<=d in the
104 high<=divisor test at the end. c<=d is only guaranteed after at least
105 one iteration of the main loop. There's also a decent chance one % is
106 faster than a binvert_limb, though that will depend on the processor.
107
108 A CPU specific implementation might want to omit the size==1 code or the
109 high<divisor test. mpn/x86/k6/mode1o.asm for instance finds neither
110 useful. */
111
112
113mp_limb_t
114mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
115 mp_limb_t orig_c)
116{
117 mp_limb_t s, h, l, inverse, dummy, dmul, ret;
118 mp_limb_t c = orig_c;
119 mp_size_t i;
120
121 ASSERT (size >= 1);
122 ASSERT (d & 1);
123 ASSERT_MPN (src, size);
124 ASSERT_LIMB (d);
125 ASSERT_LIMB (c);
126
127 if (size == 1)
128 {
129 s = src[0];
130 if (s > c)
131 {
132 l = s-c;
133 h = l % d;
134 if (h != 0)
135 h = d - h;
136 }
137 else
138 {
139 l = c-s;
140 h = l % d;
141 }
142 return h;
143 }
144
145
146 binvert_limb (inverse, d);
147 dmul = d << GMP_NAIL_BITS;
148
149 i = 0;
150 do
151 {
152 s = src[i];
153 SUBC_LIMB (c, l, s, c);
154 l = (l * inverse) & GMP_NUMB_MASK;
155 umul_ppmm (h, dummy, l, dmul);
156 c += h;
157 }
158 while (++i < size-1);
159
160
161 s = src[i];
162 if (s <= d)
163 {
164 /* With high<=d the final step can be a subtract and addback. If c==0
165 then the addback will restore to l>=0. If c==d then will get l==d
166 if s==0, but that's ok per the function definition. */
167
168 l = c - s;
169 if (c < s)
170 l += d;
171
172 ret = l;
173 }
174 else
175 {
176 /* Can't skip a divide, just do the loop code once more. */
177
178 SUBC_LIMB (c, l, s, c);
179 l = (l * inverse) & GMP_NUMB_MASK;
180 umul_ppmm (h, dummy, l, dmul);
181 c += h;
182 ret = c;
183 }
184
185 ASSERT (orig_c < d ? ret < d : ret <= d);
186 return ret;
187}
188
189
190
191#if 0
192
193/* The following is an alternate form that might shave one cycle on a
194 superscalar processor since it takes c+=h off the dependent chain,
195 leaving just a low product, high product, and a subtract.
196
197 This is for CPU specific implementations to consider. A special case for
198 high<divisor and/or size==1 can be added if desired.
199
200 Notice that c is only ever 0 or 1, since if s-c produces a borrow then
201 x=0xFF..FF and x-h cannot produce a borrow. The c=(x>s) could become
202 c=(x==0xFF..FF) too, if that helped. */
203
204mp_limb_t
205mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
206{
207 mp_limb_t s, x, y, inverse, dummy, dmul, c1, c2;
208 mp_limb_t c = 0;
209 mp_size_t i;
210
211 ASSERT (size >= 1);
212 ASSERT (d & 1);
213
214 binvert_limb (inverse, d);
215 dmul = d << GMP_NAIL_BITS;
216
217 for (i = 0; i < size; i++)
218 {
219 ASSERT (c==0 || c==1);
220
221 s = src[i];
222 SUBC_LIMB (c1, x, s, c);
223
224 SUBC_LIMB (c2, y, x, h);
225 c = c1 + c2;
226
227 y = (y * inverse) & GMP_NUMB_MASK;
228 umul_ppmm (h, dummy, y, dmul);
229 }
230
231 h += c;
232 return h;
233}
234
235#endif