blob: 3fa40122e42f7a5ec85d3970dfc48d8e5210a4c2 [file] [log] [blame]
Austin Schuhbb1338c2024-06-15 19:31:16 -07001/* hgcd2.c
2
3 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
4 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
5 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6
7Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation,
8Inc.
9
10This file is part of the GNU MP Library.
11
12The GNU MP Library is free software; you can redistribute it and/or modify
13it under the terms of either:
14
15 * the GNU Lesser General Public License as published by the Free
16 Software Foundation; either version 3 of the License, or (at your
17 option) any later version.
18
19or
20
21 * the GNU General Public License as published by the Free Software
22 Foundation; either version 2 of the License, or (at your option) any
23 later version.
24
25or both in parallel, as here.
26
27The GNU MP Library is distributed in the hope that it will be useful, but
28WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
29or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
30for more details.
31
32You should have received copies of the GNU General Public License and the
33GNU Lesser General Public License along with the GNU MP Library. If not,
34see https://www.gnu.org/licenses/. */
35
36#include "gmp-impl.h"
37#include "longlong.h"
38
39#ifndef HGCD2_DIV1_METHOD
40#define HGCD2_DIV1_METHOD 3
41#endif
42
43#ifndef HGCD2_DIV2_METHOD
44#define HGCD2_DIV2_METHOD 2
45#endif
46
47#if GMP_NAIL_BITS != 0
48#error Nails not implemented
49#endif
50
51#if HAVE_NATIVE_mpn_div_11
52
53#define div1 mpn_div_11
54/* Single-limb division optimized for small quotients.
55 Returned value holds d0 = r, d1 = q. */
56mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
57
58#elif HGCD2_DIV1_METHOD == 1
59
60static inline mp_double_limb_t
61div1 (mp_limb_t n0, mp_limb_t d0)
62{
63 mp_double_limb_t res;
64 res.d1 = n0 / d0;
65 res.d0 = n0 - res.d1 * d0;
66
67 return res;
68}
69
70#elif HGCD2_DIV1_METHOD == 2
71
72static mp_double_limb_t
73div1 (mp_limb_t n0, mp_limb_t d0)
74{
75 mp_double_limb_t res;
76 int ncnt, dcnt, cnt;
77 mp_limb_t q;
78 mp_limb_t mask;
79
80 ASSERT (n0 >= d0);
81
82 count_leading_zeros (ncnt, n0);
83 count_leading_zeros (dcnt, d0);
84 cnt = dcnt - ncnt;
85
86 d0 <<= cnt;
87
88 q = -(mp_limb_t) (n0 >= d0);
89 n0 -= d0 & q;
90 d0 >>= 1;
91 q = -q;
92
93 while (--cnt >= 0)
94 {
95 mask = -(mp_limb_t) (n0 >= d0);
96 n0 -= d0 & mask;
97 d0 >>= 1;
98 q = (q << 1) - mask;
99 }
100
101 res.d0 = n0;
102 res.d1 = q;
103 return res;
104}
105
106#elif HGCD2_DIV1_METHOD == 3
107
108static inline mp_double_limb_t
109div1 (mp_limb_t n0, mp_limb_t d0)
110{
111 mp_double_limb_t res;
112 if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
113 || UNLIKELY (n0 >= (d0 << 3)))
114 {
115 res.d1 = n0 / d0;
116 res.d0 = n0 - res.d1 * d0;
117 }
118 else
119 {
120 mp_limb_t q, mask;
121
122 d0 <<= 2;
123
124 mask = -(mp_limb_t) (n0 >= d0);
125 n0 -= d0 & mask;
126 q = 4 & mask;
127
128 d0 >>= 1;
129 mask = -(mp_limb_t) (n0 >= d0);
130 n0 -= d0 & mask;
131 q += 2 & mask;
132
133 d0 >>= 1;
134 mask = -(mp_limb_t) (n0 >= d0);
135 n0 -= d0 & mask;
136 q -= mask;
137
138 res.d0 = n0;
139 res.d1 = q;
140 }
141 return res;
142}
143
144#elif HGCD2_DIV1_METHOD == 4
145
146/* Table quotients. We extract the NBITS most significant bits of the
147 numerator limb, and the corresponding bits from the divisor limb, and use
148 these to form an index into the table. This method is probably only useful
149 for short pipelines with slow multiplication.
150
151 Possible improvements:
152
153 * Perhaps extract the highest NBITS of the divisor instead of the same bits
154 as from the numerator. That would require another count_leading_zeros,
155 and a post-multiply shift of the quotient.
156
157 * Compress tables? Their values are tiny, and there are lots of zero
158 entries (which are never used).
159
160 * Round the table entries more cleverly?
161*/
162
163#ifndef NBITS
164#define NBITS 5
165#endif
166
167#if NBITS == 5
168/* This needs full division about 13.2% of the time. */
169static const unsigned char tab[512] = {
17017, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
17118, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
17219,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
17320,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
17421,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
17522,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
17623,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
17724,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
17825,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
17926,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
18027,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
18128,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
18229,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
18330,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
18431,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
18532,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
186};
187#elif NBITS == 6
188/* This needs full division about 9.8% of the time. */
189static const unsigned char tab[2048] = {
19033,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
191 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
19234,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
193 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
19435,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
195 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
19636,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
197 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
19837,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
199 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
20038,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
201 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
20239,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
203 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
20440,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
205 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
20641,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
207 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
20842,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
209 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21043,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
211 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21244,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
213 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21445,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
215 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21646,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
217 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21847,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
219 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
22048,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
221 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
22249,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
223 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
22450,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
225 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
22651,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
227 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
22852,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
229 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
23053,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
231 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
23254,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
233 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
23455,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
235 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
23656,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
237 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
23857,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
239 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
24058,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
241 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
24259,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
243 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
24460,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
245 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
24661,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
247 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
24862,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
249 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
25063,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
251 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
25264,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
253 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
254};
255#else
256#error No table for provided NBITS
257#endif
258
259static const unsigned char *tabp = tab - (1 << (NBITS - 1) << NBITS);
260
261static inline mp_double_limb_t
262div1 (mp_limb_t n0, mp_limb_t d0)
263{
264 int ncnt;
265 size_t nbi, dbi;
266 mp_limb_t q0;
267 mp_limb_t r0;
268 mp_limb_t mask;
269 mp_double_limb_t res;
270
271 ASSERT (n0 >= d0); /* Actually only msb position is critical. */
272
273 count_leading_zeros (ncnt, n0);
274 nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
275 dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
276
277 q0 = tabp[(nbi << NBITS) + dbi];
278 r0 = n0 - q0 * d0;
279 mask = -(mp_limb_t) (r0 >= d0);
280 q0 -= mask;
281 r0 -= d0 & mask;
282
283 if (UNLIKELY (r0 >= d0))
284 {
285 q0 = n0 / d0;
286 r0 = n0 - q0 * d0;
287 }
288
289 res.d1 = q0;
290 res.d0 = r0;
291 return res;
292}
293
294#elif HGCD2_DIV1_METHOD == 5
295
296/* Table inverses of divisors. We don't bother with suppressing the msb from
297 the tables. We index with the NBITS most significant divisor bits,
298 including the always-set highest bit, but use addressing trickery via tabp
299 to suppress it.
300
301 Possible improvements:
302
303 * Do first multiply using 32-bit operations on 64-bit computers. At least
304 on most Arm64 cores, that uses 3 times less resources. It also saves on
305 many x86-64 processors.
306*/
307
308#ifndef NBITS
309#define NBITS 7
310#endif
311
312#if NBITS == 5
313/* This needs full division about 1.63% of the time. */
314static const unsigned char tab[16] = {
315 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
316};
317static const unsigned char *tabp = tab - (1 << (NBITS - 1));
318#elif NBITS == 6
319/* This needs full division about 0.93% of the time. */
320static const unsigned char tab[32] = {
321127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
322 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
323};
324static const unsigned char *tabp = tab - (1 << (NBITS - 1));
325#elif NBITS == 7
326/* This needs full division about 0.49% of the time. */
327static const unsigned char tab[64] = {
328255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
329203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
330169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
331145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
332};
333static const unsigned char *tabp = tab - (1 << (NBITS - 1));
334#elif NBITS == 8
335/* This needs full division about 0.26% of the time. */
336static const unsigned short tab[128] = {
337511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
338454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
339408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
340371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
341340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
342314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
343291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
344272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
345};
346static const unsigned short *tabp = tab - (1 << (NBITS - 1));
347#else
348#error No table for provided NBITS
349#endif
350
351static inline mp_double_limb_t
352div1 (mp_limb_t n0, mp_limb_t d0)
353{
354 int ncnt, dcnt;
355 size_t dbi;
356 mp_limb_t inv;
357 mp_limb_t q0;
358 mp_limb_t r0;
359 mp_limb_t mask;
360 mp_double_limb_t res;
361
362 count_leading_zeros (ncnt, n0);
363 count_leading_zeros (dcnt, d0);
364
365 dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
366 inv = tabp[dbi];
367 q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
368 r0 = n0 - q0 * d0;
369 mask = -(mp_limb_t) (r0 >= d0);
370 q0 -= mask;
371 r0 -= d0 & mask;
372
373 if (UNLIKELY (r0 >= d0))
374 {
375 q0 = n0 / d0;
376 r0 = n0 - q0 * d0;
377 }
378
379 res.d1 = q0;
380 res.d0 = r0;
381 return res;
382}
383
384#else
385#error Unknown HGCD2_DIV1_METHOD
386#endif
387
388#if HAVE_NATIVE_mpn_div_22
389
390#define div2 mpn_div_22
391/* Two-limb division optimized for small quotients. */
392mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
393
394#elif HGCD2_DIV2_METHOD == 1
395
396static mp_limb_t
397div2 (mp_ptr rp,
398 mp_limb_t n1, mp_limb_t n0,
399 mp_limb_t d1, mp_limb_t d0)
400{
401 mp_double_limb_t rq = div1 (n1, d1);
402 if (UNLIKELY (rq.d1 > d1))
403 {
404 mp_limb_t n2, q, t1, t0;
405 int c;
406
407 /* Normalize */
408 count_leading_zeros (c, d1);
409 ASSERT (c > 0);
410
411 n2 = n1 >> (GMP_LIMB_BITS - c);
412 n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
413 n0 <<= c;
414 d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
415 d0 <<= c;
416
417 udiv_qrnnd (q, n1, n2, n1, d1);
418 umul_ppmm (t1, t0, q, d0);
419 if (t1 > n1 || (t1 == n1 && t0 > n0))
420 {
421 ASSERT (q > 0);
422 q--;
423 sub_ddmmss (t1, t0, t1, t0, d1, d0);
424 }
425 sub_ddmmss (n1, n0, n1, n0, t1, t0);
426
427 /* Undo normalization */
428 rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
429 rp[1] = n1 >> c;
430
431 return q;
432 }
433 else
434 {
435 mp_limb_t q, t1, t0;
436 n1 = rq.d0;
437 q = rq.d1;
438 umul_ppmm (t1, t0, q, d0);
439 if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
440 {
441 ASSERT (q > 0);
442 q--;
443 sub_ddmmss (t1, t0, t1, t0, d1, d0);
444 }
445 sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
446 return q;
447 }
448}
449
450#elif HGCD2_DIV2_METHOD == 2
451
452/* Bit-wise div2. Relies on fast count_leading_zeros. */
453static mp_limb_t
454div2 (mp_ptr rp,
455 mp_limb_t n1, mp_limb_t n0,
456 mp_limb_t d1, mp_limb_t d0)
457{
458 mp_limb_t q = 0;
459 int ncnt;
460 int dcnt;
461
462 count_leading_zeros (ncnt, n1);
463 count_leading_zeros (dcnt, d1);
464 dcnt -= ncnt;
465
466 d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
467 d0 <<= dcnt;
468
469 do
470 {
471 mp_limb_t mask;
472 q <<= 1;
473 if (UNLIKELY (n1 == d1))
474 mask = -(n0 >= d0);
475 else
476 mask = -(n1 > d1);
477
478 q -= mask;
479
480 sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
481
482 d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
483 d1 = d1 >> 1;
484 }
485 while (dcnt--);
486
487 rp[0] = n0;
488 rp[1] = n1;
489
490 return q;
491}
492#else
493#error Unknown HGCD2_DIV2_METHOD
494#endif
495
496/* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs
497 matrix M. Returns 1 if we make progress, i.e. can perform at least
498 one subtraction. Otherwise returns zero. */
499
500/* FIXME: Possible optimizations:
501
502 The div2 function starts with checking the most significant bit of
503 the numerator. We can maintained normalized operands here, call
504 hgcd with normalized operands only, which should make the code
505 simpler and possibly faster.
506
507 Experiment with table lookups on the most significant bits.
508
509 This function is also a candidate for assembler implementation.
510*/
511int
512mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
513 struct hgcd_matrix1 *M)
514{
515 mp_limb_t u00, u01, u10, u11;
516
517 if (ah < 2 || bh < 2)
518 return 0;
519
520 if (ah > bh || (ah == bh && al > bl))
521 {
522 sub_ddmmss (ah, al, ah, al, bh, bl);
523 if (ah < 2)
524 return 0;
525
526 u00 = u01 = u11 = 1;
527 u10 = 0;
528 }
529 else
530 {
531 sub_ddmmss (bh, bl, bh, bl, ah, al);
532 if (bh < 2)
533 return 0;
534
535 u00 = u10 = u11 = 1;
536 u01 = 0;
537 }
538
539 if (ah < bh)
540 goto subtract_a;
541
542 for (;;)
543 {
544 ASSERT (ah >= bh);
545 if (ah == bh)
546 goto done;
547
548 if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
549 {
550 ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
551 bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
552
553 break;
554 }
555
556 /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
557 1), affecting the second column of M. */
558 ASSERT (ah > bh);
559 sub_ddmmss (ah, al, ah, al, bh, bl);
560
561 if (ah < 2)
562 goto done;
563
564 if (ah <= bh)
565 {
566 /* Use q = 1 */
567 u01 += u00;
568 u11 += u10;
569 }
570 else
571 {
572 mp_limb_t r[2];
573 mp_limb_t q = div2 (r, ah, al, bh, bl);
574 al = r[0]; ah = r[1];
575 if (ah < 2)
576 {
577 /* A is too small, but q is correct. */
578 u01 += q * u00;
579 u11 += q * u10;
580 goto done;
581 }
582 q++;
583 u01 += q * u00;
584 u11 += q * u10;
585 }
586 subtract_a:
587 ASSERT (bh >= ah);
588 if (ah == bh)
589 goto done;
590
591 if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
592 {
593 ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
594 bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
595
596 goto subtract_a1;
597 }
598
599 /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
600 1), affecting the first column of M. */
601 sub_ddmmss (bh, bl, bh, bl, ah, al);
602
603 if (bh < 2)
604 goto done;
605
606 if (bh <= ah)
607 {
608 /* Use q = 1 */
609 u00 += u01;
610 u10 += u11;
611 }
612 else
613 {
614 mp_limb_t r[2];
615 mp_limb_t q = div2 (r, bh, bl, ah, al);
616 bl = r[0]; bh = r[1];
617 if (bh < 2)
618 {
619 /* B is too small, but q is correct. */
620 u00 += q * u01;
621 u10 += q * u11;
622 goto done;
623 }
624 q++;
625 u00 += q * u01;
626 u10 += q * u11;
627 }
628 }
629
630 /* NOTE: Since we discard the least significant half limb, we don't get a
631 truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */
632 /* Single precision loop */
633 for (;;)
634 {
635 ASSERT (ah >= bh);
636
637 ah -= bh;
638 if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
639 break;
640
641 if (ah <= bh)
642 {
643 /* Use q = 1 */
644 u01 += u00;
645 u11 += u10;
646 }
647 else
648 {
649 mp_double_limb_t rq = div1 (ah, bh);
650 mp_limb_t q = rq.d1;
651 ah = rq.d0;
652
653 if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
654 {
655 /* A is too small, but q is correct. */
656 u01 += q * u00;
657 u11 += q * u10;
658 break;
659 }
660 q++;
661 u01 += q * u00;
662 u11 += q * u10;
663 }
664 subtract_a1:
665 ASSERT (bh >= ah);
666
667 bh -= ah;
668 if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
669 break;
670
671 if (bh <= ah)
672 {
673 /* Use q = 1 */
674 u00 += u01;
675 u10 += u11;
676 }
677 else
678 {
679 mp_double_limb_t rq = div1 (bh, ah);
680 mp_limb_t q = rq.d1;
681 bh = rq.d0;
682
683 if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
684 {
685 /* B is too small, but q is correct. */
686 u00 += q * u01;
687 u10 += q * u11;
688 break;
689 }
690 q++;
691 u00 += q * u01;
692 u10 += q * u11;
693 }
694 }
695
696 done:
697 M->u[0][0] = u00; M->u[0][1] = u01;
698 M->u[1][0] = u10; M->u[1][1] = u11;
699
700 return 1;
701}
702
703/* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must
704 * have space for n + 1 limbs. Uses three buffers to avoid a copy*/
705mp_size_t
706mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
707 mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
708{
709 mp_limb_t ah, bh;
710
711 /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
712
713 r = u00 * a
714 r += u10 * b
715 b *= u11
716 b += u01 * a
717 */
718
719#if HAVE_NATIVE_mpn_addaddmul_1msb0
720 ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
721 bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
722#else
723 ah = mpn_mul_1 (rp, ap, n, M->u[0][0]);
724 ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);
725
726 bh = mpn_mul_1 (bp, bp, n, M->u[1][1]);
727 bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
728#endif
729 rp[n] = ah;
730 bp[n] = bh;
731
732 n += (ah | bh) > 0;
733 return n;
734}