/* hgcd2.c

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library. If not,
see https://www.gnu.org/licenses/. */

#include "gmp-impl.h"
#include "longlong.h"

#ifndef HGCD2_DIV1_METHOD
#define HGCD2_DIV1_METHOD 3
#endif

#ifndef HGCD2_DIV2_METHOD
#define HGCD2_DIV2_METHOD 2
#endif

#if GMP_NAIL_BITS != 0
#error Nails not implemented
#endif

#if HAVE_NATIVE_mpn_div_11

#define div1 mpn_div_11
/* Single-limb division optimized for small quotients.
   Returned value holds d0 = r, d1 = q. */
mp_double_limb_t div1 (mp_limb_t, mp_limb_t);

#elif HGCD2_DIV1_METHOD == 1

static inline mp_double_limb_t
div1 (mp_limb_t n0, mp_limb_t d0)
{
  mp_double_limb_t res;
  res.d1 = n0 / d0;
  res.d0 = n0 - res.d1 * d0;

  return res;
}

#elif HGCD2_DIV1_METHOD == 2

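/* Shift-and-subtract division: the divisor is aligned with the numerator,
   then one quotient bit is produced per iteration with a branchless
   conditional subtraction. */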
static mp_double_limb_t
div1 (mp_limb_t n0, mp_limb_t d0)
{
  mp_double_limb_t res;
  int ncnt, dcnt, cnt;
  mp_limb_t q;
  mp_limb_t mask;

  ASSERT (n0 >= d0);

  count_leading_zeros (ncnt, n0);
  count_leading_zeros (dcnt, d0);
  cnt = dcnt - ncnt;

  d0 <<= cnt;

  q = -(mp_limb_t) (n0 >= d0);
  n0 -= d0 & q;
  d0 >>= 1;
  q = -q;

  while (--cnt >= 0)
    {
      mask = -(mp_limb_t) (n0 >= d0);
      n0 -= d0 & mask;
      d0 >>= 1;
      q = (q << 1) - mask;
    }

  res.d0 = n0;
  res.d1 = q;
  return res;
}

#elif HGCD2_DIV1_METHOD == 3

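/* When the quotient is known to fit in 3 bits, compute it with three
   branchless conditional subtractions; otherwise fall back to a full
   hardware division. */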
static inline mp_double_limb_t
div1 (mp_limb_t n0, mp_limb_t d0)
{
  mp_double_limb_t res;
  if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
      || UNLIKELY (n0 >= (d0 << 3)))
    {
      res.d1 = n0 / d0;
      res.d0 = n0 - res.d1 * d0;
    }
  else
    {
      mp_limb_t q, mask;

      d0 <<= 2;

      mask = -(mp_limb_t) (n0 >= d0);
      n0 -= d0 & mask;
      q = 4 & mask;

      d0 >>= 1;
      mask = -(mp_limb_t) (n0 >= d0);
      n0 -= d0 & mask;
      q += 2 & mask;

      d0 >>= 1;
      mask = -(mp_limb_t) (n0 >= d0);
      n0 -= d0 & mask;
      q -= mask;

      res.d0 = n0;
      res.d1 = q;
    }
  return res;
}

#elif HGCD2_DIV1_METHOD == 4

/* Table quotients. We extract the NBITS most significant bits of the
   numerator limb, and the corresponding bits from the divisor limb, and use
   these to form an index into the table. This method is probably only useful
   for short pipelines with slow multiplication.

   Possible improvements:

   * Perhaps extract the highest NBITS of the divisor instead of the same bits
     as from the numerator. That would require another count_leading_zeros,
     and a post-multiply shift of the quotient.

   * Compress tables? Their values are tiny, and there are lots of zero
     entries (which are never used).

   * Round the table entries more cleverly?
*/

#ifndef NBITS
#define NBITS 5
#endif

#if NBITS == 5
/* This needs full division about 13.2% of the time. */
static const unsigned char tab[512] = {
  17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
  20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
  21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
  22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
  23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
  24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
  25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
  26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
  27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
  28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
  29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
  30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
  31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
  32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
};
#elif NBITS == 6
/* This needs full division about 9.8% of the time. */
static const unsigned char tab[2048] = {
  33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
  52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
  53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
  54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
  55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
  56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
  57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
  58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
  59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
  60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
  61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
  62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
  63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
  64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
};
#else
#error No table for provided NBITS
#endif

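/* The table is indexed by (nbi << NBITS) + dbi. After normalization nbi
   always has its top bit set, i.e. nbi >= 2^(NBITS-1), so tabp pre-subtracts
   that fixed offset of 2^(NBITS-1) * 2^NBITS instead of masking the bit off.
   For NBITS == 5 this means nbi - 16 selects one of the 16 rows of 32
   entries above. */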
static const unsigned char *tabp = tab - (1 << (NBITS - 1) << NBITS);

static inline mp_double_limb_t
div1 (mp_limb_t n0, mp_limb_t d0)
{
  int ncnt;
  size_t nbi, dbi;
  mp_limb_t q0;
  mp_limb_t r0;
  mp_limb_t mask;
  mp_double_limb_t res;

  ASSERT (n0 >= d0);  /* Actually only msb position is critical. */

  count_leading_zeros (ncnt, n0);
  nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
  dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);

  q0 = tabp[(nbi << NBITS) + dbi];
  r0 = n0 - q0 * d0;
  mask = -(mp_limb_t) (r0 >= d0);
  q0 -= mask;
  r0 -= d0 & mask;

  if (UNLIKELY (r0 >= d0))
    {
      q0 = n0 / d0;
      r0 = n0 - q0 * d0;
    }

  res.d1 = q0;
  res.d0 = r0;
  return res;
}

#elif HGCD2_DIV1_METHOD == 5

/* Table inverses of divisors. We don't bother with suppressing the msb from
   the tables. We index with the NBITS most significant divisor bits,
   including the always-set highest bit, but use addressing trickery via tabp
   to suppress it.

   Possible improvements:

   * Do the first multiply using 32-bit operations on 64-bit computers. At
     least on most Arm64 cores, that uses about a third of the resources. It
     is also a win on many x86-64 processors.
*/

#ifndef NBITS
#define NBITS 7
#endif

#if NBITS == 5
/* This needs full division about 1.63% of the time. */
static const unsigned char tab[16] = {
  63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
};
static const unsigned char *tabp = tab - (1 << (NBITS - 1));
#elif NBITS == 6
/* This needs full division about 0.93% of the time. */
static const unsigned char tab[32] = {
  127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
   84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
};
static const unsigned char *tabp = tab - (1 << (NBITS - 1));
#elif NBITS == 7
/* This needs full division about 0.49% of the time. */
static const unsigned char tab[64] = {
  255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
  203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
  169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
  145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
};
static const unsigned char *tabp = tab - (1 << (NBITS - 1));
#elif NBITS == 8
/* This needs full division about 0.26% of the time. */
static const unsigned short tab[128] = {
  511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
  454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
  408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
  371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
  340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
  314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
  291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
  272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
};
static const unsigned short *tabp = tab - (1 << (NBITS - 1));
#else
#error No table for provided NBITS
#endif

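/* Approximate the quotient by multiplying with a tabulated inverse of the
   divisor's high bits and correct it by at most one with a conditional
   subtraction; the rare cases where that is still not enough fall back to a
   plain division. */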
static inline mp_double_limb_t
div1 (mp_limb_t n0, mp_limb_t d0)
{
  int ncnt, dcnt;
  size_t dbi;
  mp_limb_t inv;
  mp_limb_t q0;
  mp_limb_t r0;
  mp_limb_t mask;
  mp_double_limb_t res;

  count_leading_zeros (ncnt, n0);
  count_leading_zeros (dcnt, d0);

  dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
  inv = tabp[dbi];
  q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
  r0 = n0 - q0 * d0;
  mask = -(mp_limb_t) (r0 >= d0);
  q0 -= mask;
  r0 -= d0 & mask;

  if (UNLIKELY (r0 >= d0))
    {
      q0 = n0 / d0;
      r0 = n0 - q0 * d0;
    }

  res.d1 = q0;
  res.d0 = r0;
  return res;
}

#else
#error Unknown HGCD2_DIV1_METHOD
#endif

#if HAVE_NATIVE_mpn_div_22

#define div2 mpn_div_22
/* Two-limb division optimized for small quotients. */
mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);

#elif HGCD2_DIV2_METHOD == 1

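/* Estimate the quotient from the high limbs with div1. When that estimate is
   at most d1 it exceeds the true quotient by at most one, so a single
   conditional adjustment suffices; otherwise normalize and use udiv_qrnnd
   for a schoolbook division step. */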
static mp_limb_t
div2 (mp_ptr rp,
      mp_limb_t n1, mp_limb_t n0,
      mp_limb_t d1, mp_limb_t d0)
{
  mp_double_limb_t rq = div1 (n1, d1);
  if (UNLIKELY (rq.d1 > d1))
    {
      mp_limb_t n2, q, t1, t0;
      int c;

      /* Normalize */
      count_leading_zeros (c, d1);
      ASSERT (c > 0);

      n2 = n1 >> (GMP_LIMB_BITS - c);
      n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
      n0 <<= c;
      d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
      d0 <<= c;

      udiv_qrnnd (q, n1, n2, n1, d1);
      umul_ppmm (t1, t0, q, d0);
      if (t1 > n1 || (t1 == n1 && t0 > n0))
        {
          ASSERT (q > 0);
          q--;
          sub_ddmmss (t1, t0, t1, t0, d1, d0);
        }
      sub_ddmmss (n1, n0, n1, n0, t1, t0);

      /* Undo normalization */
      rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
      rp[1] = n1 >> c;

      return q;
    }
  else
    {
      mp_limb_t q, t1, t0;
      n1 = rq.d0;
      q = rq.d1;
      umul_ppmm (t1, t0, q, d0);
      if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
        {
          ASSERT (q > 0);
          q--;
          sub_ddmmss (t1, t0, t1, t0, d1, d0);
        }
      sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
      return q;
    }
}

#elif HGCD2_DIV2_METHOD == 2

/* Bit-wise div2. Relies on fast count_leading_zeros. */
static mp_limb_t
div2 (mp_ptr rp,
      mp_limb_t n1, mp_limb_t n0,
      mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t q = 0;
  int ncnt;
  int dcnt;

  count_leading_zeros (ncnt, n1);
  count_leading_zeros (dcnt, d1);
  dcnt -= ncnt;

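  /* Shift the divisor left so that its high limb lines up with n1; the
     double shift by 1 and (GMP_LIMB_BITS - 1 - dcnt) avoids an undefined
     full-width shift when dcnt is 0. The loop below then produces one
     quotient bit per iteration, running dcnt + 1 times. */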
  d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
  d0 <<= dcnt;

  do
    {
      mp_limb_t mask;
      q <<= 1;
      if (UNLIKELY (n1 == d1))
        mask = -(n0 >= d0);
      else
        mask = -(n1 > d1);

      q -= mask;

      sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);

      d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
      d1 = d1 >> 1;
    }
  while (dcnt--);

  rp[0] = n0;
  rp[1] = n1;

  return q;
}
#else
#error Unknown HGCD2_DIV2_METHOD
#endif

/* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs
   matrix M. Returns 1 if we make progress, i.e. can perform at least
   one subtraction. Otherwise returns zero. */

/* FIXME: Possible optimizations:

   The div2 function starts with checking the most significant bit of
   the numerator. We could maintain normalized operands here and call
   hgcd2 with normalized operands only, which should make the code
   simpler and possibly faster.

   Experiment with table lookups on the most significant bits.

   This function is also a candidate for assembler implementation.
*/
int
mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
           struct hgcd_matrix1 *M)
{
  mp_limb_t u00, u01, u10, u11;

  if (ah < 2 || bh < 2)
    return 0;

  if (ah > bh || (ah == bh && al > bl))
    {
      sub_ddmmss (ah, al, ah, al, bh, bl);
      if (ah < 2)
        return 0;

      u00 = u01 = u11 = 1;
      u10 = 0;
    }
  else
    {
      sub_ddmmss (bh, bl, bh, bl, ah, al);
      if (bh < 2)
        return 0;

      u00 = u10 = u11 = 1;
      u01 = 0;
    }

  if (ah < bh)
    goto subtract_a;

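  /* Double-limb loop: reduce a and b alternately until the high limbs fit in
     half a limb, then pack the remaining significant bits and continue in
     the single-limb loop below. After the unconditional subtraction, div2's
     quotient is one short of the full quotient, hence the q++ before the
     matrix update. */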
  for (;;)
    {
      ASSERT (ah >= bh);
      if (ah == bh)
        goto done;

      if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
        {
          ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
          bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));

          break;
        }

      /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
         1), affecting the second column of M. */
      ASSERT (ah > bh);
      sub_ddmmss (ah, al, ah, al, bh, bl);

      if (ah < 2)
        goto done;

      if (ah <= bh)
        {
          /* Use q = 1 */
          u01 += u00;
          u11 += u10;
        }
      else
        {
          mp_limb_t r[2];
          mp_limb_t q = div2 (r, ah, al, bh, bl);
          al = r[0]; ah = r[1];
          if (ah < 2)
            {
              /* A is too small, but q is correct. */
              u01 += q * u00;
              u11 += q * u10;
              goto done;
            }
          q++;
          u01 += q * u00;
          u11 += q * u10;
        }
    subtract_a:
      ASSERT (bh >= ah);
      if (ah == bh)
        goto done;

      if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
        {
          ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
          bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));

          goto subtract_a1;
        }

      /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
         1), affecting the first column of M. */
      sub_ddmmss (bh, bl, bh, bl, ah, al);

      if (bh < 2)
        goto done;

      if (bh <= ah)
        {
          /* Use q = 1 */
          u00 += u01;
          u10 += u11;
        }
      else
        {
          mp_limb_t r[2];
          mp_limb_t q = div2 (r, bh, bl, ah, al);
          bl = r[0]; bh = r[1];
          if (bh < 2)
            {
              /* B is too small, but q is correct. */
              u00 += q * u01;
              u10 += q * u11;
              goto done;
            }
          q++;
          u00 += q * u01;
          u10 += q * u11;
        }
    }

  /* NOTE: Since we discard the least significant half limb, we don't get a
     truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */
  /* Single precision loop */
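  /* Here ah and bh hold the packed middle bits of a and b (the low half of
     the high limb and the high half of the low limb); keep reducing on
     single limbs until one value drops below 2^(GMP_LIMB_BITS/2 + 1). */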
  for (;;)
    {
      ASSERT (ah >= bh);

      ah -= bh;
      if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
        break;

      if (ah <= bh)
        {
          /* Use q = 1 */
          u01 += u00;
          u11 += u10;
        }
      else
        {
          mp_double_limb_t rq = div1 (ah, bh);
          mp_limb_t q = rq.d1;
          ah = rq.d0;

          if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
            {
              /* A is too small, but q is correct. */
              u01 += q * u00;
              u11 += q * u10;
              break;
            }
          q++;
          u01 += q * u00;
          u11 += q * u10;
        }
    subtract_a1:
      ASSERT (bh >= ah);

      bh -= ah;
      if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
        break;

      if (bh <= ah)
        {
          /* Use q = 1 */
          u00 += u01;
          u10 += u11;
        }
      else
        {
          mp_double_limb_t rq = div1 (bh, ah);
          mp_limb_t q = rq.d1;
          bh = rq.d0;

          if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
            {
              /* B is too small, but q is correct. */
              u00 += q * u01;
              u10 += q * u11;
              break;
            }
          q++;
          u00 += q * u01;
          u10 += q * u11;
        }
    }

 done:
  M->u[0][0] = u00; M->u[0][1] = u01;
  M->u[1][0] = u10; M->u[1][1] = u11;

  return 1;
}

/* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vectors must
 * have space for n + 1 limbs. Uses three buffers to avoid a copy. */
mp_size_t
mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
                             mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     r = u00 * a
     r += u10 * b
     b *= u11
     b += u01 * a
  */
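  /* The order matters: r is built in a buffer distinct from a and b, so the
     first two updates can read the original b, which is then safe to
     overwrite in place; a is read-only throughout. */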

#if HAVE_NATIVE_mpn_addaddmul_1msb0
  ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
  bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
#else
  ah = mpn_mul_1 (rp, ap, n, M->u[0][0]);
  ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);

  bh = mpn_mul_1 (bp, bp, n, M->u[1][1]);
  bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
#endif
  rp[n] = ah;
  bp[n] = bh;

  n += (ah | bh) > 0;
  return n;
}