/* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2000-2004 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"


/* Calculate an r satisfying

       r*B^k + a - c == q*d

   where B=2^GMP_NUMB_BITS, a is {src,size}, k is either size or size-1
   (the caller won't know which), and q is the quotient (discarded).  d
   must be odd, c can be any limb value.

   If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.

   This slightly strange function suits the initial Nx1 reduction for GCDs
   or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
   -r == a mod d (by passing c=0).  For a GCD the factor of -1 on r can be
   ignored, or for the Jacobi symbol it can be accounted for.  The function
   also suits divisibility and congruence testing since if r=0 (or r=d) is
   obtained then a==c mod d.
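
   For instance, a plain divisibility test d|a for odd d might go something
   like

       r = mpn_modexact_1c_odd (src, size, d, CNST_LIMB(0));

   and conclude d divides a exactly when r == 0, since c=0 gives c<d and
   hence r<d, so the r==d case cannot arise.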


   r is a bit like the remainder returned by mpn_divexact_by3c, and is the
   sort of remainder mpn_divexact_1 might return.  Like mpn_divexact_by3c,
   r represents a borrow, since effectively quotient limbs are chosen so
   that subtracting that multiple of d from src at each step will produce a
   zero limb.

   A long calculation can be done piece by piece from low to high by
   passing the return value from one part as the carry parameter to the
   next part.  The effective final k becomes anything between size and
   size-n, if n pieces are used.
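
   As a rough sketch of that calling pattern (just an illustration), a
   two-piece reduction of {src,size} might look like

       c = mpn_modexact_1c_odd (src, n, d, CNST_LIMB(0));
       c = mpn_modexact_1c_odd (src + n, size - n, d, c);

   with some split point 0 < n < size, the low piece done first.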


   A similar sort of routine could be constructed based on adding multiples
   of d at each limb, much like redc in mpz_powm does.  Subtracting however
   has a small advantage that when subtracting to cancel out l there's
   never a borrow into h, whereas using an addition would put a carry into
   h depending whether l==0 or l!=0.


   In terms of efficiency, this function is similar to a mul-by-inverse
   mpn_mod_1.  Both are essentially two multiplies and are best suited to
   CPUs with low latency multipliers (in comparison to a divide instruction
   at least).  But modexact has a few fewer supplementary operations, only
   needs low part and high part multiplies, and has fewer working
   quantities (helping CPUs with few registers).


   In the main loop it will be noted that the new carry (call it r) is the
   sum of the high product h and any borrow from l=s-c.  If c<d then we
   will have r<d too, for the following reasons.  Let q=l*inverse be the
   quotient limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS.  Now if
   h=d-1 then

       l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d

   But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence h=d-1
   never occurs together with a borrow, so r=h+borrow <= d-1.  (Without a
   borrow, r=h <= d-1 holds directly, since h<d always.)
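
   As a small worked instance of that bound (with an artificially tiny limb
   size, purely for illustration), take B=16 and d=7.  If h=d-1=6 then

       l = q*7 - 16*6 <= 15*7 - 96 = 9 = B-d

   whereas a borrow from l=s-c with c<=d-1=6 means l = s-c+B >= 16-6 = 10 =
   B-d+1, so in fact h<=5 and r=h+borrow <= 6 = d-1.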

   When c>=d, on the other hand, h=d-1 can certainly occur together with a
   borrow, thereby giving only r<=d, as per the function definition above.

   As a design decision it's left to the caller to check for r=d if it
   might be passing c>=d.  Several applications have c<d initially so the
   extra test is often unnecessary; for example the GCDs or a plain
   divisibility test d|a will pass c=0.


   The special case for size==1 is so that it can be assumed c<=d in the
   high<=divisor test at the end.  c<=d is only guaranteed after at least
   one iteration of the main loop.  There's also a decent chance one % is
   faster than a binvert_limb, though that will depend on the processor.

   A CPU specific implementation might want to omit the size==1 code or the
   high<=divisor test.  mpn/x86/k6/mode1o.asm for instance finds neither
   useful.  */


mp_limb_t
mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
                     mp_limb_t orig_c)
{
  mp_limb_t  s, h, l, inverse, dummy, dmul, ret;
  mp_limb_t  c = orig_c;
  mp_size_t  i;

  ASSERT (size >= 1);
  ASSERT (d & 1);
  ASSERT_MPN (src, size);
  ASSERT_LIMB (d);
  ASSERT_LIMB (c);

  if (size == 1)
    {
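      /* With size==1 the effective k is size-1 = 0, so the spec reduces to
         r = (c-s) mod d; both branches below compute that, taking whichever
         of s-c or c-s doesn't underflow.  */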
      s = src[0];
      if (s > c)
        {
          l = s-c;
          h = l % d;
          if (h != 0)
            h = d - h;
        }
      else
        {
          l = c-s;
          h = l % d;
        }
      return h;
    }


  binvert_limb (inverse, d);
  dmul = d << GMP_NAIL_BITS;

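  /* Main loop.  At each limb, l = s-c is formed with the borrow kept
     aside, the quotient limb q = l*inverse (mod B) is chosen so that q*d
     has low limb exactly l, and the new carry is the high limb of q*d plus
     that borrow.  dmul is d shifted up by the nail bits so umul_ppmm
     delivers that high limb directly.  */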
  i = 0;
  do
    {
      s = src[i];
      SUBC_LIMB (c, l, s, c);
      l = (l * inverse) & GMP_NUMB_MASK;
      umul_ppmm (h, dummy, l, dmul);
      c += h;
    }
  while (++i < size-1);


  s = src[i];
  if (s <= d)
    {
      /* With high<=d the final step can be a subtract and addback.  If
         c==0 then the addback will restore to l>=0.  If c==d then we will
         get l==d when s==0, but that's ok per the function definition.  */

      l = c - s;
      if (c < s)
        l += d;

      ret = l;
    }
  else
    {
      /* Can't skip a divide, just do the loop code once more.  */

      SUBC_LIMB (c, l, s, c);
      l = (l * inverse) & GMP_NUMB_MASK;
      umul_ppmm (h, dummy, l, dmul);
      c += h;
      ret = c;
    }

  ASSERT (orig_c < d ? ret < d : ret <= d);
  return ret;
}



#if 0

/* The following is an alternate form that might shave one cycle on a
   superscalar processor since it takes c+=h off the dependent chain,
   leaving just a low product, high product, and a subtract.

   This is for CPU specific implementations to consider.  A special case
   for high<divisor and/or size==1 can be added if desired.

   Notice that c is only ever 0 or 1, since if s-c produces a borrow then
   x=0xFF..FF and x-h cannot produce a borrow.  The c=(x>s) could become
   c=(x==0xFF..FF) too, if that helped.  */

mp_limb_t
mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
{
  mp_limb_t  s, x, y, inverse, dummy, dmul, c1, c2;
  mp_limb_t  c = 0;
  mp_size_t  i;

  ASSERT (size >= 1);
  ASSERT (d & 1);

  binvert_limb (inverse, d);
  dmul = d << GMP_NAIL_BITS;

  for (i = 0; i < size; i++)
    {
      ASSERT (c==0 || c==1);

      s = src[i];
      SUBC_LIMB (c1, x, s, c);

      SUBC_LIMB (c2, y, x, h);
      c = c1 + c2;

      y = (y * inverse) & GMP_NUMB_MASK;
      umul_ppmm (h, dummy, y, dmul);
    }

  h += c;
  return h;
}

#endif
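

/* An illustrative usage sketch (expository only; the name
   example_congruent_p is hypothetical and not part of GMP): a congruence
   test a==c mod d for odd d, using the r==0 or r==d condition described in
   the main comment.  The r==d check matters because c may be >= d here.  */

#if 0

static int
example_congruent_p (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t c)
{
  mp_limb_t  r;

  ASSERT (size >= 1);
  ASSERT (d & 1);

  r = mpn_modexact_1c_odd (src, size, d, c);
  return (r == 0 || r == d);
}

#endif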