Austin Schuh | dace2a6 | 2020-08-18 10:56:48 -0700 | [diff] [blame] | 1 | /* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square, |
| 2 | zero otherwise. |
| 3 | |
| 4 | Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software |
| 5 | Foundation, Inc. |
| 6 | |
| 7 | This file is part of the GNU MP Library. |
| 8 | |
| 9 | The GNU MP Library is free software; you can redistribute it and/or modify |
| 10 | it under the terms of either: |
| 11 | |
| 12 | * the GNU Lesser General Public License as published by the Free |
| 13 | Software Foundation; either version 3 of the License, or (at your |
| 14 | option) any later version. |
| 15 | |
| 16 | or |
| 17 | |
| 18 | * the GNU General Public License as published by the Free Software |
| 19 | Foundation; either version 2 of the License, or (at your option) any |
| 20 | later version. |
| 21 | |
| 22 | or both in parallel, as here. |
| 23 | |
| 24 | The GNU MP Library is distributed in the hope that it will be useful, but |
| 25 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| 26 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 27 | for more details. |
| 28 | |
| 29 | You should have received copies of the GNU General Public License and the |
| 30 | GNU Lesser General Public License along with the GNU MP Library. If not, |
| 31 | see https://www.gnu.org/licenses/. */ |
| 32 | |
| 33 | #include <stdio.h> /* for NULL */ |
| 34 | #include "gmp-impl.h" |
| 35 | #include "longlong.h" |
| 36 | |
| 37 | #include "perfsqr.h" |
| 38 | |
| 39 | |
| 40 | /* change this to "#define TRACE(x) x" for diagnostics */ |
| 41 | #define TRACE(x) |
| 42 | |
| 43 | |
| 44 | |
| 45 | /* PERFSQR_MOD_* detects non-squares using residue tests. |
| 46 | |
| 47 | A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h. It takes |
| 48 | {up,usize} modulo a selected modulus to get a remainder r. For 32-bit or |
| 49 | 64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34, |
| 50 | or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP |
| 51 | used. PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this |
| 52 | case too. |
| 53 | |
| 54 | PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or |
| 55 | PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table |
| 56 | data indicating residues and non-residues modulo those divisors. The |
| 57 | table data is in 1 or 2 limbs worth of bits respectively, per the size of |
| 58 | each d. |
| 59 | |
| 60 | A "modexact" style remainder is taken to reduce r modulo d. |
| 61 | PERFSQR_MOD_IDX implements this, producing an index "idx" for use with |
| 62 | the table data. Notice there's just one multiplication by a constant |
| 63 | "inv", for each d. |
| 64 | |
| 65 | The modexact doesn't produce a true r%d remainder, instead idx satisfies |
| 66 | "-(idx<<PERFSQR_MOD_BITS) == r mod d". Because d is odd, this factor |
| 67 | -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is |
| 68 | accounted for by having the table data suitably permuted. |
| 69 | |
| 70 | The remainder r fits within PERFSQR_MOD_BITS which is less than a limb. |
| 71 | In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit |
| 72 | each divisor d meaning the modexact multiply can take place entirely |
| 73 | within one limb, giving the compiler the chance to optimize it, in a way |
| 74 | that say umul_ppmm would not give. |
| 75 | |
| 76 | There's no need for the divisors d to be prime, in fact gen-psqr.c makes |
| 77 | a deliberate effort to combine factors so as to reduce the number of |
| 78 | separate tests done on r. But such combining is limited to d <= |
| 79 | 2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs. |
| 80 | |
| 81 | Alternatives: |
| 82 | |
| 83 | It'd be possible to use bigger divisors d, and more than 2 limbs of table |
| 84 | data, but this doesn't look like it would be of much help to the prime |
| 85 | factors in the usual moduli 2^24-1 or 2^48-1. |
| 86 | |
| 87 | The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're |
| 88 | just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime |
| 89 | factors. 2^32-1 and 2^64-1 would be equally easy to calculate, but have |
| 90 | fewer prime factors. |
| 91 | |
| 92 | The nails case usually ends up using mpn_mod_1, which is a lot slower |
| 93 | than mpn_mod_34lsub1. Perhaps other such special moduli could be found |
| 94 | for the nails case. Two-term things like 2^30-2^15-1 might be |
| 95 | candidates. Or at worst some on-the-fly de-nailing would allow the plain |
| 96 | 2^24-1 to be used. Currently nails are too preliminary to be worried |
| 97 | about. |
| 98 | |
| 99 | */ |
| 100 | |
| 101 | #define PERFSQR_MOD_MASK ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1) |
| 102 | |
| 103 | #define MOD34_BITS (GMP_NUMB_BITS / 4 * 3) |
| 104 | #define MOD34_MASK ((CNST_LIMB(1) << MOD34_BITS) - 1) |
| 105 | |
| 106 | #define PERFSQR_MOD_34(r, up, usize) \ |
| 107 | do { \ |
| 108 | (r) = mpn_mod_34lsub1 (up, usize); \ |
| 109 | (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS); \ |
| 110 | } while (0) |
| 111 | |
| 112 | /* FIXME: The %= here isn't good, and might destroy any savings from keeping |
| 113 | the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm). |
| 114 | Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor |
| 115 | and a shift count, like mpn_preinv_divrem_1. But mod_34lsub1 is our |
| 116 | normal case, so lets not worry too much about mod_1. */ |
| 117 | #define PERFSQR_MOD_PP(r, up, usize) \ |
| 118 | do { \ |
| 119 | if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD)) \ |
| 120 | { \ |
| 121 | (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM, \ |
| 122 | PERFSQR_PP_INVERTED); \ |
| 123 | (r) %= PERFSQR_PP; \ |
| 124 | } \ |
| 125 | else \ |
| 126 | { \ |
| 127 | (r) = mpn_mod_1 (up, usize, PERFSQR_PP); \ |
| 128 | } \ |
| 129 | } while (0) |
| 130 | |
| 131 | #define PERFSQR_MOD_IDX(idx, r, d, inv) \ |
| 132 | do { \ |
| 133 | mp_limb_t q; \ |
| 134 | ASSERT ((r) <= PERFSQR_MOD_MASK); \ |
| 135 | ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1); \ |
| 136 | ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK); \ |
| 137 | \ |
| 138 | q = ((r) * (inv)) & PERFSQR_MOD_MASK; \ |
| 139 | ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK)); \ |
| 140 | (idx) = (q * (d)) >> PERFSQR_MOD_BITS; \ |
| 141 | } while (0) |
| 142 | |
| 143 | #define PERFSQR_MOD_1(r, d, inv, mask) \ |
| 144 | do { \ |
| 145 | unsigned idx; \ |
| 146 | ASSERT ((d) <= GMP_LIMB_BITS); \ |
| 147 | PERFSQR_MOD_IDX(idx, r, d, inv); \ |
| 148 | TRACE (printf (" PERFSQR_MOD_1 d=%u r=%lu idx=%u\n", \ |
| 149 | d, r%d, idx)); \ |
| 150 | if ((((mask) >> idx) & 1) == 0) \ |
| 151 | { \ |
| 152 | TRACE (printf (" non-square\n")); \ |
| 153 | return 0; \ |
| 154 | } \ |
| 155 | } while (0) |
| 156 | |
| 157 | /* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the |
| 158 | sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch. */ |
| 159 | #define PERFSQR_MOD_2(r, d, inv, mhi, mlo) \ |
| 160 | do { \ |
| 161 | mp_limb_t m; \ |
| 162 | unsigned idx; \ |
| 163 | ASSERT ((d) <= 2*GMP_LIMB_BITS); \ |
| 164 | \ |
| 165 | PERFSQR_MOD_IDX (idx, r, d, inv); \ |
| 166 | TRACE (printf (" PERFSQR_MOD_2 d=%u r=%lu idx=%u\n", \ |
| 167 | d, r%d, idx)); \ |
| 168 | m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi)); \ |
| 169 | idx %= GMP_LIMB_BITS; \ |
| 170 | if (((m >> idx) & 1) == 0) \ |
| 171 | { \ |
| 172 | TRACE (printf (" non-square\n")); \ |
| 173 | return 0; \ |
| 174 | } \ |
| 175 | } while (0) |
| 176 | |
| 177 | |
| 178 | int |
| 179 | mpn_perfect_square_p (mp_srcptr up, mp_size_t usize) |
| 180 | { |
| 181 | ASSERT (usize >= 1); |
| 182 | |
| 183 | TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize)); |
| 184 | |
| 185 | /* The first test excludes 212/256 (82.8%) of the perfect square candidates |
| 186 | in O(1) time. */ |
| 187 | { |
| 188 | unsigned idx = up[0] % 0x100; |
| 189 | if (((sq_res_0x100[idx / GMP_LIMB_BITS] |
| 190 | >> (idx % GMP_LIMB_BITS)) & 1) == 0) |
| 191 | return 0; |
| 192 | } |
| 193 | |
| 194 | #if 0 |
| 195 | /* Check that we have even multiplicity of 2, and then check that the rest is |
| 196 | a possible perfect square. Leave disabled until we can determine this |
| 197 | really is an improvement. It it is, it could completely replace the |
| 198 | simple probe above, since this should throw out more non-squares, but at |
| 199 | the expense of somewhat more cycles. */ |
| 200 | { |
| 201 | mp_limb_t lo; |
| 202 | int cnt; |
| 203 | lo = up[0]; |
| 204 | while (lo == 0) |
| 205 | up++, lo = up[0], usize--; |
| 206 | count_trailing_zeros (cnt, lo); |
| 207 | if ((cnt & 1) != 0) |
| 208 | return 0; /* return of not even multiplicity of 2 */ |
| 209 | lo >>= cnt; /* shift down to align lowest non-zero bit */ |
| 210 | lo >>= 1; /* shift away lowest non-zero bit */ |
| 211 | if ((lo & 3) != 0) |
| 212 | return 0; |
| 213 | } |
| 214 | #endif |
| 215 | |
| 216 | |
| 217 | /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares |
| 218 | according to their residues modulo small primes (or powers of |
| 219 | primes). See perfsqr.h. */ |
| 220 | PERFSQR_MOD_TEST (up, usize); |
| 221 | |
| 222 | |
| 223 | /* For the third and last test, we finally compute the square root, |
| 224 | to make sure we've really got a perfect square. */ |
| 225 | { |
| 226 | mp_ptr root_ptr; |
| 227 | int res; |
| 228 | TMP_DECL; |
| 229 | |
| 230 | TMP_MARK; |
| 231 | root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2); |
| 232 | |
| 233 | /* Iff mpn_sqrtrem returns zero, the square is perfect. */ |
| 234 | res = ! mpn_sqrtrem (root_ptr, NULL, up, usize); |
| 235 | TMP_FREE; |
| 236 | |
| 237 | return res; |
| 238 | } |
| 239 | } |