Austin Schuh | dace2a6 | 2020-08-18 10:56:48 -0700 | [diff] [blame] | 1 | /* mpn_sqrlo -- squares an n-limb number and returns the low n limbs |
| 2 | of the result. |
| 3 | |
| 4 | Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. |
| 5 | |
| 6 | THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS |
| 7 | FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED |
| 8 | THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. |
| 9 | |
| 10 | Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc. |
| 11 | |
| 12 | This file is part of the GNU MP Library. |
| 13 | |
| 14 | The GNU MP Library is free software; you can redistribute it and/or modify |
| 15 | it under the terms of either: |
| 16 | |
| 17 | * the GNU Lesser General Public License as published by the Free |
| 18 | Software Foundation; either version 3 of the License, or (at your |
| 19 | option) any later version. |
| 20 | |
| 21 | or |
| 22 | |
| 23 | * the GNU General Public License as published by the Free Software |
| 24 | Foundation; either version 2 of the License, or (at your option) any |
| 25 | later version. |
| 26 | |
| 27 | or both in parallel, as here. |
| 28 | |
| 29 | The GNU MP Library is distributed in the hope that it will be useful, but |
| 30 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| 31 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 32 | for more details. |
| 33 | |
| 34 | You should have received copies of the GNU General Public License and the |
| 35 | GNU Lesser General Public License along with the GNU MP Library. If not, |
| 36 | see https://www.gnu.org/licenses/. */ |
| 37 | |
| 38 | #include "gmp-impl.h" |
| 39 | |
| 40 | #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY |
| 41 | #define MAYBE_range_basecase 1 |
| 42 | #define MAYBE_range_toom22 1 |
| 43 | #else |
| 44 | #define MAYBE_range_basecase \ |
| 45 | ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11)) |
| 46 | #define MAYBE_range_toom22 \ |
| 47 | ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) ) |
| 48 | #endif |
| 49 | |
| 50 | /* THINK: The DC strategy uses different constants in different Toom's |
| 51 | ranges. Something smoother? |
| 52 | */ |
| 53 | |
| 54 | /* |
| 55 | Compute the least significant half of the product {xy,n}*{yp,n}, or |
| 56 | formally {rp,n} = {xy,n}*{yp,n} Mod (B^n). |
| 57 | |
| 58 | Above the given threshold, the Divide and Conquer strategy is used. |
| 59 | The operand is split in two, and a full square plus a mullo |
| 60 | is used to obtain the final result. The more natural strategy is to |
| 61 | split in two halves, but this is far from optimal when a |
| 62 | sub-quadratic multiplication is used. |
| 63 | |
| 64 | Mulders suggests an unbalanced split in favour of the full product, |
| 65 | split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2. |
| 66 | |
| 67 | To compute the value of a, we assume that the cost of mullo for a |
| 68 | given size ML(n) is a fraction of the cost of a full product with |
| 69 | same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2; |
| 70 | then we can write: |
| 71 | |
| 72 | ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e |
| 73 | |
| 74 | Given a value for e, want to minimise the value of k, i.e. the |
| 75 | function k=(1-a)^e/(1-2*a^e). |
| 76 | |
| 77 | With e=2, the exponent for schoolbook multiplication, the minimum is |
| 78 | given by the values a=1-a=1/2. |
| 79 | |
| 80 | With e=log(3)/log(2), the exponent for Karatsuba (aka toom22), |
| 81 | Mulders compute (1-a) = 0.694... and we approximate a with 11/36. |
| 82 | |
| 83 | Other possible approximations follow: |
| 84 | e=log(5)/log(3) [Toom-3] -> a ~= 9/40 |
| 85 | e=log(7)/log(4) [Toom-4] -> a ~= 7/39 |
| 86 | e=log(11)/log(6) [Toom-6] -> a ~= 1/8 |
| 87 | e=log(15)/log(8) [Toom-8] -> a ~= 1/10 |
| 88 | |
| 89 | The values above where obtained with the following trivial commands |
| 90 | in the gp-pari shell: |
| 91 | |
| 92 | fun(e,a)=(1-a)^e/(1-2*a^e) |
| 93 | mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))} |
| 94 | contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5)) |
| 95 | contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5)) |
| 96 | contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5)) |
| 97 | contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3)) |
| 98 | contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3)) |
| 99 | |
| 100 | , |
| 101 | |\ |
| 102 | | \ |
| 103 | +----, |
| 104 | | | |
| 105 | | | |
| 106 | | |\ |
| 107 | | | \ |
| 108 | +----+--` |
| 109 | ^ n2 ^n1^ |
| 110 | |
| 111 | For an actual implementation, the assumption that M(n)=n^e is |
| 112 | incorrect, as a consequence also the assumption that ML(n)=k*M(n) |
| 113 | with a constant k is wrong. |
| 114 | |
| 115 | But theory suggest us two things: |
| 116 | - the best the multiplication product is (lower e), the more k |
| 117 | approaches 1, and a approaches 0. |
| 118 | |
| 119 | - A value for a smaller than optimal is probably less bad than a |
| 120 | bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal |
| 121 | value, and k(a)=0.808_ the mul/mullo speed ratio. We get |
| 122 | k(a+1/6)=0.929_ but k(a-1/6)=0.865_. |
| 123 | */ |
| 124 | |
| 125 | static mp_size_t |
| 126 | mpn_sqrlo_itch (mp_size_t n) |
| 127 | { |
| 128 | return 2*n; |
| 129 | } |
| 130 | |
| 131 | /* |
| 132 | mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp. |
| 133 | It accepts tp == rp. |
| 134 | */ |
| 135 | static void |
| 136 | mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp) |
| 137 | { |
| 138 | mp_size_t n2, n1; |
| 139 | ASSERT (n >= 2); |
| 140 | ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); |
| 141 | ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n)); |
| 142 | |
| 143 | /* Divide-and-conquer */ |
| 144 | |
| 145 | /* We need fractional approximation of the value 0 < a <= 1/2 |
| 146 | giving the minimum in the function k=(1-a)^e/(1-2*a^e). |
| 147 | */ |
| 148 | if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11))) |
| 149 | n1 = n >> 1; |
| 150 | else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11))) |
| 151 | n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */ |
| 152 | else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9))) |
| 153 | n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */ |
| 154 | else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9)) |
| 155 | n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */ |
| 156 | /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */ |
| 157 | else |
| 158 | n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */ |
| 159 | |
| 160 | n2 = n - n1; |
| 161 | |
| 162 | /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */ |
| 163 | |
| 164 | /* x0 ^ 2 */ |
| 165 | mpn_sqr (tp, xp, n2); |
| 166 | MPN_COPY (rp, tp, n2); |
| 167 | |
| 168 | /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */ |
| 169 | if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD)) |
| 170 | mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1); |
| 171 | else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD)) |
| 172 | mpn_mullo_basecase (tp + n, xp + n2, xp, n1); |
| 173 | else |
| 174 | mpn_mullo_n (tp + n, xp + n2, xp, n1); |
| 175 | /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */ |
| 176 | #if HAVE_NATIVE_mpn_addlsh1_n |
| 177 | mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1); |
| 178 | #else |
| 179 | mpn_lshift (rp + n2, tp + n, n1, 1); |
| 180 | mpn_add_n (rp + n2, rp + n2, tp + n2, n1); |
| 181 | #endif |
| 182 | } |
| 183 | |
| 184 | /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */ |
| 185 | #define SQR_BASECASE_ALLOC \ |
| 186 | (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT) |
| 187 | |
| 188 | /* FIXME: This function should accept a temporary area; dc_sqrlo |
| 189 | accepts a pointer tp, and handle the case tp == rp, do the same here. |
| 190 | */ |
| 191 | |
| 192 | void |
| 193 | mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n) |
| 194 | { |
| 195 | ASSERT (n >= 1); |
| 196 | ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); |
| 197 | |
| 198 | if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD)) |
| 199 | { |
| 200 | /* FIXME: smarter criteria? */ |
| 201 | #if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase |
| 202 | /* mullo computes as many products as sqr, but directly writes |
| 203 | on the result area. */ |
| 204 | mpn_mullo_basecase (rp, xp, xp, n); |
| 205 | #else |
| 206 | /* Allocate workspace of fixed size on stack: fast! */ |
| 207 | mp_limb_t tp[SQR_BASECASE_ALLOC]; |
| 208 | mpn_sqr_basecase (tp, xp, n); |
| 209 | MPN_COPY (rp, tp, n); |
| 210 | #endif |
| 211 | } |
| 212 | else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD)) |
| 213 | { |
| 214 | mpn_sqrlo_basecase (rp, xp, n); |
| 215 | } |
| 216 | else |
| 217 | { |
| 218 | mp_ptr tp; |
| 219 | TMP_DECL; |
| 220 | TMP_MARK; |
| 221 | tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n)); |
| 222 | if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD)) |
| 223 | { |
| 224 | mpn_dc_sqrlo (rp, xp, n, tp); |
| 225 | } |
| 226 | else |
| 227 | { |
| 228 | /* For really large operands, use plain mpn_mul_n but throw away upper n |
| 229 | limbs of result. */ |
| 230 | #if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD) |
| 231 | mpn_fft_mul (tp, xp, n, xp, n); |
| 232 | #else |
| 233 | mpn_sqr (tp, xp, n); |
| 234 | #endif |
| 235 | MPN_COPY (rp, tp, n); |
| 236 | } |
| 237 | TMP_FREE; |
| 238 | } |
| 239 | } |