blob: 71530b6d6dded2f602731e725d219c6a4ea6ec57 [file] [log] [blame]
Austin Schuhdace2a62020-08-18 10:56:48 -07001/* mpn_sqrlo -- squares an n-limb number and returns the low n limbs
2 of the result.
3
4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
5
6 THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS
7 FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
8 THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9
10Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
11
12This file is part of the GNU MP Library.
13
14The GNU MP Library is free software; you can redistribute it and/or modify
15it under the terms of either:
16
17 * the GNU Lesser General Public License as published by the Free
18 Software Foundation; either version 3 of the License, or (at your
19 option) any later version.
20
21or
22
23 * the GNU General Public License as published by the Free Software
24 Foundation; either version 2 of the License, or (at your option) any
25 later version.
26
27or both in parallel, as here.
28
29The GNU MP Library is distributed in the hope that it will be useful, but
30WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
32for more details.
33
34You should have received copies of the GNU General Public License and the
35GNU Lesser General Public License along with the GNU MP Library. If not,
36see https://www.gnu.org/licenses/. */
37
38#include "gmp-impl.h"
39
40#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
41#define MAYBE_range_basecase 1
42#define MAYBE_range_toom22 1
43#else
44#define MAYBE_range_basecase \
45 ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11))
46#define MAYBE_range_toom22 \
47 ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) )
48#endif
49
50/* THINK: The DC strategy uses different constants in different Toom's
51 ranges. Something smoother?
52*/
53
54/*
55 Compute the least significant half of the product {xy,n}*{yp,n}, or
56 formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
57
58 Above the given threshold, the Divide and Conquer strategy is used.
59 The operand is split in two, and a full square plus a mullo
60 is used to obtain the final result. The more natural strategy is to
61 split in two halves, but this is far from optimal when a
62 sub-quadratic multiplication is used.
63
64 Mulders suggests an unbalanced split in favour of the full product,
65 split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
66
67 To compute the value of a, we assume that the cost of mullo for a
68 given size ML(n) is a fraction of the cost of a full product with
69 same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
70 then we can write:
71
72 ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
73
74 Given a value for e, want to minimise the value of k, i.e. the
75 function k=(1-a)^e/(1-2*a^e).
76
77 With e=2, the exponent for schoolbook multiplication, the minimum is
78 given by the values a=1-a=1/2.
79
80 With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
81 Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
82
83 Other possible approximations follow:
84 e=log(5)/log(3) [Toom-3] -> a ~= 9/40
85 e=log(7)/log(4) [Toom-4] -> a ~= 7/39
86 e=log(11)/log(6) [Toom-6] -> a ~= 1/8
87 e=log(15)/log(8) [Toom-8] -> a ~= 1/10
88
89 The values above where obtained with the following trivial commands
90 in the gp-pari shell:
91
92fun(e,a)=(1-a)^e/(1-2*a^e)
93mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
94contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
95contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
96contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
97contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
98contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
99
100 ,
101 |\
102 | \
103 +----,
104 | |
105 | |
106 | |\
107 | | \
108 +----+--`
109 ^ n2 ^n1^
110
111 For an actual implementation, the assumption that M(n)=n^e is
112 incorrect, as a consequence also the assumption that ML(n)=k*M(n)
113 with a constant k is wrong.
114
115 But theory suggest us two things:
116 - the best the multiplication product is (lower e), the more k
117 approaches 1, and a approaches 0.
118
119 - A value for a smaller than optimal is probably less bad than a
120 bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
121 value, and k(a)=0.808_ the mul/mullo speed ratio. We get
122 k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
123*/
124
125static mp_size_t
126mpn_sqrlo_itch (mp_size_t n)
127{
128 return 2*n;
129}
130
131/*
132 mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp.
133 It accepts tp == rp.
134*/
135static void
136mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
137{
138 mp_size_t n2, n1;
139 ASSERT (n >= 2);
140 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
141 ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
142
143 /* Divide-and-conquer */
144
145 /* We need fractional approximation of the value 0 < a <= 1/2
146 giving the minimum in the function k=(1-a)^e/(1-2*a^e).
147 */
148 if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
149 n1 = n >> 1;
150 else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
151 n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */
152 else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9)))
153 n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */
154 else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9))
155 n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */
156 /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */
157 else
158 n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */
159
160 n2 = n - n1;
161
162 /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */
163
164 /* x0 ^ 2 */
165 mpn_sqr (tp, xp, n2);
166 MPN_COPY (rp, tp, n2);
167
168 /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */
169 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
170 mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1);
171 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
172 mpn_mullo_basecase (tp + n, xp + n2, xp, n1);
173 else
174 mpn_mullo_n (tp + n, xp + n2, xp, n1);
175 /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */
176#if HAVE_NATIVE_mpn_addlsh1_n
177 mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1);
178#else
179 mpn_lshift (rp + n2, tp + n, n1, 1);
180 mpn_add_n (rp + n2, rp + n2, tp + n2, n1);
181#endif
182}
183
184/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */
185#define SQR_BASECASE_ALLOC \
186 (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT)
187
188/* FIXME: This function should accept a temporary area; dc_sqrlo
189 accepts a pointer tp, and handle the case tp == rp, do the same here.
190*/
191
192void
193mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n)
194{
195 ASSERT (n >= 1);
196 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
197
198 if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD))
199 {
200 /* FIXME: smarter criteria? */
201#if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase
202 /* mullo computes as many products as sqr, but directly writes
203 on the result area. */
204 mpn_mullo_basecase (rp, xp, xp, n);
205#else
206 /* Allocate workspace of fixed size on stack: fast! */
207 mp_limb_t tp[SQR_BASECASE_ALLOC];
208 mpn_sqr_basecase (tp, xp, n);
209 MPN_COPY (rp, tp, n);
210#endif
211 }
212 else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD))
213 {
214 mpn_sqrlo_basecase (rp, xp, n);
215 }
216 else
217 {
218 mp_ptr tp;
219 TMP_DECL;
220 TMP_MARK;
221 tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n));
222 if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD))
223 {
224 mpn_dc_sqrlo (rp, xp, n, tp);
225 }
226 else
227 {
228 /* For really large operands, use plain mpn_mul_n but throw away upper n
229 limbs of result. */
230#if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD)
231 mpn_fft_mul (tp, xp, n, xp, n);
232#else
233 mpn_sqr (tp, xp, n);
234#endif
235 MPN_COPY (rp, tp, n);
236 }
237 TMP_FREE;
238 }
239}