blob: 37444e91b4768f672db4727ff401c0517d9de761 [file] [log] [blame]
Austin Schuhbb1338c2024-06-15 19:31:16 -07001/* mpn_mul -- Multiply two natural numbers.
2
3 Contributed to the GNU project by Torbjorn Granlund.
4
5Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012,
62014, 2019 Free Software Foundation, Inc.
7
8This file is part of the GNU MP Library.
9
10The GNU MP Library is free software; you can redistribute it and/or modify
11it under the terms of either:
12
13 * the GNU Lesser General Public License as published by the Free
14 Software Foundation; either version 3 of the License, or (at your
15 option) any later version.
16
17or
18
19 * the GNU General Public License as published by the Free Software
20 Foundation; either version 2 of the License, or (at your option) any
21 later version.
22
23or both in parallel, as here.
24
25The GNU MP Library is distributed in the hope that it will be useful, but
26WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28for more details.
29
30You should have received copies of the GNU General Public License and the
31GNU Lesser General Public License along with the GNU MP Library. If not,
32see https://www.gnu.org/licenses/. */
33
34#include "gmp-impl.h"
35
36
37#ifndef MUL_BASECASE_MAX_UN
38#define MUL_BASECASE_MAX_UN 500
39#endif
40
41/* Areas where the different toom algorithms can be called (extracted
42 from the t-toom*.c files, and ignoring small constant offsets):
43
44 1/6 1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5 1 vn/un
45 4/7 6/7
46 6/11
47 |--------------------| toom22 (small)
48 || toom22 (large)
49 |xxxx| toom22 called
50 |-------------------------------------| toom32
51 |xxxxxxxxxxxxxxxx| | toom32 called
52 |------------| toom33
53 |x| toom33 called
54 |---------------------------------| | toom42
55 |xxxxxxxxxxxxxxxxxxxxxxxx| | toom42 called
56 |--------------------| toom43
57 |xxxxxxxxxx| toom43 called
58 |-----------------------------| toom52 (unused)
59 |--------| toom44
60 |xxxxxxxx| toom44 called
61 |--------------------| | toom53
62 |xxxxxx| toom53 called
63 |-------------------------| toom62 (unused)
64 |----------------| toom54 (unused)
65 |--------------------| toom63
66 |xxxxxxxxx| | toom63 called
67 |---------------------------------| toom6h
68 |xxxxxxxx| toom6h called
69 |-------------------------| toom8h (32 bit)
70 |------------------------------------------| toom8h (64 bit)
71 |xxxxxxxx| toom8h called
72*/
73
74#define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
75#define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
76
77/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
78 (pointed to by VP, with VN limbs), and store the result at PRODP. The
79 result is UN + VN limbs. Return the most significant limb of the result.
80
81 NOTE: The space pointed to by PRODP is overwritten before finished with U
82 and V, so overlap is an error.
83
84 Argument constraints:
85 1. UN >= VN.
86 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
87 the multiplier and the multiplicand. */
88
89/*
90 * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
91 ideal lines of the surrounding algorithms. Is that optimal?
92
93 * The toomX3 code now uses a structure similar to the one of toomX2, except
94 that it loops longer in the unbalanced case. The result is that the
95 remaining area might have un < vn. Should we fix the toomX2 code in a
96 similar way?
97
98 * The toomX3 code is used for the largest non-FFT unbalanced operands. It
99 therefore calls mpn_mul recursively for certain cases.
100
101 * Allocate static temp space using THRESHOLD variables (except for toom44
102 when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all.
103
104 * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
105 have the same vn threshold. This is not true, we should actually use
106 mul_basecase for slightly larger operands for toom32 than for toom22, and
107 even larger for toom42.
108
109 * That problem is even more prevalent for toomX3. We therefore use special
110 THRESHOLD variables there.
111*/
112
113mp_limb_t
114mpn_mul (mp_ptr prodp,
115 mp_srcptr up, mp_size_t un,
116 mp_srcptr vp, mp_size_t vn)
117{
118 ASSERT (un >= vn);
119 ASSERT (vn >= 1);
120 ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
121 ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
122
123 if (BELOW_THRESHOLD (un, MUL_TOOM22_THRESHOLD))
124 {
125 /* When un (and thus vn) is below the toom22 range, do mul_basecase.
126 Test un and not vn here not to thwart the un >> vn code below.
127 This special case is not necessary, but cuts the overhead for the
128 smallest operands. */
129 mpn_mul_basecase (prodp, up, un, vp, vn);
130 }
131 else if (un == vn)
132 {
133 mpn_mul_n (prodp, up, vp, un);
134 }
135 else if (vn < MUL_TOOM22_THRESHOLD)
136 { /* plain schoolbook multiplication */
137
138 /* Unless un is very large, or else if have an applicable mpn_mul_N,
139 perform basecase multiply directly. */
140 if (un <= MUL_BASECASE_MAX_UN
141#if HAVE_NATIVE_mpn_mul_2
142 || vn <= 2
143#else
144 || vn == 1
145#endif
146 )
147 mpn_mul_basecase (prodp, up, un, vp, vn);
148 else
149 {
150 /* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory
151 locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
152 these pieces with the vp[] operand. After each such partial
153 multiplication (but the last) we copy the most significant vn
154 limbs into a temporary buffer since that part would otherwise be
155 overwritten by the next multiplication. After the next
156 multiplication, we add it back. This illustrates the situation:
157
158 -->vn<--
159 | |<------- un ------->|
160 _____________________|
161 X /|
162 /XX__________________/ |
163 _____________________ |
164 X / |
165 /XX__________________/ |
166 _____________________ |
167 / / |
168 /____________________/ |
169 ==================================================================
170
171 The parts marked with X are the parts whose sums are copied into
172 the temporary buffer. */
173
174 mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
175 mp_limb_t cy;
176 ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
177
178 mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
179 prodp += MUL_BASECASE_MAX_UN;
180 MPN_COPY (tp, prodp, vn); /* preserve high triangle */
181 up += MUL_BASECASE_MAX_UN;
182 un -= MUL_BASECASE_MAX_UN;
183 while (un > MUL_BASECASE_MAX_UN)
184 {
185 mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
186 cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
187 mpn_incr_u (prodp + vn, cy);
188 prodp += MUL_BASECASE_MAX_UN;
189 MPN_COPY (tp, prodp, vn); /* preserve high triangle */
190 up += MUL_BASECASE_MAX_UN;
191 un -= MUL_BASECASE_MAX_UN;
192 }
193 if (un > vn)
194 {
195 mpn_mul_basecase (prodp, up, un, vp, vn);
196 }
197 else
198 {
199 ASSERT (un > 0);
200 mpn_mul_basecase (prodp, vp, vn, up, un);
201 }
202 cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
203 mpn_incr_u (prodp + vn, cy);
204 }
205 }
206 else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
207 {
208 /* Use ToomX2 variants */
209 mp_ptr scratch;
210 TMP_SDECL; TMP_SMARK;
211
212#define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2)
213 scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2);
214 ASSERT (mpn_toom22_mul_itch ((5*vn-1)/4, vn) <= ITCH_TOOMX2); /* 5vn/2+ */
215 ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX2); /* 7vn/6+ */
216 ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */
217#undef ITCH_TOOMX2
218
219 /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
220 square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly
221 wise; we would get better balance by slightly moving the bound. We
222 will sometimes end up with un < vn, like in the X3 arm below. */
223 if (un >= 3 * vn)
224 {
225 mp_limb_t cy;
226 mp_ptr ws;
227
228 /* The maximum ws usage is for the mpn_mul result. */
229 ws = TMP_SALLOC_LIMBS (4 * vn);
230
231 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
232 un -= 2 * vn;
233 up += 2 * vn;
234 prodp += 2 * vn;
235
236 while (un >= 3 * vn)
237 {
238 mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
239 un -= 2 * vn;
240 up += 2 * vn;
241 cy = mpn_add_n (prodp, prodp, ws, vn);
242 MPN_COPY (prodp + vn, ws + vn, 2 * vn);
243 mpn_incr_u (prodp + vn, cy);
244 prodp += 2 * vn;
245 }
246
247 /* vn <= un < 3vn */
248
249 if (4 * un < 5 * vn)
250 mpn_toom22_mul (ws, up, un, vp, vn, scratch);
251 else if (4 * un < 7 * vn)
252 mpn_toom32_mul (ws, up, un, vp, vn, scratch);
253 else
254 mpn_toom42_mul (ws, up, un, vp, vn, scratch);
255
256 cy = mpn_add_n (prodp, prodp, ws, vn);
257 MPN_COPY (prodp + vn, ws + vn, un);
258 mpn_incr_u (prodp + vn, cy);
259 }
260 else
261 {
262 if (4 * un < 5 * vn)
263 mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
264 else if (4 * un < 7 * vn)
265 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
266 else
267 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
268 }
269 TMP_SFREE;
270 }
271 else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
272 BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
273 {
274 /* Handle the largest operands that are not in the FFT range. The 2nd
275 condition makes very unbalanced operands avoid the FFT code (except
276 perhaps as coefficient products of the Toom code. */
277
278 if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
279 {
280 /* Use ToomX3 variants */
281 mp_ptr scratch;
282 TMP_DECL; TMP_MARK;
283
284#define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS)
285 scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3);
286 ASSERT (mpn_toom33_mul_itch ((7*vn-1)/6, vn) <= ITCH_TOOMX3); /* 7vn/2+ */
287 ASSERT (mpn_toom43_mul_itch ((3*vn-1)/2, vn) <= ITCH_TOOMX3); /* 9vn/4+ */
288 ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX3); /* 7vn/6+ */
289 ASSERT (mpn_toom53_mul_itch ((11*vn-1)/6, vn) <= ITCH_TOOMX3); /* 11vn/3+ */
290 ASSERT (mpn_toom42_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
291 ASSERT (mpn_toom63_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
292#undef ITCH_TOOMX3
293
294 if (2 * un >= 5 * vn)
295 {
296 mp_limb_t cy;
297 mp_ptr ws;
298
299 /* The maximum ws usage is for the mpn_mul result. */
300 ws = TMP_ALLOC_LIMBS (7 * vn >> 1);
301
302 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
303 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
304 else
305 mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
306 un -= 2 * vn;
307 up += 2 * vn;
308 prodp += 2 * vn;
309
310 while (2 * un >= 5 * vn) /* un >= 2.5vn */
311 {
312 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
313 mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
314 else
315 mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
316 un -= 2 * vn;
317 up += 2 * vn;
318 cy = mpn_add_n (prodp, prodp, ws, vn);
319 MPN_COPY (prodp + vn, ws + vn, 2 * vn);
320 mpn_incr_u (prodp + vn, cy);
321 prodp += 2 * vn;
322 }
323
324 /* vn / 2 <= un < 2.5vn */
325
326 if (un < vn)
327 mpn_mul (ws, vp, vn, up, un);
328 else
329 mpn_mul (ws, up, un, vp, vn);
330
331 cy = mpn_add_n (prodp, prodp, ws, vn);
332 MPN_COPY (prodp + vn, ws + vn, un);
333 mpn_incr_u (prodp + vn, cy);
334 }
335 else
336 {
337 if (6 * un < 7 * vn)
338 mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
339 else if (2 * un < 3 * vn)
340 {
341 if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
342 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
343 else
344 mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
345 }
346 else if (6 * un < 11 * vn)
347 {
348 if (4 * un < 7 * vn)
349 {
350 if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
351 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
352 else
353 mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
354 }
355 else
356 {
357 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
358 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
359 else
360 mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
361 }
362 }
363 else
364 {
365 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
366 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
367 else
368 mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
369 }
370 }
371 TMP_FREE;
372 }
373 else
374 {
375 mp_ptr scratch;
376 TMP_DECL; TMP_MARK;
377
378 if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
379 {
380 scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
381 mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
382 }
383 else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
384 {
385 scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
386 mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
387 }
388 else
389 {
390 scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
391 mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
392 }
393 TMP_FREE;
394 }
395 }
396 else
397 {
398 if (un >= 8 * vn)
399 {
400 mp_limb_t cy;
401 mp_ptr ws;
402 TMP_DECL; TMP_MARK;
403
404 /* The maximum ws usage is for the mpn_mul result. */
405 ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
406
407 mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
408 un -= 3 * vn;
409 up += 3 * vn;
410 prodp += 3 * vn;
411
412 while (2 * un >= 7 * vn) /* un >= 3.5vn */
413 {
414 mpn_fft_mul (ws, up, 3 * vn, vp, vn);
415 un -= 3 * vn;
416 up += 3 * vn;
417 cy = mpn_add_n (prodp, prodp, ws, vn);
418 MPN_COPY (prodp + vn, ws + vn, 3 * vn);
419 mpn_incr_u (prodp + vn, cy);
420 prodp += 3 * vn;
421 }
422
423 /* vn / 2 <= un < 3.5vn */
424
425 if (un < vn)
426 mpn_mul (ws, vp, vn, up, un);
427 else
428 mpn_mul (ws, up, un, vp, vn);
429
430 cy = mpn_add_n (prodp, prodp, ws, vn);
431 MPN_COPY (prodp + vn, ws + vn, un);
432 mpn_incr_u (prodp + vn, cy);
433
434 TMP_FREE;
435 }
436 else
437 mpn_fft_mul (prodp, up, un, vp, vn);
438 }
439
440 return prodp[un + vn - 1]; /* historic */
441}