Blame - third_party/gmp/mpn/generic/mul.c - RealtimeRoboticsGroup/test

blob: 37444e91b4768f672db4727ff401c0517d9de761 [file] [log] [blame]

Austin Schuh	bb1338c	2024-06-15 19:31:16 -0700	[diff] [blame]	1	/* mpn_mul -- Multiply two natural numbers.
				2
				3	Contributed to the GNU project by Torbjorn Granlund.
				4
				5	Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012,
				6	2014, 2019 Free Software Foundation, Inc.
				7
				8	This file is part of the GNU MP Library.
				9
				10	The GNU MP Library is free software; you can redistribute it and/or modify
				11	it under the terms of either:
				12
				13	* the GNU Lesser General Public License as published by the Free
				14	Software Foundation; either version 3 of the License, or (at your
				15	option) any later version.
				16
				17	or
				18
				19	* the GNU General Public License as published by the Free Software
				20	Foundation; either version 2 of the License, or (at your option) any
				21	later version.
				22
				23	or both in parallel, as here.
				24
				25	The GNU MP Library is distributed in the hope that it will be useful, but
				26	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
				27	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				28	for more details.
				29
				30	You should have received copies of the GNU General Public License and the
				31	GNU Lesser General Public License along with the GNU MP Library. If not,
				32	see https://www.gnu.org/licenses/. */
				33
				34	#include "gmp-impl.h"
				35
				36
				37	#ifndef MUL_BASECASE_MAX_UN
				38	#define MUL_BASECASE_MAX_UN 500
				39	#endif
				40
				41	/* Areas where the different toom algorithms can be called (extracted
				42	from the t-toom*.c files, and ignoring small constant offsets):
				43
				44	1/6 1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5 1 vn/un
				45	4/7 6/7
				46	6/11
				47	\|--------------------\| toom22 (small)
				48	\|\| toom22 (large)
				49	\|xxxx\| toom22 called
				50	\|-------------------------------------\| toom32
				51	\|xxxxxxxxxxxxxxxx\| \| toom32 called
				52	\|------------\| toom33
				53	\|x\| toom33 called
				54	\|---------------------------------\| \| toom42
				55	\|xxxxxxxxxxxxxxxxxxxxxxxx\| \| toom42 called
				56	\|--------------------\| toom43
				57	\|xxxxxxxxxx\| toom43 called
				58	\|-----------------------------\| toom52 (unused)
				59	\|--------\| toom44
				60	\|xxxxxxxx\| toom44 called
				61	\|--------------------\| \| toom53
				62	\|xxxxxx\| toom53 called
				63	\|-------------------------\| toom62 (unused)
				64	\|----------------\| toom54 (unused)
				65	\|--------------------\| toom63
				66	\|xxxxxxxxx\| \| toom63 called
				67	\|---------------------------------\| toom6h
				68	\|xxxxxxxx\| toom6h called
				69	\|-------------------------\| toom8h (32 bit)
				70	\|------------------------------------------\| toom8h (64 bit)
				71	\|xxxxxxxx\| toom8h called
				72	*/
				73
				74	#define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
				75	#define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
				76
				77	/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
				78	(pointed to by VP, with VN limbs), and store the result at PRODP. The
				79	result is UN + VN limbs. Return the most significant limb of the result.
				80
				81	NOTE: The space pointed to by PRODP is overwritten before finished with U
				82	and V, so overlap is an error.
				83
				84	Argument constraints:
				85	1. UN >= VN.
				86	2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
				87	the multiplier and the multiplicand. */
				88
				89	/*
				90	* The cutoff lines in the toomX2 and toomX3 code are now exactly between the
				91	ideal lines of the surrounding algorithms. Is that optimal?
				92
				93	* The toomX3 code now uses a structure similar to the one of toomX2, except
				94	that it loops longer in the unbalanced case. The result is that the
				95	remaining area might have un < vn. Should we fix the toomX2 code in a
				96	similar way?
				97
				98	* The toomX3 code is used for the largest non-FFT unbalanced operands. It
				99	therefore calls mpn_mul recursively for certain cases.
				100
				101	* Allocate static temp space using THRESHOLD variables (except for toom44
				102	when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all.
				103
				104	* We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
				105	have the same vn threshold. This is not true, we should actually use
				106	mul_basecase for slightly larger operands for toom32 than for toom22, and
				107	even larger for toom42.
				108
				109	* That problem is even more prevalent for toomX3. We therefore use special
				110	THRESHOLD variables there.
				111	*/
				112
				113	mp_limb_t
				114	mpn_mul (mp_ptr prodp,
				115	mp_srcptr up, mp_size_t un,
				116	mp_srcptr vp, mp_size_t vn)
				117	{
				118	ASSERT (un >= vn);
				119	ASSERT (vn >= 1);
				120	ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
				121	ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
				122
				123	if (BELOW_THRESHOLD (un, MUL_TOOM22_THRESHOLD))
				124	{
				125	/* When un (and thus vn) is below the toom22 range, do mul_basecase.
				126	Test un and not vn here not to thwart the un >> vn code below.
				127	This special case is not necessary, but cuts the overhead for the
				128	smallest operands. */
				129	mpn_mul_basecase (prodp, up, un, vp, vn);
				130	}
				131	else if (un == vn)
				132	{
				133	mpn_mul_n (prodp, up, vp, un);
				134	}
				135	else if (vn < MUL_TOOM22_THRESHOLD)
				136	{ /* plain schoolbook multiplication */
				137
				138	/* Unless un is very large, or else if have an applicable mpn_mul_N,
				139	perform basecase multiply directly. */
				140	if (un <= MUL_BASECASE_MAX_UN
				141	#if HAVE_NATIVE_mpn_mul_2
				142	\|\| vn <= 2
				143	#else
				144	\|\| vn == 1
				145	#endif
				146	)
				147	mpn_mul_basecase (prodp, up, un, vp, vn);
				148	else
				149	{
				150	/* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory
				151	locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
				152	these pieces with the vp[] operand. After each such partial
				153	multiplication (but the last) we copy the most significant vn
				154	limbs into a temporary buffer since that part would otherwise be
				155	overwritten by the next multiplication. After the next
				156	multiplication, we add it back. This illustrates the situation:
				157
				158	-->vn<--
				159	\| \|<------- un ------->\|
				160	_____________________\|
				161	X /\|
				162	/XX__________________/ \|
				163	_____________________ \|
				164	X / \|
				165	/XX__________________/ \|
				166	_____________________ \|
				167	/ / \|
				168	/____________________/ \|
				169	==================================================================
				170
				171	The parts marked with X are the parts whose sums are copied into
				172	the temporary buffer. */
				173
				174	mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
				175	mp_limb_t cy;
				176	ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
				177
				178	mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
				179	prodp += MUL_BASECASE_MAX_UN;
				180	MPN_COPY (tp, prodp, vn); /* preserve high triangle */
				181	up += MUL_BASECASE_MAX_UN;
				182	un -= MUL_BASECASE_MAX_UN;
				183	while (un > MUL_BASECASE_MAX_UN)
				184	{
				185	mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
				186	cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
				187	mpn_incr_u (prodp + vn, cy);
				188	prodp += MUL_BASECASE_MAX_UN;
				189	MPN_COPY (tp, prodp, vn); /* preserve high triangle */
				190	up += MUL_BASECASE_MAX_UN;
				191	un -= MUL_BASECASE_MAX_UN;
				192	}
				193	if (un > vn)
				194	{
				195	mpn_mul_basecase (prodp, up, un, vp, vn);
				196	}
				197	else
				198	{
				199	ASSERT (un > 0);
				200	mpn_mul_basecase (prodp, vp, vn, up, un);
				201	}
				202	cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
				203	mpn_incr_u (prodp + vn, cy);
				204	}
				205	}
				206	else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
				207	{
				208	/* Use ToomX2 variants */
				209	mp_ptr scratch;
				210	TMP_SDECL; TMP_SMARK;
				211
				212	#define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2)
				213	scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2);
				214	ASSERT (mpn_toom22_mul_itch ((5vn-1)/4, vn) <= ITCH_TOOMX2); / 5vn/2+ */
				215	ASSERT (mpn_toom32_mul_itch ((7vn-1)/4, vn) <= ITCH_TOOMX2); / 7vn/6+ */
				216	ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */
				217	#undef ITCH_TOOMX2
				218
				219	/* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
				220	square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly
				221	wise; we would get better balance by slightly moving the bound. We
				222	will sometimes end up with un < vn, like in the X3 arm below. */
				223	if (un >= 3 * vn)
				224	{
				225	mp_limb_t cy;
				226	mp_ptr ws;
				227
				228	/* The maximum ws usage is for the mpn_mul result. */
				229	ws = TMP_SALLOC_LIMBS (4 * vn);
				230
				231	mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
				232	un -= 2 * vn;
				233	up += 2 * vn;
				234	prodp += 2 * vn;
				235
				236	while (un >= 3 * vn)
				237	{
				238	mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
				239	un -= 2 * vn;
				240	up += 2 * vn;
				241	cy = mpn_add_n (prodp, prodp, ws, vn);
				242	MPN_COPY (prodp + vn, ws + vn, 2 * vn);
				243	mpn_incr_u (prodp + vn, cy);
				244	prodp += 2 * vn;
				245	}
				246
				247	/* vn <= un < 3vn */
				248
				249	if (4 * un < 5 * vn)
				250	mpn_toom22_mul (ws, up, un, vp, vn, scratch);
				251	else if (4 * un < 7 * vn)
				252	mpn_toom32_mul (ws, up, un, vp, vn, scratch);
				253	else
				254	mpn_toom42_mul (ws, up, un, vp, vn, scratch);
				255
				256	cy = mpn_add_n (prodp, prodp, ws, vn);
				257	MPN_COPY (prodp + vn, ws + vn, un);
				258	mpn_incr_u (prodp + vn, cy);
				259	}
				260	else
				261	{
				262	if (4 * un < 5 * vn)
				263	mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
				264	else if (4 * un < 7 * vn)
				265	mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
				266	else
				267	mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
				268	}
				269	TMP_SFREE;
				270	}
				271	else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) \|\|
				272	BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
				273	{
				274	/* Handle the largest operands that are not in the FFT range. The 2nd
				275	condition makes very unbalanced operands avoid the FFT code (except
				276	perhaps as coefficient products of the Toom code. */
				277
				278	if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) \|\| !TOOM44_OK (un, vn))
				279	{
				280	/* Use ToomX3 variants */
				281	mp_ptr scratch;
				282	TMP_DECL; TMP_MARK;
				283
				284	#define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS)
				285	scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3);
				286	ASSERT (mpn_toom33_mul_itch ((7vn-1)/6, vn) <= ITCH_TOOMX3); / 7vn/2+ */
				287	ASSERT (mpn_toom43_mul_itch ((3vn-1)/2, vn) <= ITCH_TOOMX3); / 9vn/4+ */
				288	ASSERT (mpn_toom32_mul_itch ((7vn-1)/4, vn) <= ITCH_TOOMX3); / 7vn/6+ */
				289	ASSERT (mpn_toom53_mul_itch ((11vn-1)/6, vn) <= ITCH_TOOMX3); / 11vn/3+ */
				290	ASSERT (mpn_toom42_mul_itch ((5vn-1)/2, vn) <= ITCH_TOOMX3); / 15vn/4+ */
				291	ASSERT (mpn_toom63_mul_itch ((5vn-1)/2, vn) <= ITCH_TOOMX3); / 15vn/4+ */
				292	#undef ITCH_TOOMX3
				293
				294	if (2 * un >= 5 * vn)
				295	{
				296	mp_limb_t cy;
				297	mp_ptr ws;
				298
				299	/* The maximum ws usage is for the mpn_mul result. */
				300	ws = TMP_ALLOC_LIMBS (7 * vn >> 1);
				301
				302	if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
				303	mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
				304	else
				305	mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
				306	un -= 2 * vn;
				307	up += 2 * vn;
				308	prodp += 2 * vn;
				309
				310	while (2 * un >= 5 * vn) /* un >= 2.5vn */
				311	{
				312	if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
				313	mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
				314	else
				315	mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
				316	un -= 2 * vn;
				317	up += 2 * vn;
				318	cy = mpn_add_n (prodp, prodp, ws, vn);
				319	MPN_COPY (prodp + vn, ws + vn, 2 * vn);
				320	mpn_incr_u (prodp + vn, cy);
				321	prodp += 2 * vn;
				322	}
				323
				324	/* vn / 2 <= un < 2.5vn */
				325
				326	if (un < vn)
				327	mpn_mul (ws, vp, vn, up, un);
				328	else
				329	mpn_mul (ws, up, un, vp, vn);
				330
				331	cy = mpn_add_n (prodp, prodp, ws, vn);
				332	MPN_COPY (prodp + vn, ws + vn, un);
				333	mpn_incr_u (prodp + vn, cy);
				334	}
				335	else
				336	{
				337	if (6 * un < 7 * vn)
				338	mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
				339	else if (2 * un < 3 * vn)
				340	{
				341	if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
				342	mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
				343	else
				344	mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
				345	}
				346	else if (6 * un < 11 * vn)
				347	{
				348	if (4 * un < 7 * vn)
				349	{
				350	if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
				351	mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
				352	else
				353	mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
				354	}
				355	else
				356	{
				357	if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
				358	mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
				359	else
				360	mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
				361	}
				362	}
				363	else
				364	{
				365	if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
				366	mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
				367	else
				368	mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
				369	}
				370	}
				371	TMP_FREE;
				372	}
				373	else
				374	{
				375	mp_ptr scratch;
				376	TMP_DECL; TMP_MARK;
				377
				378	if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
				379	{
				380	scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
				381	mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
				382	}
				383	else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
				384	{
				385	scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
				386	mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
				387	}
				388	else
				389	{
				390	scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
				391	mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
				392	}
				393	TMP_FREE;
				394	}
				395	}
				396	else
				397	{
				398	if (un >= 8 * vn)
				399	{
				400	mp_limb_t cy;
				401	mp_ptr ws;
				402	TMP_DECL; TMP_MARK;
				403
				404	/* The maximum ws usage is for the mpn_mul result. */
				405	ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
				406
				407	mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
				408	un -= 3 * vn;
				409	up += 3 * vn;
				410	prodp += 3 * vn;
				411
				412	while (2 * un >= 7 * vn) /* un >= 3.5vn */
				413	{
				414	mpn_fft_mul (ws, up, 3 * vn, vp, vn);
				415	un -= 3 * vn;
				416	up += 3 * vn;
				417	cy = mpn_add_n (prodp, prodp, ws, vn);
				418	MPN_COPY (prodp + vn, ws + vn, 3 * vn);
				419	mpn_incr_u (prodp + vn, cy);
				420	prodp += 3 * vn;
				421	}
				422
				423	/* vn / 2 <= un < 3.5vn */
				424
				425	if (un < vn)
				426	mpn_mul (ws, vp, vn, up, un);
				427	else
				428	mpn_mul (ws, up, un, vp, vn);
				429
				430	cy = mpn_add_n (prodp, prodp, ws, vn);
				431	MPN_COPY (prodp + vn, ws + vn, un);
				432	mpn_incr_u (prodp + vn, cy);
				433
				434	TMP_FREE;
				435	}
				436	else
				437	mpn_fft_mul (prodp, up, un, vp, vn);
				438	}
				439
				440	return prodp[un + vn - 1]; /* historic */
				441	}