Blame - third_party/gmp/mpn/generic/sqrlo.c - RealtimeRoboticsGroup/test

blob: 71530b6d6dded2f602731e725d219c6a4ea6ec57 [file] [log] [blame]

Austin Schuh	dace2a6	2020-08-18 10:56:48 -0700	[diff] [blame]	1	/* mpn_sqrlo -- squares an n-limb number and returns the low n limbs
				2	of the result.
				3
				4	Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
				5
				6	THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS
				7	FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
				8	THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
				9
				10	Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
				11
				12	This file is part of the GNU MP Library.
				13
				14	The GNU MP Library is free software; you can redistribute it and/or modify
				15	it under the terms of either:
				16
				17	* the GNU Lesser General Public License as published by the Free
				18	Software Foundation; either version 3 of the License, or (at your
				19	option) any later version.
				20
				21	or
				22
				23	* the GNU General Public License as published by the Free Software
				24	Foundation; either version 2 of the License, or (at your option) any
				25	later version.
				26
				27	or both in parallel, as here.
				28
				29	The GNU MP Library is distributed in the hope that it will be useful, but
				30	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
				31	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				32	for more details.
				33
				34	You should have received copies of the GNU General Public License and the
				35	GNU Lesser General Public License along with the GNU MP Library. If not,
				36	see https://www.gnu.org/licenses/. */
				37
				38	#include "gmp-impl.h"
				39
				40	#if TUNE_PROGRAM_BUILD \|\| WANT_FAT_BINARY
				41	#define MAYBE_range_basecase 1
				42	#define MAYBE_range_toom22 1
				43	#else
				44	#define MAYBE_range_basecase \
				45	((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11))
				46	#define MAYBE_range_toom22 \
				47	((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) )
				48	#endif
				49
				50	/* THINK: The DC strategy uses different constants in different Toom's
				51	ranges. Something smoother?
				52	*/
				53
				54	/*
				55	Compute the least significant half of the product {xy,n}*{yp,n}, or
				56	formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
				57
				58	Above the given threshold, the Divide and Conquer strategy is used.
				59	The operand is split in two, and a full square plus a mullo
				60	is used to obtain the final result. The more natural strategy is to
				61	split in two halves, but this is far from optimal when a
				62	sub-quadratic multiplication is used.
				63
				64	Mulders suggests an unbalanced split in favour of the full product,
				65	split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
				66
				67	To compute the value of a, we assume that the cost of mullo for a
				68	given size ML(n) is a fraction of the cost of a full product with
				69	same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
				70	then we can write:
				71
				72	ML(n) = 2ML(an) + M((1-a)n) => kM(n) = 2kM(n)a^e + M(n)(1-a)^e
				73
				74	Given a value for e, want to minimise the value of k, i.e. the
				75	function k=(1-a)^e/(1-2*a^e).
				76
				77	With e=2, the exponent for schoolbook multiplication, the minimum is
				78	given by the values a=1-a=1/2.
				79
				80	With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
				81	Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
				82
				83	Other possible approximations follow:
				84	e=log(5)/log(3) [Toom-3] -> a ~= 9/40
				85	e=log(7)/log(4) [Toom-4] -> a ~= 7/39
				86	e=log(11)/log(6) [Toom-6] -> a ~= 1/8
				87	e=log(15)/log(8) [Toom-8] -> a ~= 1/10
				88
				89	The values above where obtained with the following trivial commands
				90	in the gp-pari shell:
				91
				92	fun(e,a)=(1-a)^e/(1-2*a^e)
				93	mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
				94	contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
				95	contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
				96	contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
				97	contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
				98	contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
				99
				100	,
				101	\|\
				102	\| \
				103	+----,
				104	\| \|
				105	\| \|
				106	\| \|\
				107	\| \| \
				108	+----+--`
				109	^ n2 ^n1^
				110
				111	For an actual implementation, the assumption that M(n)=n^e is
				112	incorrect, as a consequence also the assumption that ML(n)=k*M(n)
				113	with a constant k is wrong.
				114
				115	But theory suggest us two things:
				116	- the best the multiplication product is (lower e), the more k
				117	approaches 1, and a approaches 0.
				118
				119	- A value for a smaller than optimal is probably less bad than a
				120	bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
				121	value, and k(a)=0.808_ the mul/mullo speed ratio. We get
				122	k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
				123	*/
				124
				125	static mp_size_t
				126	mpn_sqrlo_itch (mp_size_t n)
				127	{
				128	return 2*n;
				129	}
				130
				131	/*
				132	mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp.
				133	It accepts tp == rp.
				134	*/
				135	static void
				136	mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
				137	{
				138	mp_size_t n2, n1;
				139	ASSERT (n >= 2);
				140	ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
				141	ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
				142
				143	/* Divide-and-conquer */
				144
				145	/* We need fractional approximation of the value 0 < a <= 1/2
				146	giving the minimum in the function k=(1-a)^e/(1-2*a^e).
				147	*/
				148	if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
				149	n1 = n >> 1;
				150	else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
				151	n1 = n * 11 / (size_t) 36; /* n1 ~= n(1-.694...) /
				152	else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9)))
				153	n1 = n * 9 / (size_t) 40; /* n1 ~= n(1-.775...) /
				154	else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9))
				155	n1 = n * 7 / (size_t) 39; /* n1 ~= n(1-.821...) /
				156	/* n1 = n * 4 / (size_t) 31; // n1 ~= n(1-.871...) [TOOM66] /
				157	else
				158	n1 = n / (size_t) 10; /* n1 ~= n(1-.899...) [TOOM88] /
				159
				160	n2 = n - n1;
				161
				162	/* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */
				163
				164	/* x0 ^ 2 */
				165	mpn_sqr (tp, xp, n2);
				166	MPN_COPY (rp, tp, n2);
				167
				168	/* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */
				169	if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
				170	mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1);
				171	else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
				172	mpn_mullo_basecase (tp + n, xp + n2, xp, n1);
				173	else
				174	mpn_mullo_n (tp + n, xp + n2, xp, n1);
				175	/* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */
				176	#if HAVE_NATIVE_mpn_addlsh1_n
				177	mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1);
				178	#else
				179	mpn_lshift (rp + n2, tp + n, n1, 1);
				180	mpn_add_n (rp + n2, rp + n2, tp + n2, n1);
				181	#endif
				182	}
				183
				184	/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */
				185	#define SQR_BASECASE_ALLOC \
				186	(SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT)
				187
				188	/* FIXME: This function should accept a temporary area; dc_sqrlo
				189	accepts a pointer tp, and handle the case tp == rp, do the same here.
				190	*/
				191
				192	void
				193	mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n)
				194	{
				195	ASSERT (n >= 1);
				196	ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
				197
				198	if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD))
				199	{
				200	/* FIXME: smarter criteria? */
				201	#if HAVE_NATIVE_mpn_mullo_basecase \|\| ! HAVE_NATIVE_mpn_sqr_basecase
				202	/* mullo computes as many products as sqr, but directly writes
				203	on the result area. */
				204	mpn_mullo_basecase (rp, xp, xp, n);
				205	#else
				206	/* Allocate workspace of fixed size on stack: fast! */
				207	mp_limb_t tp[SQR_BASECASE_ALLOC];
				208	mpn_sqr_basecase (tp, xp, n);
				209	MPN_COPY (rp, tp, n);
				210	#endif
				211	}
				212	else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD))
				213	{
				214	mpn_sqrlo_basecase (rp, xp, n);
				215	}
				216	else
				217	{
				218	mp_ptr tp;
				219	TMP_DECL;
				220	TMP_MARK;
				221	tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n));
				222	if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD))
				223	{
				224	mpn_dc_sqrlo (rp, xp, n, tp);
				225	}
				226	else
				227	{
				228	/* For really large operands, use plain mpn_mul_n but throw away upper n
				229	limbs of result. */
				230	#if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD)
				231	mpn_fft_mul (tp, xp, n, xp, n);
				232	#else
				233	mpn_sqr (tp, xp, n);
				234	#endif
				235	MPN_COPY (rp, tp, n);
				236	}
				237	TMP_FREE;
				238	}
				239	}