third_party/gmp/mpn/generic/sqrlo.c - RealtimeRoboticsGroup/test - Gitiles

 /* mpn_sqrlo -- squares an n-limb number and returns the low n limbs
    of the result.

    Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

    THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
    FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
    THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

 Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc.

 This file is part of the GNU MP Library.

 The GNU MP Library is free software; you can redistribute it and/or modify
 it under the terms of either:

   * the GNU Lesser General Public License as published by the Free
     Software Foundation; either version 3 of the License, or (at your
     option) any later version.

 or

   * the GNU General Public License as published by the Free Software
     Foundation; either version 2 of the License, or (at your option) any
     later version.

 or both in parallel, as here.

 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 for more details.

 You should have received copies of the GNU General Public License and the
 GNU Lesser General Public License along with the GNU MP Library.  If not,
 see https://www.gnu.org/licenses/.  */

 #include "gmp-impl.h"

 #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
 #define MAYBE_range_basecase 1
 #define MAYBE_range_toom22   1
 #else
 #define MAYBE_range_basecase                                           \
   ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11))
 #define MAYBE_range_toom22                                             \
   ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) )
 #endif

 /*  THINK: The DC strategy uses different constants in different Toom's
 	 ranges. Something smoother?
 */

 /*
   Compute the least significant half of the product {xy,n}*{yp,n}, or
   formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).

   Above the given threshold, the Divide and Conquer strategy is used.
   The operand is split in two, and a full square plus a mullo
   is used to obtain the final result. The more natural strategy is to
   split in two halves, but this is far from optimal when a
   sub-quadratic multiplication is used.

   Mulders suggests an unbalanced split in favour of the full product,
   split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.

   To compute the value of a, we assume that the cost of mullo for a
   given size ML(n) is a fraction of the cost of a full product with
   same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
   then we can write:

   ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e

   Given a value for e, want to minimise the value of k, i.e. the
   function k=(1-a)^e/(1-2*a^e).

   With e=2, the exponent for schoolbook multiplication, the minimum is
   given by the values a=1-a=1/2.

   With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
   Mulders compute (1-a) = 0.694... and we approximate a with 11/36.

   Other possible approximations follow:
   e=log(5)/log(3) [Toom-3] -> a ~= 9/40
   e=log(7)/log(4) [Toom-4] -> a ~= 7/39
   e=log(11)/log(6) [Toom-6] -> a ~= 1/8
   e=log(15)/log(8) [Toom-8] -> a ~= 1/10

   The values above where obtained with the following trivial commands
   in the gp-pari shell:

 fun(e,a)=(1-a)^e/(1-2*a^e)
 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))

   ,
   |\
   | \
   +----,
   |    |
   |    |
   |    |\
   |    | \
   +----+--`
   ^ n2 ^n1^

   For an actual implementation, the assumption that M(n)=n^e is
   incorrect, as a consequence also the assumption that ML(n)=k*M(n)
   with a constant k is wrong.

   But theory suggest us two things:
   - the best the multiplication product is (lower e), the more k
     approaches 1, and a approaches 0.

   - A value for a smaller than optimal is probably less bad than a
     bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
     value, and k(a)=0.808_ the mul/mullo speed ratio. We get
     k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
 */

 static mp_size_t
 mpn_sqrlo_itch (mp_size_t n)
 {
   return 2*n;
 }

 /*
     mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp.
     It accepts tp == rp.
 */
 static void
 mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
 {
   mp_size_t n2, n1;
   ASSERT (n >= 2);
   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
   ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));

   /* Divide-and-conquer */

   /* We need fractional approximation of the value 0 < a <= 1/2
      giving the minimum in the function k=(1-a)^e/(1-2*a^e).
   */
   if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
     n1 = n >> 1;
   else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
     n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
   else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9)))
     n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
   else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9))
     n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
   /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
   else
     n1 = n / (size_t) 10;		/* n1 ~= n*(1-.899...) [TOOM88] */

   n2 = n - n1;

   /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */

   /* x0 ^ 2 */
   mpn_sqr (tp, xp, n2);
   MPN_COPY (rp, tp, n2);

   /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */
   if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
     mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1);
   else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
     mpn_mullo_basecase (tp + n, xp + n2, xp, n1);
   else
     mpn_mullo_n (tp + n, xp + n2, xp, n1);
   /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */
 #if HAVE_NATIVE_mpn_addlsh1_n
   mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1);
 #else
   mpn_lshift (rp + n2, tp + n, n1, 1);
   mpn_add_n (rp + n2, rp + n2, tp + n2, n1);
 #endif
 }

 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
 #define SQR_BASECASE_ALLOC \
  (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT)

 /* FIXME: This function should accept a temporary area; dc_sqrlo
    accepts a pointer tp, and handle the case tp == rp, do the same here.
 */

 void
 mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n)
 {
   ASSERT (n >= 1);
   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));

   if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD))
     {
       /* FIXME: smarter criteria? */
 #if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase
       /* mullo computes as many products as sqr, but directly writes
 	 on the result area. */
       mpn_mullo_basecase (rp, xp, xp, n);
 #else
       /* Allocate workspace of fixed size on stack: fast! */
       mp_limb_t tp[SQR_BASECASE_ALLOC];
       mpn_sqr_basecase (tp, xp, n);
       MPN_COPY (rp, tp, n);
 #endif
     }
   else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD))
     {
       mpn_sqrlo_basecase (rp, xp, n);
     }
   else
     {
       mp_ptr tp;
       TMP_DECL;
       TMP_MARK;
       tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n));
       if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD))
 	{
 	  mpn_dc_sqrlo (rp, xp, n, tp);
 	}
       else
 	{
 	  /* For really large operands, use plain mpn_mul_n but throw away upper n
 	     limbs of result.  */
 #if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD)
 	  mpn_fft_mul (tp, xp, n, xp, n);
 #else
 	  mpn_sqr (tp, xp, n);
 #endif
 	  MPN_COPY (rp, tp, n);
 	}
       TMP_FREE;
     }
 }
	/* mpn_sqrlo -- squares an n-limb number and returns the low n limbs
	of the result.

	Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

	THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS
	FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
	THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

	Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc.

	This file is part of the GNU MP Library.

	The GNU MP Library is free software; you can redistribute it and/or modify
	it under the terms of either:

	* the GNU Lesser General Public License as published by the Free
	Software Foundation; either version 3 of the License, or (at your
	option) any later version.

	or

	* the GNU General Public License as published by the Free Software
	Foundation; either version 2 of the License, or (at your option) any
	later version.

	or both in parallel, as here.

	The GNU MP Library is distributed in the hope that it will be useful, but
	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	for more details.

	You should have received copies of the GNU General Public License and the
	GNU Lesser General Public License along with the GNU MP Library. If not,
	see https://www.gnu.org/licenses/. */

	#include "gmp-impl.h"

	#if TUNE_PROGRAM_BUILD \|\| WANT_FAT_BINARY
	#define MAYBE_range_basecase 1
	#define MAYBE_range_toom22 1
	#else
	#define MAYBE_range_basecase \
	((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11))
	#define MAYBE_range_toom22 \
	((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) )
	#endif

	/* THINK: The DC strategy uses different constants in different Toom's
	ranges. Something smoother?
	*/

	/*
	Compute the least significant half of the product {xy,n}*{yp,n}, or
	formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).

	Above the given threshold, the Divide and Conquer strategy is used.
	The operand is split in two, and a full square plus a mullo
	is used to obtain the final result. The more natural strategy is to
	split in two halves, but this is far from optimal when a
	sub-quadratic multiplication is used.

	Mulders suggests an unbalanced split in favour of the full product,
	split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.

	To compute the value of a, we assume that the cost of mullo for a
	given size ML(n) is a fraction of the cost of a full product with
	same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
	then we can write:

	ML(n) = 2ML(an) + M((1-a)n) => kM(n) = 2kM(n)a^e + M(n)(1-a)^e

	Given a value for e, want to minimise the value of k, i.e. the
	function k=(1-a)^e/(1-2*a^e).

	With e=2, the exponent for schoolbook multiplication, the minimum is
	given by the values a=1-a=1/2.

	With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
	Mulders compute (1-a) = 0.694... and we approximate a with 11/36.

	Other possible approximations follow:
	e=log(5)/log(3) [Toom-3] -> a ~= 9/40
	e=log(7)/log(4) [Toom-4] -> a ~= 7/39
	e=log(11)/log(6) [Toom-6] -> a ~= 1/8
	e=log(15)/log(8) [Toom-8] -> a ~= 1/10

	The values above where obtained with the following trivial commands
	in the gp-pari shell:

	fun(e,a)=(1-a)^e/(1-2*a^e)
	mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
	contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
	contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
	contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
	contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
	contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))

	,
	\|\
	\| \
	+----,
	\| \|
	\| \|
	\| \|\
	\| \| \
	+----+--`
	^ n2 ^n1^

	For an actual implementation, the assumption that M(n)=n^e is
	incorrect, as a consequence also the assumption that ML(n)=k*M(n)
	with a constant k is wrong.

	But theory suggest us two things:
	- the best the multiplication product is (lower e), the more k
	approaches 1, and a approaches 0.

	- A value for a smaller than optimal is probably less bad than a
	bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
	value, and k(a)=0.808_ the mul/mullo speed ratio. We get
	k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
	*/

	static mp_size_t
	mpn_sqrlo_itch (mp_size_t n)
	{
	return 2*n;
	}

	/*
	mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp.
	It accepts tp == rp.
	*/
	static void
	mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
	{
	mp_size_t n2, n1;
	ASSERT (n >= 2);
	ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
	ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));

	/* Divide-and-conquer */

	/* We need fractional approximation of the value 0 < a <= 1/2
	giving the minimum in the function k=(1-a)^e/(1-2*a^e).
	*/
	if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
	n1 = n >> 1;
	else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
	n1 = n * 11 / (size_t) 36; /* n1 ~= n(1-.694...) /
	else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9)))
	n1 = n * 9 / (size_t) 40; /* n1 ~= n(1-.775...) /
	else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9))
	n1 = n * 7 / (size_t) 39; /* n1 ~= n(1-.821...) /
	/* n1 = n * 4 / (size_t) 31; // n1 ~= n(1-.871...) [TOOM66] /
	else
	n1 = n / (size_t) 10; /* n1 ~= n(1-.899...) [TOOM88] /

	n2 = n - n1;

	/* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */

	/* x0 ^ 2 */
	mpn_sqr (tp, xp, n2);
	MPN_COPY (rp, tp, n2);

	/* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */
	if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
	mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1);
	else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
	mpn_mullo_basecase (tp + n, xp + n2, xp, n1);
	else
	mpn_mullo_n (tp + n, xp + n2, xp, n1);
	/* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */
	#if HAVE_NATIVE_mpn_addlsh1_n
	mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1);
	#else
	mpn_lshift (rp + n2, tp + n, n1, 1);
	mpn_add_n (rp + n2, rp + n2, tp + n2, n1);
	#endif
	}

	/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */
	#define SQR_BASECASE_ALLOC \
	(SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT)

	/* FIXME: This function should accept a temporary area; dc_sqrlo
	accepts a pointer tp, and handle the case tp == rp, do the same here.
	*/

	void
	mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n)
	{
	ASSERT (n >= 1);
	ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));

	if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD))
	{
	/* FIXME: smarter criteria? */
	#if HAVE_NATIVE_mpn_mullo_basecase \|\| ! HAVE_NATIVE_mpn_sqr_basecase
	/* mullo computes as many products as sqr, but directly writes
	on the result area. */
	mpn_mullo_basecase (rp, xp, xp, n);
	#else
	/* Allocate workspace of fixed size on stack: fast! */
	mp_limb_t tp[SQR_BASECASE_ALLOC];
	mpn_sqr_basecase (tp, xp, n);
	MPN_COPY (rp, tp, n);
	#endif
	}
	else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD))
	{
	mpn_sqrlo_basecase (rp, xp, n);
	}
	else
	{
	mp_ptr tp;
	TMP_DECL;
	TMP_MARK;
	tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n));
	if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD))
	{
	mpn_dc_sqrlo (rp, xp, n, tp);
	}
	else
	{
	/* For really large operands, use plain mpn_mul_n but throw away upper n
	limbs of result. */
	#if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD)
	mpn_fft_mul (tp, xp, n, xp, n);
	#else
	mpn_sqr (tp, xp, n);
	#endif
	MPN_COPY (rp, tp, n);
	}
	TMP_FREE;
	}
	}