third_party/gmp/mpn/generic/mul.c - RealtimeRoboticsGroup/test - Gitiles

 /* mpn_mul -- Multiply two natural numbers.

    Contributed to the GNU project by Torbjorn Granlund.

 Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012,
 2014, 2019 Free Software Foundation, Inc.

 This file is part of the GNU MP Library.

 The GNU MP Library is free software; you can redistribute it and/or modify
 it under the terms of either:

   * the GNU Lesser General Public License as published by the Free
     Software Foundation; either version 3 of the License, or (at your
     option) any later version.

 or

   * the GNU General Public License as published by the Free Software
     Foundation; either version 2 of the License, or (at your option) any
     later version.

 or both in parallel, as here.

 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 for more details.

 You should have received copies of the GNU General Public License and the
 GNU Lesser General Public License along with the GNU MP Library.  If not,
 see https://www.gnu.org/licenses/.  */

 #include "gmp-impl.h"


 #ifndef MUL_BASECASE_MAX_UN
 #define MUL_BASECASE_MAX_UN 500
 #endif

 /* Areas where the different toom algorithms can be called (extracted
    from the t-toom*.c files, and ignoring small constant offsets):

    1/6  1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5   1 vn/un
                                         4/7              6/7
 				       6/11
                                        |--------------------| toom22 (small)
                                                            || toom22 (large)
                                                        |xxxx| toom22 called
                       |-------------------------------------| toom32
                                          |xxxxxxxxxxxxxxxx| | toom32 called
                                                |------------| toom33
                                                           |x| toom33 called
              |---------------------------------|            | toom42
 	              |xxxxxxxxxxxxxxxxxxxxxxxx|            | toom42 called
                                        |--------------------| toom43
                                                |xxxxxxxxxx|   toom43 called
          |-----------------------------|                      toom52 (unused)
                                                    |--------| toom44
 						   |xxxxxxxx| toom44 called
                               |--------------------|        | toom53
                                         |xxxxxx|              toom53 called
     |-------------------------|                               toom62 (unused)
                                            |----------------| toom54 (unused)
                       |--------------------|                  toom63
 	                      |xxxxxxxxx|                   | toom63 called
                           |---------------------------------| toom6h
 						   |xxxxxxxx| toom6h called
                                   |-------------------------| toom8h (32 bit)
                  |------------------------------------------| toom8h (64 bit)
 						   |xxxxxxxx| toom8h called
 */

 #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
 #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)

 /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
    (pointed to by VP, with VN limbs), and store the result at PRODP.  The
    result is UN + VN limbs.  Return the most significant limb of the result.

    NOTE: The space pointed to by PRODP is overwritten before finished with U
    and V, so overlap is an error.

    Argument constraints:
    1. UN >= VN.
    2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
       the multiplier and the multiplicand.  */

 /*
   * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
     ideal lines of the surrounding algorithms.  Is that optimal?

   * The toomX3 code now uses a structure similar to the one of toomX2, except
     that it loops longer in the unbalanced case.  The result is that the
     remaining area might have un < vn.  Should we fix the toomX2 code in a
     similar way?

   * The toomX3 code is used for the largest non-FFT unbalanced operands.  It
     therefore calls mpn_mul recursively for certain cases.

   * Allocate static temp space using THRESHOLD variables (except for toom44
     when !WANT_FFT).  That way, we can typically have no TMP_ALLOC at all.

   * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
     have the same vn threshold.  This is not true, we should actually use
     mul_basecase for slightly larger operands for toom32 than for toom22, and
     even larger for toom42.

   * That problem is even more prevalent for toomX3.  We therefore use special
     THRESHOLD variables there.
 */

 mp_limb_t
 mpn_mul (mp_ptr prodp,
 	 mp_srcptr up, mp_size_t un,
 	 mp_srcptr vp, mp_size_t vn)
 {
   ASSERT (un >= vn);
   ASSERT (vn >= 1);
   ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
   ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));

   if (BELOW_THRESHOLD (un, MUL_TOOM22_THRESHOLD))
     {
       /* When un (and thus vn) is below the toom22 range, do mul_basecase.
 	 Test un and not vn here not to thwart the un >> vn code below.
 	 This special case is not necessary, but cuts the overhead for the
 	 smallest operands. */
       mpn_mul_basecase (prodp, up, un, vp, vn);
     }
   else if (un == vn)
     {
       mpn_mul_n (prodp, up, vp, un);
     }
   else if (vn < MUL_TOOM22_THRESHOLD)
     { /* plain schoolbook multiplication */

       /* Unless un is very large, or else if have an applicable mpn_mul_N,
 	 perform basecase multiply directly.  */
       if (un <= MUL_BASECASE_MAX_UN
 #if HAVE_NATIVE_mpn_mul_2
 	  || vn <= 2
 #else
 	  || vn == 1
 #endif
 	  )
 	mpn_mul_basecase (prodp, up, un, vp, vn);
       else
 	{
 	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
 	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
 	     these pieces with the vp[] operand.  After each such partial
 	     multiplication (but the last) we copy the most significant vn
 	     limbs into a temporary buffer since that part would otherwise be
 	     overwritten by the next multiplication.  After the next
 	     multiplication, we add it back.  This illustrates the situation:

                                                     -->vn<--
                                                       |  |<------- un ------->|
                                                          _____________________|
                                                         X                    /|
                                                       /XX__________________/  |
                                     _____________________                     |
                                    X                    /                     |
                                  /XX__________________/                       |
                _____________________                                          |
               /                    /                                          |
             /____________________/                                            |
 	    ==================================================================

 	    The parts marked with X are the parts whose sums are copied into
 	    the temporary buffer.  */

 	  mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
 	  mp_limb_t cy;
 	  ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);

 	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
 	  prodp += MUL_BASECASE_MAX_UN;
 	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
 	  up += MUL_BASECASE_MAX_UN;
 	  un -= MUL_BASECASE_MAX_UN;
 	  while (un > MUL_BASECASE_MAX_UN)
 	    {
 	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
 	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
 	      mpn_incr_u (prodp + vn, cy);
 	      prodp += MUL_BASECASE_MAX_UN;
 	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
 	      up += MUL_BASECASE_MAX_UN;
 	      un -= MUL_BASECASE_MAX_UN;
 	    }
 	  if (un > vn)
 	    {
 	      mpn_mul_basecase (prodp, up, un, vp, vn);
 	    }
 	  else
 	    {
 	      ASSERT (un > 0);
 	      mpn_mul_basecase (prodp, vp, vn, up, un);
 	    }
 	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
 	  mpn_incr_u (prodp + vn, cy);
 	}
     }
   else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
     {
       /* Use ToomX2 variants */
       mp_ptr scratch;
       TMP_SDECL; TMP_SMARK;

 #define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2)
       scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2);
       ASSERT (mpn_toom22_mul_itch ((5*vn-1)/4, vn) <= ITCH_TOOMX2); /* 5vn/2+ */
       ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX2); /* 7vn/6+ */
       ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */
 #undef ITCH_TOOMX2

       /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
 	 square to a (3vn-1)*vn rectangle.  Leaving such a rectangle is hardly
 	 wise; we would get better balance by slightly moving the bound.  We
 	 will sometimes end up with un < vn, like in the X3 arm below.  */
       if (un >= 3 * vn)
 	{
 	  mp_limb_t cy;
 	  mp_ptr ws;

 	  /* The maximum ws usage is for the mpn_mul result.  */
 	  ws = TMP_SALLOC_LIMBS (4 * vn);

 	  mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
 	  un -= 2 * vn;
 	  up += 2 * vn;
 	  prodp += 2 * vn;

 	  while (un >= 3 * vn)
 	    {
 	      mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
 	      un -= 2 * vn;
 	      up += 2 * vn;
 	      cy = mpn_add_n (prodp, prodp, ws, vn);
 	      MPN_COPY (prodp + vn, ws + vn, 2 * vn);
 	      mpn_incr_u (prodp + vn, cy);
 	      prodp += 2 * vn;
 	    }

 	  /* vn <= un < 3vn */

 	  if (4 * un < 5 * vn)
 	    mpn_toom22_mul (ws, up, un, vp, vn, scratch);
 	  else if (4 * un < 7 * vn)
 	    mpn_toom32_mul (ws, up, un, vp, vn, scratch);
 	  else
 	    mpn_toom42_mul (ws, up, un, vp, vn, scratch);

 	  cy = mpn_add_n (prodp, prodp, ws, vn);
 	  MPN_COPY (prodp + vn, ws + vn, un);
 	  mpn_incr_u (prodp + vn, cy);
 	}
       else
 	{
 	  if (4 * un < 5 * vn)
 	    mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
 	  else if (4 * un < 7 * vn)
 	    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
 	  else
 	    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
 	}
       TMP_SFREE;
     }
   else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
 	   BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
     {
       /* Handle the largest operands that are not in the FFT range.  The 2nd
 	 condition makes very unbalanced operands avoid the FFT code (except
 	 perhaps as coefficient products of the Toom code.  */

       if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
 	{
 	  /* Use ToomX3 variants */
 	  mp_ptr scratch;
 	  TMP_DECL; TMP_MARK;

 #define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS)
 	  scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3);
 	  ASSERT (mpn_toom33_mul_itch ((7*vn-1)/6, vn) <= ITCH_TOOMX3); /* 7vn/2+ */
 	  ASSERT (mpn_toom43_mul_itch ((3*vn-1)/2, vn) <= ITCH_TOOMX3); /* 9vn/4+ */
 	  ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX3); /* 7vn/6+ */
 	  ASSERT (mpn_toom53_mul_itch ((11*vn-1)/6, vn) <= ITCH_TOOMX3); /* 11vn/3+ */
 	  ASSERT (mpn_toom42_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
 	  ASSERT (mpn_toom63_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
 #undef ITCH_TOOMX3

 	  if (2 * un >= 5 * vn)
 	    {
 	      mp_limb_t cy;
 	      mp_ptr ws;

 	      /* The maximum ws usage is for the mpn_mul result.  */
 	      ws = TMP_ALLOC_LIMBS (7 * vn >> 1);

 	      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
 		mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
 	      else
 		mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
 	      un -= 2 * vn;
 	      up += 2 * vn;
 	      prodp += 2 * vn;

 	      while (2 * un >= 5 * vn)	/* un >= 2.5vn */
 		{
 		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
 		    mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
 		  else
 		    mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
 		  un -= 2 * vn;
 		  up += 2 * vn;
 		  cy = mpn_add_n (prodp, prodp, ws, vn);
 		  MPN_COPY (prodp + vn, ws + vn, 2 * vn);
 		  mpn_incr_u (prodp + vn, cy);
 		  prodp += 2 * vn;
 		}

 	      /* vn / 2 <= un < 2.5vn */

 	      if (un < vn)
 		mpn_mul (ws, vp, vn, up, un);
 	      else
 		mpn_mul (ws, up, un, vp, vn);

 	      cy = mpn_add_n (prodp, prodp, ws, vn);
 	      MPN_COPY (prodp + vn, ws + vn, un);
 	      mpn_incr_u (prodp + vn, cy);
 	    }
 	  else
 	    {
 	      if (6 * un < 7 * vn)
 		mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
 	      else if (2 * un < 3 * vn)
 		{
 		  if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
 		    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
 		  else
 		    mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
 		}
 	      else if (6 * un < 11 * vn)
 		{
 		  if (4 * un < 7 * vn)
 		    {
 		      if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
 			mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
 		      else
 			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
 		    }
 		  else
 		    {
 		      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
 			mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
 		      else
 			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
 		    }
 		}
 	      else
 		{
 		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
 		    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
 		  else
 		    mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
 		}
 	    }
 	  TMP_FREE;
 	}
       else
 	{
 	  mp_ptr scratch;
 	  TMP_DECL; TMP_MARK;

 	  if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
 	    {
 	      scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
 	      mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
 	    }
 	  else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
 	    {
 	      scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
 	      mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
 	    }
 	  else
 	    {
 	      scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
 	      mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
 	    }
 	  TMP_FREE;
 	}
     }
   else
     {
       if (un >= 8 * vn)
 	{
 	  mp_limb_t cy;
 	  mp_ptr ws;
 	  TMP_DECL; TMP_MARK;

 	  /* The maximum ws usage is for the mpn_mul result.  */
 	  ws = TMP_BALLOC_LIMBS (9 * vn >> 1);

 	  mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
 	  un -= 3 * vn;
 	  up += 3 * vn;
 	  prodp += 3 * vn;

 	  while (2 * un >= 7 * vn)	/* un >= 3.5vn  */
 	    {
 	      mpn_fft_mul (ws, up, 3 * vn, vp, vn);
 	      un -= 3 * vn;
 	      up += 3 * vn;
 	      cy = mpn_add_n (prodp, prodp, ws, vn);
 	      MPN_COPY (prodp + vn, ws + vn, 3 * vn);
 	      mpn_incr_u (prodp + vn, cy);
 	      prodp += 3 * vn;
 	    }

 	  /* vn / 2 <= un < 3.5vn */

 	  if (un < vn)
 	    mpn_mul (ws, vp, vn, up, un);
 	  else
 	    mpn_mul (ws, up, un, vp, vn);

 	  cy = mpn_add_n (prodp, prodp, ws, vn);
 	  MPN_COPY (prodp + vn, ws + vn, un);
 	  mpn_incr_u (prodp + vn, cy);

 	  TMP_FREE;
 	}
       else
 	mpn_fft_mul (prodp, up, un, vp, vn);
     }

   return prodp[un + vn - 1];	/* historic */
 }
	/* mpn_mul -- Multiply two natural numbers.

	Contributed to the GNU project by Torbjorn Granlund.

	Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012,
	2014, 2019 Free Software Foundation, Inc.

	This file is part of the GNU MP Library.

	The GNU MP Library is free software; you can redistribute it and/or modify
	it under the terms of either:

	* the GNU Lesser General Public License as published by the Free
	Software Foundation; either version 3 of the License, or (at your
	option) any later version.

	or

	* the GNU General Public License as published by the Free Software
	Foundation; either version 2 of the License, or (at your option) any
	later version.

	or both in parallel, as here.

	The GNU MP Library is distributed in the hope that it will be useful, but
	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	for more details.

	You should have received copies of the GNU General Public License and the
	GNU Lesser General Public License along with the GNU MP Library. If not,
	see https://www.gnu.org/licenses/. */

	#include "gmp-impl.h"


	#ifndef MUL_BASECASE_MAX_UN
	#define MUL_BASECASE_MAX_UN 500
	#endif

	/* Areas where the different toom algorithms can be called (extracted
	from the t-toom*.c files, and ignoring small constant offsets):

	1/6 1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5 1 vn/un
	4/7 6/7
	6/11
	\|--------------------\| toom22 (small)
	\|\| toom22 (large)
	\|xxxx\| toom22 called
	\|-------------------------------------\| toom32
	\|xxxxxxxxxxxxxxxx\| \| toom32 called
	\|------------\| toom33
	\|x\| toom33 called
	\|---------------------------------\| \| toom42
	\|xxxxxxxxxxxxxxxxxxxxxxxx\| \| toom42 called
	\|--------------------\| toom43
	\|xxxxxxxxxx\| toom43 called
	\|-----------------------------\| toom52 (unused)
	\|--------\| toom44
	\|xxxxxxxx\| toom44 called
	\|--------------------\| \| toom53
	\|xxxxxx\| toom53 called
	\|-------------------------\| toom62 (unused)
	\|----------------\| toom54 (unused)
	\|--------------------\| toom63
	\|xxxxxxxxx\| \| toom63 called
	\|---------------------------------\| toom6h
	\|xxxxxxxx\| toom6h called
	\|-------------------------\| toom8h (32 bit)
	\|------------------------------------------\| toom8h (64 bit)
	\|xxxxxxxx\| toom8h called
	*/

	#define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
	#define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)

	/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
	(pointed to by VP, with VN limbs), and store the result at PRODP. The
	result is UN + VN limbs. Return the most significant limb of the result.

	NOTE: The space pointed to by PRODP is overwritten before finished with U
	and V, so overlap is an error.

	Argument constraints:
	1. UN >= VN.
	2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
	the multiplier and the multiplicand. */

	/*
	* The cutoff lines in the toomX2 and toomX3 code are now exactly between the
	ideal lines of the surrounding algorithms. Is that optimal?

	* The toomX3 code now uses a structure similar to the one of toomX2, except
	that it loops longer in the unbalanced case. The result is that the
	remaining area might have un < vn. Should we fix the toomX2 code in a
	similar way?

	* The toomX3 code is used for the largest non-FFT unbalanced operands. It
	therefore calls mpn_mul recursively for certain cases.

	* Allocate static temp space using THRESHOLD variables (except for toom44
	when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all.

	* We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
	have the same vn threshold. This is not true, we should actually use
	mul_basecase for slightly larger operands for toom32 than for toom22, and
	even larger for toom42.

	* That problem is even more prevalent for toomX3. We therefore use special
	THRESHOLD variables there.
	*/

	mp_limb_t
	mpn_mul (mp_ptr prodp,
	mp_srcptr up, mp_size_t un,
	mp_srcptr vp, mp_size_t vn)
	{
	ASSERT (un >= vn);
	ASSERT (vn >= 1);
	ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
	ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));

	if (BELOW_THRESHOLD (un, MUL_TOOM22_THRESHOLD))
	{
	/* When un (and thus vn) is below the toom22 range, do mul_basecase.
	Test un and not vn here not to thwart the un >> vn code below.
	This special case is not necessary, but cuts the overhead for the
	smallest operands. */
	mpn_mul_basecase (prodp, up, un, vp, vn);
	}
	else if (un == vn)
	{
	mpn_mul_n (prodp, up, vp, un);
	}
	else if (vn < MUL_TOOM22_THRESHOLD)
	{ /* plain schoolbook multiplication */

	/* Unless un is very large, or else if have an applicable mpn_mul_N,
	perform basecase multiply directly. */
	if (un <= MUL_BASECASE_MAX_UN
	#if HAVE_NATIVE_mpn_mul_2
	\|\| vn <= 2
	#else
	\|\| vn == 1
	#endif
	)
	mpn_mul_basecase (prodp, up, un, vp, vn);
	else
	{
	/* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory
	locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
	these pieces with the vp[] operand. After each such partial
	multiplication (but the last) we copy the most significant vn
	limbs into a temporary buffer since that part would otherwise be
	overwritten by the next multiplication. After the next
	multiplication, we add it back. This illustrates the situation:

	-->vn<--
	\| \|<------- un ------->\|
	_____________________\|
	X /\|
	/XX__________________/ \|
	_____________________ \|
	X / \|
	/XX__________________/ \|
	_____________________ \|
	/ / \|
	/____________________/ \|
	==================================================================

	The parts marked with X are the parts whose sums are copied into
	the temporary buffer. */

	mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
	mp_limb_t cy;
	ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);

	mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
	prodp += MUL_BASECASE_MAX_UN;
	MPN_COPY (tp, prodp, vn); /* preserve high triangle */
	up += MUL_BASECASE_MAX_UN;
	un -= MUL_BASECASE_MAX_UN;
	while (un > MUL_BASECASE_MAX_UN)
	{
	mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
	cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
	mpn_incr_u (prodp + vn, cy);
	prodp += MUL_BASECASE_MAX_UN;
	MPN_COPY (tp, prodp, vn); /* preserve high triangle */
	up += MUL_BASECASE_MAX_UN;
	un -= MUL_BASECASE_MAX_UN;
	}
	if (un > vn)
	{
	mpn_mul_basecase (prodp, up, un, vp, vn);
	}
	else
	{
	ASSERT (un > 0);
	mpn_mul_basecase (prodp, vp, vn, up, un);
	}
	cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
	mpn_incr_u (prodp + vn, cy);
	}
	}
	else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
	{
	/* Use ToomX2 variants */
	mp_ptr scratch;
	TMP_SDECL; TMP_SMARK;

	#define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2)
	scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2);
	ASSERT (mpn_toom22_mul_itch ((5vn-1)/4, vn) <= ITCH_TOOMX2); / 5vn/2+ */
	ASSERT (mpn_toom32_mul_itch ((7vn-1)/4, vn) <= ITCH_TOOMX2); / 7vn/6+ */
	ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */
	#undef ITCH_TOOMX2

	/* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
	square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly
	wise; we would get better balance by slightly moving the bound. We
	will sometimes end up with un < vn, like in the X3 arm below. */
	if (un >= 3 * vn)
	{
	mp_limb_t cy;
	mp_ptr ws;

	/* The maximum ws usage is for the mpn_mul result. */
	ws = TMP_SALLOC_LIMBS (4 * vn);

	mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
	un -= 2 * vn;
	up += 2 * vn;
	prodp += 2 * vn;

	while (un >= 3 * vn)
	{
	mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
	un -= 2 * vn;
	up += 2 * vn;
	cy = mpn_add_n (prodp, prodp, ws, vn);
	MPN_COPY (prodp + vn, ws + vn, 2 * vn);
	mpn_incr_u (prodp + vn, cy);
	prodp += 2 * vn;
	}

	/* vn <= un < 3vn */

	if (4 * un < 5 * vn)
	mpn_toom22_mul (ws, up, un, vp, vn, scratch);
	else if (4 * un < 7 * vn)
	mpn_toom32_mul (ws, up, un, vp, vn, scratch);
	else
	mpn_toom42_mul (ws, up, un, vp, vn, scratch);

	cy = mpn_add_n (prodp, prodp, ws, vn);
	MPN_COPY (prodp + vn, ws + vn, un);
	mpn_incr_u (prodp + vn, cy);
	}
	else
	{
	if (4 * un < 5 * vn)
	mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
	else if (4 * un < 7 * vn)
	mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
	else
	mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
	}
	TMP_SFREE;
	}
	else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) \|\|
	BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
	{
	/* Handle the largest operands that are not in the FFT range. The 2nd
	condition makes very unbalanced operands avoid the FFT code (except
	perhaps as coefficient products of the Toom code. */

	if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) \|\| !TOOM44_OK (un, vn))
	{
	/* Use ToomX3 variants */
	mp_ptr scratch;
	TMP_DECL; TMP_MARK;

	#define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS)
	scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3);
	ASSERT (mpn_toom33_mul_itch ((7vn-1)/6, vn) <= ITCH_TOOMX3); / 7vn/2+ */
	ASSERT (mpn_toom43_mul_itch ((3vn-1)/2, vn) <= ITCH_TOOMX3); / 9vn/4+ */
	ASSERT (mpn_toom32_mul_itch ((7vn-1)/4, vn) <= ITCH_TOOMX3); / 7vn/6+ */
	ASSERT (mpn_toom53_mul_itch ((11vn-1)/6, vn) <= ITCH_TOOMX3); / 11vn/3+ */
	ASSERT (mpn_toom42_mul_itch ((5vn-1)/2, vn) <= ITCH_TOOMX3); / 15vn/4+ */
	ASSERT (mpn_toom63_mul_itch ((5vn-1)/2, vn) <= ITCH_TOOMX3); / 15vn/4+ */
	#undef ITCH_TOOMX3

	if (2 * un >= 5 * vn)
	{
	mp_limb_t cy;
	mp_ptr ws;

	/* The maximum ws usage is for the mpn_mul result. */
	ws = TMP_ALLOC_LIMBS (7 * vn >> 1);

	if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
	mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
	else
	mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
	un -= 2 * vn;
	up += 2 * vn;
	prodp += 2 * vn;

	while (2 * un >= 5 * vn) /* un >= 2.5vn */
	{
	if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
	mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
	else
	mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
	un -= 2 * vn;
	up += 2 * vn;
	cy = mpn_add_n (prodp, prodp, ws, vn);
	MPN_COPY (prodp + vn, ws + vn, 2 * vn);
	mpn_incr_u (prodp + vn, cy);
	prodp += 2 * vn;
	}

	/* vn / 2 <= un < 2.5vn */

	if (un < vn)
	mpn_mul (ws, vp, vn, up, un);
	else
	mpn_mul (ws, up, un, vp, vn);

	cy = mpn_add_n (prodp, prodp, ws, vn);
	MPN_COPY (prodp + vn, ws + vn, un);
	mpn_incr_u (prodp + vn, cy);
	}
	else
	{
	if (6 * un < 7 * vn)
	mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
	else if (2 * un < 3 * vn)
	{
	if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
	mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
	else
	mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
	}
	else if (6 * un < 11 * vn)
	{
	if (4 * un < 7 * vn)
	{
	if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
	mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
	else
	mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
	}
	else
	{
	if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
	mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
	else
	mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
	}
	}
	else
	{
	if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
	mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
	else
	mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
	}
	}
	TMP_FREE;
	}
	else
	{
	mp_ptr scratch;
	TMP_DECL; TMP_MARK;

	if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
	{
	scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
	mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
	}
	else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
	{
	scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
	mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
	}
	else
	{
	scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
	mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
	}
	TMP_FREE;
	}
	}
	else
	{
	if (un >= 8 * vn)
	{
	mp_limb_t cy;
	mp_ptr ws;
	TMP_DECL; TMP_MARK;

	/* The maximum ws usage is for the mpn_mul result. */
	ws = TMP_BALLOC_LIMBS (9 * vn >> 1);

	mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
	un -= 3 * vn;
	up += 3 * vn;
	prodp += 3 * vn;

	while (2 * un >= 7 * vn) /* un >= 3.5vn */
	{
	mpn_fft_mul (ws, up, 3 * vn, vp, vn);
	un -= 3 * vn;
	up += 3 * vn;
	cy = mpn_add_n (prodp, prodp, ws, vn);
	MPN_COPY (prodp + vn, ws + vn, 3 * vn);
	mpn_incr_u (prodp + vn, cy);
	prodp += 3 * vn;
	}

	/* vn / 2 <= un < 3.5vn */

	if (un < vn)
	mpn_mul (ws, vp, vn, up, un);
	else
	mpn_mul (ws, up, un, vp, vn);

	cy = mpn_add_n (prodp, prodp, ws, vn);
	MPN_COPY (prodp + vn, ws + vn, un);
	mpn_incr_u (prodp + vn, cy);

	TMP_FREE;
	}
	else
	mpn_fft_mul (prodp, up, un, vp, vn);
	}

	return prodp[un + vn - 1]; /* historic */
	}