third_party/gmp/mpn/generic/diveby3.c - RealtimeRoboticsGroup/test - Gitiles

 /* mpn_divexact_by3c -- mpn exact division by 3.

 Copyright 2000-2003, 2008 Free Software Foundation, Inc.

 This file is part of the GNU MP Library.

 The GNU MP Library is free software; you can redistribute it and/or modify
 it under the terms of either:

   * the GNU Lesser General Public License as published by the Free
     Software Foundation; either version 3 of the License, or (at your
     option) any later version.

 or

   * the GNU General Public License as published by the Free Software
     Foundation; either version 2 of the License, or (at your option) any
     later version.

 or both in parallel, as here.

 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 for more details.

 You should have received copies of the GNU General Public License and the
 GNU Lesser General Public License along with the GNU MP Library.  If not,
 see https://www.gnu.org/licenses/.  */

 #include "gmp-impl.h"

 #if DIVEXACT_BY3_METHOD == 0

 mp_limb_t
 mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
 {
   mp_limb_t r;
   r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);

   /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
      We want to return C.  We compute the remainder mod 4 and notice that the
      inverse of (2^(2k)-1)/3 mod 4 is 1.  */
   return r & 3;
 }

 #endif

 #if DIVEXACT_BY3_METHOD == 1

 /* The algorithm here is basically the same as mpn_divexact_1, as described
    in the manual.  Namely at each step q = (src[i]-c)*inverse, and new c =
    borrow(src[i]-c) + high(divisor*q).  But because the divisor is just 3,
    high(divisor*q) can be determined with two comparisons instead of a
    multiply.

    The "c += ..."s add the high limb of 3*l to c.  That high limb will be 0,
    1 or 2.  Doing two separate "+="s seems to give better code on gcc (as of
    2.95.2 at least).

    It will be noted that the new c is formed by adding three values each 0
    or 1.  But the total is only 0, 1 or 2.  When the subtraction src[i]-c
    causes a borrow, that leaves a limb value of either 0xFF...FF or
    0xFF...FE.  The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
    0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
    respectively, hence a total of no more than 2.

    Alternatives:

    This implementation has each multiply on the dependent chain, due to
    "l=s-c".  See below for alternative code which avoids that.  */

 mp_limb_t
 mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
 {
   mp_limb_t  l, q, s;
   mp_size_t  i;

   ASSERT (un >= 1);
   ASSERT (c == 0 || c == 1 || c == 2);
   ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));

   i = 0;
   do
     {
       s = up[i];
       SUBC_LIMB (c, l, s, c);

       q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
       rp[i] = q;

       c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
       c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
     }
   while (++i < un);

   ASSERT (c == 0 || c == 1 || c == 2);
   return c;
 }


 #endif

 #if DIVEXACT_BY3_METHOD == 2

 /* The following alternative code re-arranges the quotient calculation from
    (src[i]-c)*inverse to instead

        q = src[i]*inverse - c*inverse

    thereby allowing src[i]*inverse to be scheduled back as far as desired,
    making full use of multiplier throughput and leaving just some carry
    handing on the dependent chain.

    The carry handling consists of determining the c for the next iteration.
    This is the same as described above, namely look for any borrow from
    src[i]-c, and at the high of 3*q.

    high(3*q) is done with two comparisons as above (in c2 and c3).  The
    borrow from src[i]-c is incorporated into those by noting that if there's
    a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
    giving q = 0x55..55 or 0xAA..AA.  Adding 1 to either of those q values is
    enough to make high(3*q) come out 1 bigger, as required.

    l = -c*inverse is calculated at the same time as c, since for most chips
    it can be more conveniently derived from separate c1/c2/c3 values than
    from a combined c equal to 0, 1 or 2.

    The net effect is that with good pipelining this loop should be able to
    run at perhaps 4 cycles/limb, depending on available execute resources
    etc.

    Usage:

    This code is not used by default, since we really can't rely on the
    compiler generating a good software pipeline, nor on such an approach
    even being worthwhile on all CPUs.

    Itanium is one chip where this algorithm helps though, see
    mpn/ia64/diveby3.asm.  */

 mp_limb_t
 mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
 {
   mp_limb_t  s, sm, cl, q, qx, c2, c3;
   mp_size_t  i;

   ASSERT (un >= 1);
   ASSERT (cy == 0 || cy == 1 || cy == 2);
   ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));

   cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;

   for (i = 0; i < un; i++)
     {
       s = up[i];
       sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;

       q = (cl + sm) & GMP_NUMB_MASK;
       rp[i] = q;
       qx = q + (s < cy);

       c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
       c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;

       cy = c2 + c3;
       cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
     }

   return cy;
 }

 #endif
	/* mpn_divexact_by3c -- mpn exact division by 3.

	Copyright 2000-2003, 2008 Free Software Foundation, Inc.

	This file is part of the GNU MP Library.

	The GNU MP Library is free software; you can redistribute it and/or modify
	it under the terms of either:

	* the GNU Lesser General Public License as published by the Free
	Software Foundation; either version 3 of the License, or (at your
	option) any later version.

	or

	* the GNU General Public License as published by the Free Software
	Foundation; either version 2 of the License, or (at your option) any
	later version.

	or both in parallel, as here.

	The GNU MP Library is distributed in the hope that it will be useful, but
	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	for more details.

	You should have received copies of the GNU General Public License and the
	GNU Lesser General Public License along with the GNU MP Library. If not,
	see https://www.gnu.org/licenses/. */

	#include "gmp-impl.h"

	#if DIVEXACT_BY3_METHOD == 0

	mp_limb_t
	mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
	{
	mp_limb_t r;
	r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);

	/* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
	We want to return C. We compute the remainder mod 4 and notice that the
	inverse of (2^(2k)-1)/3 mod 4 is 1. */
	return r & 3;
	}

	#endif

	#if DIVEXACT_BY3_METHOD == 1

	/* The algorithm here is basically the same as mpn_divexact_1, as described
	in the manual. Namely at each step q = (src[i]-c)*inverse, and new c =
	borrow(src[i]-c) + high(divisor*q). But because the divisor is just 3,
	high(divisor*q) can be determined with two comparisons instead of a
	multiply.

	The "c += ..."s add the high limb of 3*l to c. That high limb will be 0,
	1 or 2. Doing two separate "+="s seems to give better code on gcc (as of
	2.95.2 at least).

	It will be noted that the new c is formed by adding three values each 0
	or 1. But the total is only 0, 1 or 2. When the subtraction src[i]-c
	causes a borrow, that leaves a limb value of either 0xFF...FF or
	0xFF...FE. The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
	0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
	respectively, hence a total of no more than 2.

	Alternatives:

	This implementation has each multiply on the dependent chain, due to
	"l=s-c". See below for alternative code which avoids that. */

	mp_limb_t
	mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
	{
	mp_limb_t l, q, s;
	mp_size_t i;

	ASSERT (un >= 1);
	ASSERT (c == 0 \|\| c == 1 \|\| c == 2);
	ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));

	i = 0;
	do
	{
	s = up[i];
	SUBC_LIMB (c, l, s, c);

	q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
	rp[i] = q;

	c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
	c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
	}
	while (++i < un);

	ASSERT (c == 0 \|\| c == 1 \|\| c == 2);
	return c;
	}


	#endif

	#if DIVEXACT_BY3_METHOD == 2

	/* The following alternative code re-arranges the quotient calculation from
	(src[i]-c)*inverse to instead

	q = src[i]inverse - cinverse

	thereby allowing src[i]*inverse to be scheduled back as far as desired,
	making full use of multiplier throughput and leaving just some carry
	handing on the dependent chain.

	The carry handling consists of determining the c for the next iteration.
	This is the same as described above, namely look for any borrow from
	src[i]-c, and at the high of 3*q.

	high(3*q) is done with two comparisons as above (in c2 and c3). The
	borrow from src[i]-c is incorporated into those by noting that if there's
	a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
	giving q = 0x55..55 or 0xAA..AA. Adding 1 to either of those q values is
	enough to make high(3*q) come out 1 bigger, as required.

	l = -c*inverse is calculated at the same time as c, since for most chips
	it can be more conveniently derived from separate c1/c2/c3 values than
	from a combined c equal to 0, 1 or 2.

	The net effect is that with good pipelining this loop should be able to
	run at perhaps 4 cycles/limb, depending on available execute resources
	etc.

	Usage:

	This code is not used by default, since we really can't rely on the
	compiler generating a good software pipeline, nor on such an approach
	even being worthwhile on all CPUs.

	Itanium is one chip where this algorithm helps though, see
	mpn/ia64/diveby3.asm. */

	mp_limb_t
	mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
	{
	mp_limb_t s, sm, cl, q, qx, c2, c3;
	mp_size_t i;

	ASSERT (un >= 1);
	ASSERT (cy == 0 \|\| cy == 1 \|\| cy == 2);
	ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));

	cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;

	for (i = 0; i < un; i++)
	{
	s = up[i];
	sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;

	q = (cl + sm) & GMP_NUMB_MASK;
	rp[i] = q;
	qx = q + (s < cy);

	c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
	c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;

	cy = c2 + c3;
	cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
	}

	return cy;
	}

	#endif