third_party/gmp/mpn/generic/mode1o.c - RealtimeRoboticsGroup/test - Gitiles

 /* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.

    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
    FUTURE GNU MP RELEASES.

 Copyright 2000-2004 Free Software Foundation, Inc.

 This file is part of the GNU MP Library.

 The GNU MP Library is free software; you can redistribute it and/or modify
 it under the terms of either:

   * the GNU Lesser General Public License as published by the Free
     Software Foundation; either version 3 of the License, or (at your
     option) any later version.

 or

   * the GNU General Public License as published by the Free Software
     Foundation; either version 2 of the License, or (at your option) any
     later version.

 or both in parallel, as here.

 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 for more details.

 You should have received copies of the GNU General Public License and the
 GNU Lesser General Public License along with the GNU MP Library.  If not,
 see https://www.gnu.org/licenses/.  */

 #include "gmp-impl.h"
 #include "longlong.h"


 /* Calculate an r satisfying

            r*B^k + a - c == q*d

    where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
    (the caller won't know which), and q is the quotient (discarded).  d must
    be odd, c can be any limb value.

    If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.

    This slightly strange function suits the initial Nx1 reduction for GCDs
    or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
    -r == a mod d (by passing c=0).  For a GCD the factor of -1 on r can be
    ignored, or for the Jacobi symbol it can be accounted for.  The function
    also suits divisibility and congruence testing since if r=0 (or r=d) is
    obtained then a==c mod d.


    r is a bit like the remainder returned by mpn_divexact_by3c, and is the
    sort of remainder mpn_divexact_1 might return.  Like mpn_divexact_by3c, r
    represents a borrow, since effectively quotient limbs are chosen so that
    subtracting that multiple of d from src at each step will produce a zero
    limb.

    A long calculation can be done piece by piece from low to high by passing
    the return value from one part as the carry parameter to the next part.
    The effective final k becomes anything between size and size-n, if n
    pieces are used.


    A similar sort of routine could be constructed based on adding multiples
    of d at each limb, much like redc in mpz_powm does.  Subtracting however
    has a small advantage that when subtracting to cancel out l there's never
    a borrow into h, whereas using an addition would put a carry into h
    depending whether l==0 or l!=0.


    In terms of efficiency, this function is similar to a mul-by-inverse
    mpn_mod_1.  Both are essentially two multiplies and are best suited to
    CPUs with low latency multipliers (in comparison to a divide instruction
    at least.)  But modexact has a few less supplementary operations, only
    needs low part and high part multiplies, and has fewer working quantities
    (helping CPUs with few registers).


    In the main loop it will be noted that the new carry (call it r) is the
    sum of the high product h and any borrow from l=s-c.  If c<d then we will
    have r<d too, for the following reasons.  Let q=l*inverse be the quotient
    limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS.  Now if h=d-1 then

        l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d

    But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
    never have h=d-1 and so r=h+borrow <= d-1.

    When c>=d, on the other hand, h=d-1 can certainly occur together with a
    borrow, thereby giving only r<=d, as per the function definition above.

    As a design decision it's left to the caller to check for r=d if it might
    be passing c>=d.  Several applications have c<d initially so the extra
    test is often unnecessary, for example the GCDs or a plain divisibility
    d|a test will pass c=0.


    The special case for size==1 is so that it can be assumed c<=d in the
    high<=divisor test at the end.  c<=d is only guaranteed after at least
    one iteration of the main loop.  There's also a decent chance one % is
    faster than a binvert_limb, though that will depend on the processor.

    A CPU specific implementation might want to omit the size==1 code or the
    high<divisor test.  mpn/x86/k6/mode1o.asm for instance finds neither
    useful.  */


 mp_limb_t
 mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
                      mp_limb_t orig_c)
 {
   mp_limb_t  s, h, l, inverse, dummy, dmul, ret;
   mp_limb_t  c = orig_c;
   mp_size_t  i;

   ASSERT (size >= 1);
   ASSERT (d & 1);
   ASSERT_MPN (src, size);
   ASSERT_LIMB (d);
   ASSERT_LIMB (c);

   if (size == 1)
     {
       s = src[0];
       if (s > c)
 	{
 	  l = s-c;
 	  h = l % d;
 	  if (h != 0)
 	    h = d - h;
 	}
       else
 	{
 	  l = c-s;
 	  h = l % d;
 	}
       return h;
     }


   binvert_limb (inverse, d);
   dmul = d << GMP_NAIL_BITS;

   i = 0;
   do
     {
       s = src[i];
       SUBC_LIMB (c, l, s, c);
       l = (l * inverse) & GMP_NUMB_MASK;
       umul_ppmm (h, dummy, l, dmul);
       c += h;
     }
   while (++i < size-1);


   s = src[i];
   if (s <= d)
     {
       /* With high<=d the final step can be a subtract and addback.  If c==0
 	 then the addback will restore to l>=0.  If c==d then will get l==d
 	 if s==0, but that's ok per the function definition.  */

       l = c - s;
       if (c < s)
 	l += d;

       ret = l;
     }
   else
     {
       /* Can't skip a divide, just do the loop code once more. */

       SUBC_LIMB (c, l, s, c);
       l = (l * inverse) & GMP_NUMB_MASK;
       umul_ppmm (h, dummy, l, dmul);
       c += h;
       ret = c;
     }

   ASSERT (orig_c < d ? ret < d : ret <= d);
   return ret;
 }


 #if 0

 /* The following is an alternate form that might shave one cycle on a
    superscalar processor since it takes c+=h off the dependent chain,
    leaving just a low product, high product, and a subtract.

    This is for CPU specific implementations to consider.  A special case for
    high<divisor and/or size==1 can be added if desired.

    Notice that c is only ever 0 or 1, since if s-c produces a borrow then
    x=0xFF..FF and x-h cannot produce a borrow.  The c=(x>s) could become
    c=(x==0xFF..FF) too, if that helped.  */

 mp_limb_t
 mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
 {
   mp_limb_t  s, x, y, inverse, dummy, dmul, c1, c2;
   mp_limb_t  c = 0;
   mp_size_t  i;

   ASSERT (size >= 1);
   ASSERT (d & 1);

   binvert_limb (inverse, d);
   dmul = d << GMP_NAIL_BITS;

   for (i = 0; i < size; i++)
     {
       ASSERT (c==0 || c==1);

       s = src[i];
       SUBC_LIMB (c1, x, s, c);

       SUBC_LIMB (c2, y, x, h);
       c = c1 + c2;

       y = (y * inverse) & GMP_NUMB_MASK;
       umul_ppmm (h, dummy, y, dmul);
     }

   h += c;
   return h;
 }

 #endif
	/* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.

	THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
	CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
	FUTURE GNU MP RELEASES.

	Copyright 2000-2004 Free Software Foundation, Inc.

	This file is part of the GNU MP Library.

	The GNU MP Library is free software; you can redistribute it and/or modify
	it under the terms of either:

	* the GNU Lesser General Public License as published by the Free
	Software Foundation; either version 3 of the License, or (at your
	option) any later version.

	or

	* the GNU General Public License as published by the Free Software
	Foundation; either version 2 of the License, or (at your option) any
	later version.

	or both in parallel, as here.

	The GNU MP Library is distributed in the hope that it will be useful, but
	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	for more details.

	You should have received copies of the GNU General Public License and the
	GNU Lesser General Public License along with the GNU MP Library. If not,
	see https://www.gnu.org/licenses/. */

	#include "gmp-impl.h"
	#include "longlong.h"


	/* Calculate an r satisfying

	rB^k + a - c == qd

	where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
	(the caller won't know which), and q is the quotient (discarded). d must
	be odd, c can be any limb value.

	If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.

	This slightly strange function suits the initial Nx1 reduction for GCDs
	or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
	-r == a mod d (by passing c=0). For a GCD the factor of -1 on r can be
	ignored, or for the Jacobi symbol it can be accounted for. The function
	also suits divisibility and congruence testing since if r=0 (or r=d) is
	obtained then a==c mod d.


	r is a bit like the remainder returned by mpn_divexact_by3c, and is the
	sort of remainder mpn_divexact_1 might return. Like mpn_divexact_by3c, r
	represents a borrow, since effectively quotient limbs are chosen so that
	subtracting that multiple of d from src at each step will produce a zero
	limb.

	A long calculation can be done piece by piece from low to high by passing
	the return value from one part as the carry parameter to the next part.
	The effective final k becomes anything between size and size-n, if n
	pieces are used.


	A similar sort of routine could be constructed based on adding multiples
	of d at each limb, much like redc in mpz_powm does. Subtracting however
	has a small advantage that when subtracting to cancel out l there's never
	a borrow into h, whereas using an addition would put a carry into h
	depending whether l==0 or l!=0.


	In terms of efficiency, this function is similar to a mul-by-inverse
	mpn_mod_1. Both are essentially two multiplies and are best suited to
	CPUs with low latency multipliers (in comparison to a divide instruction
	at least.) But modexact has a few less supplementary operations, only
	needs low part and high part multiplies, and has fewer working quantities
	(helping CPUs with few registers).


	In the main loop it will be noted that the new carry (call it r) is the
	sum of the high product h and any borrow from l=s-c. If c<d then we will
	have r<d too, for the following reasons. Let q=l*inverse be the quotient
	limb, so that qd = Bh + l, where B=2^GMP_NUMB_BITS. Now if h=d-1 then

	l = qd - B(d-1) <= (B-1)d - B(d-1) = B-d

	But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
	never have h=d-1 and so r=h+borrow <= d-1.

	When c>=d, on the other hand, h=d-1 can certainly occur together with a
	borrow, thereby giving only r<=d, as per the function definition above.

	As a design decision it's left to the caller to check for r=d if it might
	be passing c>=d. Several applications have c<d initially so the extra
	test is often unnecessary, for example the GCDs or a plain divisibility
	d\|a test will pass c=0.


	The special case for size==1 is so that it can be assumed c<=d in the
	high<=divisor test at the end. c<=d is only guaranteed after at least
	one iteration of the main loop. There's also a decent chance one % is
	faster than a binvert_limb, though that will depend on the processor.

	A CPU specific implementation might want to omit the size==1 code or the
	high<divisor test. mpn/x86/k6/mode1o.asm for instance finds neither
	useful. */


	mp_limb_t
	mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
	mp_limb_t orig_c)
	{
	mp_limb_t s, h, l, inverse, dummy, dmul, ret;
	mp_limb_t c = orig_c;
	mp_size_t i;

	ASSERT (size >= 1);
	ASSERT (d & 1);
	ASSERT_MPN (src, size);
	ASSERT_LIMB (d);
	ASSERT_LIMB (c);

	if (size == 1)
	{
	s = src[0];
	if (s > c)
	{
	l = s-c;
	h = l % d;
	if (h != 0)
	h = d - h;
	}
	else
	{
	l = c-s;
	h = l % d;
	}
	return h;
	}


	binvert_limb (inverse, d);
	dmul = d << GMP_NAIL_BITS;

	i = 0;
	do
	{
	s = src[i];
	SUBC_LIMB (c, l, s, c);
	l = (l * inverse) & GMP_NUMB_MASK;
	umul_ppmm (h, dummy, l, dmul);
	c += h;
	}
	while (++i < size-1);


	s = src[i];
	if (s <= d)
	{
	/* With high<=d the final step can be a subtract and addback. If c==0
	then the addback will restore to l>=0. If c==d then will get l==d
	if s==0, but that's ok per the function definition. */

	l = c - s;
	if (c < s)
	l += d;

	ret = l;
	}
	else
	{
	/* Can't skip a divide, just do the loop code once more. */

	SUBC_LIMB (c, l, s, c);
	l = (l * inverse) & GMP_NUMB_MASK;
	umul_ppmm (h, dummy, l, dmul);
	c += h;
	ret = c;
	}

	ASSERT (orig_c < d ? ret < d : ret <= d);
	return ret;
	}



	#if 0

	/* The following is an alternate form that might shave one cycle on a
	superscalar processor since it takes c+=h off the dependent chain,
	leaving just a low product, high product, and a subtract.

	This is for CPU specific implementations to consider. A special case for
	high<divisor and/or size==1 can be added if desired.

	Notice that c is only ever 0 or 1, since if s-c produces a borrow then
	x=0xFF..FF and x-h cannot produce a borrow. The c=(x>s) could become
	c=(x==0xFF..FF) too, if that helped. */

	mp_limb_t
	mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
	{
	mp_limb_t s, x, y, inverse, dummy, dmul, c1, c2;
	mp_limb_t c = 0;
	mp_size_t i;

	ASSERT (size >= 1);
	ASSERT (d & 1);

	binvert_limb (inverse, d);
	dmul = d << GMP_NAIL_BITS;

	for (i = 0; i < size; i++)
	{
	ASSERT (c==0 \|\| c==1);

	s = src[i];
	SUBC_LIMB (c1, x, s, c);

	SUBC_LIMB (c2, y, x, h);
	c = c1 + c2;

	y = (y * inverse) & GMP_NUMB_MASK;
	umul_ppmm (h, dummy, y, dmul);
	}

	h += c;
	return h;
	}

	#endif