Blame - third_party/gmp/mpn/generic/mode1o.c - RealtimeRoboticsGroup/test

blob: 9ba0ae1ccf9219af763dd3b30785a47c8ddf5b94 [file] [log] [blame]

Austin Schuh	dace2a6	2020-08-18 10:56:48 -0700	[diff] [blame^]	1	/* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.
				2
				3	THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
				4	CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
				5	FUTURE GNU MP RELEASES.
				6
				7	Copyright 2000-2004 Free Software Foundation, Inc.
				8
				9	This file is part of the GNU MP Library.
				10
				11	The GNU MP Library is free software; you can redistribute it and/or modify
				12	it under the terms of either:
				13
				14	* the GNU Lesser General Public License as published by the Free
				15	Software Foundation; either version 3 of the License, or (at your
				16	option) any later version.
				17
				18	or
				19
				20	* the GNU General Public License as published by the Free Software
				21	Foundation; either version 2 of the License, or (at your option) any
				22	later version.
				23
				24	or both in parallel, as here.
				25
				26	The GNU MP Library is distributed in the hope that it will be useful, but
				27	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
				28	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				29	for more details.
				30
				31	You should have received copies of the GNU General Public License and the
				32	GNU Lesser General Public License along with the GNU MP Library. If not,
				33	see https://www.gnu.org/licenses/. */
				34
				35	#include "gmp-impl.h"
				36	#include "longlong.h"
				37
				38
				39	/* Calculate an r satisfying
				40
				41	rB^k + a - c == qd
				42
				43	where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
				44	(the caller won't know which), and q is the quotient (discarded). d must
				45	be odd, c can be any limb value.
				46
				47	If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.
				48
				49	This slightly strange function suits the initial Nx1 reduction for GCDs
				50	or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
				51	-r == a mod d (by passing c=0). For a GCD the factor of -1 on r can be
				52	ignored, or for the Jacobi symbol it can be accounted for. The function
				53	also suits divisibility and congruence testing since if r=0 (or r=d) is
				54	obtained then a==c mod d.
				55
				56
				57	r is a bit like the remainder returned by mpn_divexact_by3c, and is the
				58	sort of remainder mpn_divexact_1 might return. Like mpn_divexact_by3c, r
				59	represents a borrow, since effectively quotient limbs are chosen so that
				60	subtracting that multiple of d from src at each step will produce a zero
				61	limb.
				62
				63	A long calculation can be done piece by piece from low to high by passing
				64	the return value from one part as the carry parameter to the next part.
				65	The effective final k becomes anything between size and size-n, if n
				66	pieces are used.
				67
				68
				69	A similar sort of routine could be constructed based on adding multiples
				70	of d at each limb, much like redc in mpz_powm does. Subtracting however
				71	has a small advantage that when subtracting to cancel out l there's never
				72	a borrow into h, whereas using an addition would put a carry into h
				73	depending whether l==0 or l!=0.
				74
				75
				76	In terms of efficiency, this function is similar to a mul-by-inverse
				77	mpn_mod_1. Both are essentially two multiplies and are best suited to
				78	CPUs with low latency multipliers (in comparison to a divide instruction
				79	at least.) But modexact has a few less supplementary operations, only
				80	needs low part and high part multiplies, and has fewer working quantities
				81	(helping CPUs with few registers).
				82
				83
				84	In the main loop it will be noted that the new carry (call it r) is the
				85	sum of the high product h and any borrow from l=s-c. If c<d then we will
				86	have r<d too, for the following reasons. Let q=l*inverse be the quotient
				87	limb, so that qd = Bh + l, where B=2^GMP_NUMB_BITS. Now if h=d-1 then
				88
				89	l = qd - B(d-1) <= (B-1)d - B(d-1) = B-d
				90
				91	But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
				92	never have h=d-1 and so r=h+borrow <= d-1.
				93
				94	When c>=d, on the other hand, h=d-1 can certainly occur together with a
				95	borrow, thereby giving only r<=d, as per the function definition above.
				96
				97	As a design decision it's left to the caller to check for r=d if it might
				98	be passing c>=d. Several applications have c<d initially so the extra
				99	test is often unnecessary, for example the GCDs or a plain divisibility
				100	d\|a test will pass c=0.
				101
				102
				103	The special case for size==1 is so that it can be assumed c<=d in the
				104	high<=divisor test at the end. c<=d is only guaranteed after at least
				105	one iteration of the main loop. There's also a decent chance one % is
				106	faster than a binvert_limb, though that will depend on the processor.
				107
				108	A CPU specific implementation might want to omit the size==1 code or the
				109	high<divisor test. mpn/x86/k6/mode1o.asm for instance finds neither
				110	useful. */
				111
				112
				113	mp_limb_t
				114	mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
				115	mp_limb_t orig_c)
				116	{
				117	mp_limb_t s, h, l, inverse, dummy, dmul, ret;
				118	mp_limb_t c = orig_c;
				119	mp_size_t i;
				120
				121	ASSERT (size >= 1);
				122	ASSERT (d & 1);
				123	ASSERT_MPN (src, size);
				124	ASSERT_LIMB (d);
				125	ASSERT_LIMB (c);
				126
				127	if (size == 1)
				128	{
				129	s = src[0];
				130	if (s > c)
				131	{
				132	l = s-c;
				133	h = l % d;
				134	if (h != 0)
				135	h = d - h;
				136	}
				137	else
				138	{
				139	l = c-s;
				140	h = l % d;
				141	}
				142	return h;
				143	}
				144
				145
				146	binvert_limb (inverse, d);
				147	dmul = d << GMP_NAIL_BITS;
				148
				149	i = 0;
				150	do
				151	{
				152	s = src[i];
				153	SUBC_LIMB (c, l, s, c);
				154	l = (l * inverse) & GMP_NUMB_MASK;
				155	umul_ppmm (h, dummy, l, dmul);
				156	c += h;
				157	}
				158	while (++i < size-1);
				159
				160
				161	s = src[i];
				162	if (s <= d)
				163	{
				164	/* With high<=d the final step can be a subtract and addback. If c==0
				165	then the addback will restore to l>=0. If c==d then will get l==d
				166	if s==0, but that's ok per the function definition. */
				167
				168	l = c - s;
				169	if (c < s)
				170	l += d;
				171
				172	ret = l;
				173	}
				174	else
				175	{
				176	/* Can't skip a divide, just do the loop code once more. */
				177
				178	SUBC_LIMB (c, l, s, c);
				179	l = (l * inverse) & GMP_NUMB_MASK;
				180	umul_ppmm (h, dummy, l, dmul);
				181	c += h;
				182	ret = c;
				183	}
				184
				185	ASSERT (orig_c < d ? ret < d : ret <= d);
				186	return ret;
				187	}
				188
				189
				190
				191	#if 0
				192
				193	/* The following is an alternate form that might shave one cycle on a
				194	superscalar processor since it takes c+=h off the dependent chain,
				195	leaving just a low product, high product, and a subtract.
				196
				197	This is for CPU specific implementations to consider. A special case for
				198	high<divisor and/or size==1 can be added if desired.
				199
				200	Notice that c is only ever 0 or 1, since if s-c produces a borrow then
				201	x=0xFF..FF and x-h cannot produce a borrow. The c=(x>s) could become
				202	c=(x==0xFF..FF) too, if that helped. */
				203
				204	mp_limb_t
				205	mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
				206	{
				207	mp_limb_t s, x, y, inverse, dummy, dmul, c1, c2;
				208	mp_limb_t c = 0;
				209	mp_size_t i;
				210
				211	ASSERT (size >= 1);
				212	ASSERT (d & 1);
				213
				214	binvert_limb (inverse, d);
				215	dmul = d << GMP_NAIL_BITS;
				216
				217	for (i = 0; i < size; i++)
				218	{
				219	ASSERT (c==0 \|\| c==1);
				220
				221	s = src[i];
				222	SUBC_LIMB (c1, x, s, c);
				223
				224	SUBC_LIMB (c2, y, x, h);
				225	c = c1 + c2;
				226
				227	y = (y * inverse) & GMP_NUMB_MASK;
				228	umul_ppmm (h, dummy, y, dmul);
				229	}
				230
				231	h += c;
				232	return h;
				233	}
				234
				235	#endif