Blame - third_party/gmp/mpn/generic/diveby3.c - RealtimeRoboticsGroup/test

blob: 7dee0bcd95ca5bdb9c785d364365b2962d2788ee [file] [log] [blame]

Austin Schuh	dace2a6	2020-08-18 10:56:48 -0700	[diff] [blame^]	1	/* mpn_divexact_by3c -- mpn exact division by 3.
				2
				3	Copyright 2000-2003, 2008 Free Software Foundation, Inc.
				4
				5	This file is part of the GNU MP Library.
				6
				7	The GNU MP Library is free software; you can redistribute it and/or modify
				8	it under the terms of either:
				9
				10	* the GNU Lesser General Public License as published by the Free
				11	Software Foundation; either version 3 of the License, or (at your
				12	option) any later version.
				13
				14	or
				15
				16	* the GNU General Public License as published by the Free Software
				17	Foundation; either version 2 of the License, or (at your option) any
				18	later version.
				19
				20	or both in parallel, as here.
				21
				22	The GNU MP Library is distributed in the hope that it will be useful, but
				23	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
				24	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				25	for more details.
				26
				27	You should have received copies of the GNU General Public License and the
				28	GNU Lesser General Public License along with the GNU MP Library. If not,
				29	see https://www.gnu.org/licenses/. */
				30
				31	#include "gmp-impl.h"
				32
				33	#if DIVEXACT_BY3_METHOD == 0
				34
				35	mp_limb_t
				36	mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
				37	{
				38	mp_limb_t r;
				39	r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);
				40
				41	/* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
				42	We want to return C. We compute the remainder mod 4 and notice that the
				43	inverse of (2^(2k)-1)/3 mod 4 is 1. */
				44	return r & 3;
				45	}
				46
				47	#endif
				48
				49	#if DIVEXACT_BY3_METHOD == 1
				50
				51	/* The algorithm here is basically the same as mpn_divexact_1, as described
				52	in the manual. Namely at each step q = (src[i]-c)*inverse, and new c =
				53	borrow(src[i]-c) + high(divisor*q). But because the divisor is just 3,
				54	high(divisor*q) can be determined with two comparisons instead of a
				55	multiply.
				56
				57	The "c += ..."s add the high limb of 3*l to c. That high limb will be 0,
				58	1 or 2. Doing two separate "+="s seems to give better code on gcc (as of
				59	2.95.2 at least).
				60
				61	It will be noted that the new c is formed by adding three values each 0
				62	or 1. But the total is only 0, 1 or 2. When the subtraction src[i]-c
				63	causes a borrow, that leaves a limb value of either 0xFF...FF or
				64	0xFF...FE. The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
				65	0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
				66	respectively, hence a total of no more than 2.
				67
				68	Alternatives:
				69
				70	This implementation has each multiply on the dependent chain, due to
				71	"l=s-c". See below for alternative code which avoids that. */
				72
				73	mp_limb_t
				74	mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
				75	{
				76	mp_limb_t l, q, s;
				77	mp_size_t i;
				78
				79	ASSERT (un >= 1);
				80	ASSERT (c == 0 \|\| c == 1 \|\| c == 2);
				81	ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
				82
				83	i = 0;
				84	do
				85	{
				86	s = up[i];
				87	SUBC_LIMB (c, l, s, c);
				88
				89	q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
				90	rp[i] = q;
				91
				92	c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
				93	c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
				94	}
				95	while (++i < un);
				96
				97	ASSERT (c == 0 \|\| c == 1 \|\| c == 2);
				98	return c;
				99	}
				100
				101
				102	#endif
				103
				104	#if DIVEXACT_BY3_METHOD == 2
				105
				106	/* The following alternative code re-arranges the quotient calculation from
				107	(src[i]-c)*inverse to instead
				108
				109	q = src[i]inverse - cinverse
				110
				111	thereby allowing src[i]*inverse to be scheduled back as far as desired,
				112	making full use of multiplier throughput and leaving just some carry
				113	handing on the dependent chain.
				114
				115	The carry handling consists of determining the c for the next iteration.
				116	This is the same as described above, namely look for any borrow from
				117	src[i]-c, and at the high of 3*q.
				118
				119	high(3*q) is done with two comparisons as above (in c2 and c3). The
				120	borrow from src[i]-c is incorporated into those by noting that if there's
				121	a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
				122	giving q = 0x55..55 or 0xAA..AA. Adding 1 to either of those q values is
				123	enough to make high(3*q) come out 1 bigger, as required.
				124
				125	l = -c*inverse is calculated at the same time as c, since for most chips
				126	it can be more conveniently derived from separate c1/c2/c3 values than
				127	from a combined c equal to 0, 1 or 2.
				128
				129	The net effect is that with good pipelining this loop should be able to
				130	run at perhaps 4 cycles/limb, depending on available execute resources
				131	etc.
				132
				133	Usage:
				134
				135	This code is not used by default, since we really can't rely on the
				136	compiler generating a good software pipeline, nor on such an approach
				137	even being worthwhile on all CPUs.
				138
				139	Itanium is one chip where this algorithm helps though, see
				140	mpn/ia64/diveby3.asm. */
				141
				142	mp_limb_t
				143	mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
				144	{
				145	mp_limb_t s, sm, cl, q, qx, c2, c3;
				146	mp_size_t i;
				147
				148	ASSERT (un >= 1);
				149	ASSERT (cy == 0 \|\| cy == 1 \|\| cy == 2);
				150	ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
				151
				152	cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;
				153
				154	for (i = 0; i < un; i++)
				155	{
				156	s = up[i];
				157	sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
				158
				159	q = (cl + sm) & GMP_NUMB_MASK;
				160	rp[i] = q;
				161	qx = q + (s < cy);
				162
				163	c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
				164	c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;
				165
				166	cy = c2 + c3;
				167	cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
				168	}
				169
				170	return cy;
				171	}
				172
				173	#endif