/* mpn_divexact(qp,np,nn,dp,dn,tp) -- Divide N = {np,nn} by D = {dp,dn} storing
   the result in Q = {qp,nn-dn+1} expecting no remainder.  Overlap allowed
   between Q and N; all other overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

   Copyright 2006, 2007, 2009, 2017 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free Software
       Foundation; either version 2 of the License, or (at your option) any
       later version.

   or both in parallel, as here.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received copies of the GNU General Public License and the
   GNU Lesser General Public License along with the GNU MP Library.  If not,
   see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"
#include "longlong.h"
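
/* Illustrative usage sketch (not part of GMP; the values are made up).
   The caller must know the division is exact and must provide
   nn - dn + 1 limbs of space at qp.  For N = 42 ({42, 0}, nn = 2) and
   D = 7 ({7}, dn = 1):

     mp_limb_t np[2] = { 42, 0 };
     mp_limb_t dp[1] = { 7 };
     mp_limb_t qp[2];
     mpn_divexact (qp, np, 2, dp, 1);

   leaves qp[0] == 6 and qp[1] == 0, since 42/7 = 6.  Overlap between Q
   and N is allowed; all other overlap is disallowed.  */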

#if 1
void
mpn_divexact (mp_ptr qp,
	      mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp, mp_size_t dn)
{
  unsigned shift;
  mp_size_t qn;
  mp_ptr tp;
  TMP_DECL;

  ASSERT (dn > 0);
  ASSERT (nn >= dn);
  ASSERT (dp[dn-1] > 0);

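  /* If D is divisible by the limb base B (i.e. its low limb is zero), then
     N, being an exact multiple of D, must be as well; strip the common zero
     low limbs from both operands.  The quotient is unchanged.  */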
  while (dp[0] == 0)
    {
      ASSERT (np[0] == 0);
      dp++;
      np++;
      dn--;
      nn--;
    }

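  /* A single-limb divisor: MPN_DIVREM_OR_DIVEXACT_1 is expected to pick
     between the plain 1-limb division and the 1-limb exact division,
     whichever is tuned to be faster at this size.  */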
  if (dn == 1)
    {
      MPN_DIVREM_OR_DIVEXACT_1 (qp, np, nn, dp[0]);
      return;
    }

  TMP_MARK;

  qn = nn + 1 - dn;
  count_trailing_zeros (shift, dp[0]);

  if (shift > 0)
    {
      mp_ptr wp;
      mp_size_t ss;
      ss = (dn > qn) ? qn + 1 : dn;

      tp = TMP_ALLOC_LIMBS (ss);
      mpn_rshift (tp, dp, ss, shift);
      dp = tp;

      /* Since we have excluded dn == 1, we have nn > qn, and we need
	 to shift one limb beyond qn.  */
      wp = TMP_ALLOC_LIMBS (qn + 1);
      mpn_rshift (wp, np, qn + 1, shift);
      np = wp;
    }

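  /* Shifting both operands right by the common trailing-zero count leaves
     the quotient unchanged: N = Q*D implies N has at least as many trailing
     zero bits as D.  Furthermore, bdiv computes Q mod B^qn, and the inverse
     of D mod B^qn depends only on the low qn limbs of D, so a divisor longer
     than the quotient can be truncated to qn limbs.  */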
  if (dn > qn)
    dn = qn;

  tp = TMP_ALLOC_LIMBS (mpn_bdiv_q_itch (qn, dn));
  mpn_bdiv_q (qp, np, qn, dp, dn, tp);
  TMP_FREE;

  /* Since bdiv_q computes -N/D (mod B^{qn}), we must negate now.  */
  mpn_neg (qp, qp, qn);
}
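
/* Worked example of the 2-adic (Hensel) division that bdiv performs, using
   a toy base B = 2^8 = 256 for readability (GMP of course works with full
   limbs): with D odd, the quotient of an exact division is
   Q = N * D^{-1} mod B^qn.  For D = 3 we have 3 * 171 = 513 == 1 (mod 256),
   so D^{-1} = 171; for N = 15, N * 171 = 2565 == 5 (mod 256), and indeed
   15/3 = 5.  Since mpn_bdiv_q returns -N/D rather than N/D (per the comment
   above), the final mpn_neg recovers the quotient.  */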

#else

/* We use Jebelean's bidirectional exact division algorithm.  This is
   somewhat naively implemented, with quotient parts of equal size done by
   2-adic division and truncating division.  Since 2-adic division is
   faster, it should be given the larger chunk.

   This code is horrendously ugly, in all sorts of ways.

   * It was hacked without much care or thought, but with a testing program.
   * It handles scratch space frivolously, and furthermore the itch function
     is broken.
   * It doesn't provide any measures to deal with mu_divappr_q's +3 error.
     We have yet to provoke an error due to this, though.
   * Algorithm selection leaves a lot to be desired.  In particular, the
     choice between DC and MU isn't a single cutoff point, but we treat it
     like one.
   * It makes the msb part 1 or 2 limbs larger than the lsb part, despite
     the latter being faster.  We should at least reverse this, but perhaps
     the lsb part should be made considerably larger.  (How do we tune this?)
*/
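
/* Rough sketch of the split (illustrative; the exact sizes, including the
   overlap adjustments, are computed below).  With qn = nn - dn + 1 quotient
   limbs, the low qn0 = ((nn - dn) >> 1) + 1 limbs of Q are obtained by a
   bdiv (2-adic division) of the low limbs of N by the low limbs of D, and
   the remaining high limbs by a divappr (truncating approximate division)
   of the high limbs of N by the high limbs of D.  The two parts overlap in
   one limb, which is used at the end to detect and fix a possible
   off-by-one carry in the high part.  */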

mp_size_t
mpn_divexact_itch (mp_size_t nn, mp_size_t dn)
{
  return nn + dn;		/* FIXME: this is not right */
}

void
mpn_divexact (mp_ptr qp,
	      mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp, mp_size_t dn,
	      mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t nn0, qn0;
  mp_size_t nn1, qn1;
  mp_ptr tp;
  mp_limb_t qml;
  mp_limb_t qh;
  int cnt;
  mp_ptr xdp;
  mp_limb_t di;
  mp_limb_t cy;
  gmp_pi1_t dinv;
  TMP_DECL;

  TMP_MARK;

  qn = nn - dn + 1;

  /* For small divisors and small quotients, don't use Jebelean's algorithm. */
  if (dn < DIVEXACT_JEB_THRESHOLD || qn < DIVEXACT_JEB_THRESHOLD)
    {
      tp = scratch;
      MPN_COPY (tp, np, qn);
      binvert_limb (di, dp[0]); di = -di;
      dn = MIN (dn, qn);
      mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di);
      TMP_FREE;
      return;
    }

  qn0 = ((nn - dn) >> 1) + 1;	/* low quotient size */

  /* If the quotient is much larger than the divisor, the bidirectional
     algorithm does not work as currently implemented.  Fall back to plain
     bdiv.  */
  if (qn0 > dn)
    {
      if (BELOW_THRESHOLD (dn, DC_BDIV_Q_THRESHOLD))
	{
	  tp = scratch;
	  MPN_COPY (tp, np, qn);
	  binvert_limb (di, dp[0]); di = -di;
	  dn = MIN (dn, qn);
	  mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di);
	}
      else if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD))
	{
	  tp = scratch;
	  MPN_COPY (tp, np, qn);
	  binvert_limb (di, dp[0]); di = -di;
	  mpn_dcpi1_bdiv_q (qp, tp, qn, dp, dn, di);
	}
      else
	{
	  mpn_mu_bdiv_q (qp, np, qn, dp, dn, scratch);
	}
      TMP_FREE;
      return;
    }

  nn0 = qn0 + qn0;

  nn1 = nn0 - 1 + ((nn - dn) & 1);
  qn1 = qn0;
  if (LIKELY (qn0 != dn))
    {
      nn1 = nn1 + 1;
      qn1 = qn1 + 1;
      if (UNLIKELY (dp[dn - 1] == 1 && qn1 != dn))
	{
	  /* If the leading divisor limb == 1, i.e. has just one bit, we have
	     to include an extra limb in order to get the needed overlap.  */
	  /* FIXME: Now with the mu_divappr_q function, we should really need
	     more overlap, yet no failure has been provoked.  That indicates
	     one of two things: (1) the test code is not good, or (2) we
	     actually overlap too much by default.  */
	  nn1 = nn1 + 1;
	  qn1 = qn1 + 1;
	}
    }

  tp = TMP_ALLOC_LIMBS (nn1 + 1);

  count_leading_zeros (cnt, dp[dn - 1]);

  /* Normalize the divisor, storing it into the tmp area.  */
  if (cnt != 0)
    {
      xdp = TMP_ALLOC_LIMBS (qn1);
      mpn_lshift (xdp, dp + dn - qn1, qn1, cnt);
    }
  else
    {
      xdp = (mp_ptr) dp + dn - qn1;
    }

  /* Shift the dividend according to the divisor normalization.  */
  /* FIXME: We compute too much here for XX_divappr_q, but these functions'
     interfaces want a pointer to the imaginary least significant limb, not
     to the least significant *used* limb.  Of course, we could leave nn1-qn1
     rubbish limbs in the low part, to save some time.  */
  if (cnt != 0)
    {
      cy = mpn_lshift (tp, np + nn - nn1, nn1, cnt);
      if (cy != 0)
	{
	  tp[nn1] = cy;
	  nn1++;
	}
    }
  else
    {
      /* FIXME: This copy is not needed for mpn_mu_divappr_q, except when the
	 conditional mpn_sub_n right before that call is executed.  */
      MPN_COPY (tp, np + nn - nn1, nn1);
    }

  invert_pi1 (dinv, xdp[qn1 - 1], xdp[qn1 - 2]);
  if (BELOW_THRESHOLD (qn1, DC_DIVAPPR_Q_THRESHOLD))
    {
      qp[qn0 - 1 + nn1 - qn1] = mpn_sbpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, dinv.inv32);
    }
  else if (BELOW_THRESHOLD (qn1, MU_DIVAPPR_Q_THRESHOLD))
    {
      qp[qn0 - 1 + nn1 - qn1] = mpn_dcpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, &dinv);
    }
  else
    {
      /* FIXME: mpn_mu_divappr_q doesn't handle qh != 0.  Work around it with
	 a conditional subtraction here.  */
      qh = mpn_cmp (tp + nn1 - qn1, xdp, qn1) >= 0;
      if (qh)
	mpn_sub_n (tp + nn1 - qn1, tp + nn1 - qn1, xdp, qn1);
      mpn_mu_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, scratch);
      qp[qn0 - 1 + nn1 - qn1] = qh;
    }
  qml = qp[qn0 - 1];

  binvert_limb (di, dp[0]); di = -di;

  if (BELOW_THRESHOLD (qn0, DC_BDIV_Q_THRESHOLD))
    {
      MPN_COPY (tp, np, qn0);
      mpn_sbpi1_bdiv_q (qp, tp, qn0, dp, qn0, di);
    }
  else if (BELOW_THRESHOLD (qn0, MU_BDIV_Q_THRESHOLD))
    {
      MPN_COPY (tp, np, qn0);
      mpn_dcpi1_bdiv_q (qp, tp, qn0, dp, qn0, di);
    }
  else
    {
      mpn_mu_bdiv_q (qp, np, qn0, dp, qn0, scratch);
    }

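  /* The exact bdiv of the low part has just recomputed the overlap limb
     qp[qn0 - 1], overwriting the value the approximate truncating division
     stored there (saved in qml).  If qml is smaller than the exact value,
     the approximation wrapped past a limb boundary, i.e. the limbs above
     the overlap came out one too large; decrement them.  */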
  if (qml < qp[qn0 - 1])
    mpn_decr_u (qp + qn0, 1);

  TMP_FREE;
}
#endif