Blame - third_party/gmp/mpn/generic/gcdext.c - RealtimeRoboticsGroup/test

blob: 55014807effee62d66a786daa871fd5ed9cf42e1 [file] [log] [blame]

Austin Schuh	bb1338c	2024-06-15 19:31:16 -0700	[diff] [blame]	1	/* mpn_gcdext -- Extended Greatest Common Divisor.
				2
				3	Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
				4	Inc.
				5
				6	This file is part of the GNU MP Library.
				7
				8	The GNU MP Library is free software; you can redistribute it and/or modify
				9	it under the terms of either:
				10
				11	* the GNU Lesser General Public License as published by the Free
				12	Software Foundation; either version 3 of the License, or (at your
				13	option) any later version.
				14
				15	or
				16
				17	* the GNU General Public License as published by the Free Software
				18	Foundation; either version 2 of the License, or (at your option) any
				19	later version.
				20
				21	or both in parallel, as here.
				22
				23	The GNU MP Library is distributed in the hope that it will be useful, but
				24	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
				25	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				26	for more details.
				27
				28	You should have received copies of the GNU General Public License and the
				29	GNU Lesser General Public License along with the GNU MP Library. If not,
				30	see https://www.gnu.org/licenses/. */
				31
				32	#include "gmp-impl.h"
				33	#include "longlong.h"
				34
				35	/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
				36	the size is returned (if inputs are non-normalized, result may be
				37	non-normalized too). Temporary space needed is M->n + n.
				38	*/
				39	static size_t
				40	hgcd_mul_matrix_vector (struct hgcd_matrix *M,
				41	mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
				42	{
				43	mp_limb_t ah, bh;
				44
				45	/* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
				46
				47	t = u00 * a
				48	r = u10 * b
				49	r += t;
				50
				51	t = u11 * b
				52	b = u01 * a
				53	b += t;
				54	*/
				55
				56	if (M->n >= n)
				57	{
				58	mpn_mul (tp, M->p[0][0], M->n, ap, n);
				59	mpn_mul (rp, M->p[1][0], M->n, bp, n);
				60	}
				61	else
				62	{
				63	mpn_mul (tp, ap, n, M->p[0][0], M->n);
				64	mpn_mul (rp, bp, n, M->p[1][0], M->n);
				65	}
				66
				67	ah = mpn_add_n (rp, rp, tp, n + M->n);
				68
				69	if (M->n >= n)
				70	{
				71	mpn_mul (tp, M->p[1][1], M->n, bp, n);
				72	mpn_mul (bp, M->p[0][1], M->n, ap, n);
				73	}
				74	else
				75	{
				76	mpn_mul (tp, bp, n, M->p[1][1], M->n);
				77	mpn_mul (bp, ap, n, M->p[0][1], M->n);
				78	}
				79	bh = mpn_add_n (bp, bp, tp, n + M->n);
				80
				81	n += M->n;
				82	if ( (ah \| bh) > 0)
				83	{
				84	rp[n] = ah;
				85	bp[n] = bh;
				86	n++;
				87	}
				88	else
				89	{
				90	/* Normalize */
				91	while ( (rp[n-1] \| bp[n-1]) == 0)
				92	n--;
				93	}
				94
				95	return n;
				96	}
				97
				98	#define COMPUTE_V_ITCH(n) (2*(n))
				99
				100	/* Computes \|v\| = \|(g - u a)\| / b, where u may be positive or
				101	negative, and v is of the opposite sign. max(a, b) is of size n, u and
				102	v at most size n, and v must have space for n+1 limbs. */
				103	static mp_size_t
				104	compute_v (mp_ptr vp,
				105	mp_srcptr ap, mp_srcptr bp, mp_size_t n,
				106	mp_srcptr gp, mp_size_t gn,
				107	mp_srcptr up, mp_size_t usize,
				108	mp_ptr tp)
				109	{
				110	mp_size_t size;
				111	mp_size_t an;
				112	mp_size_t bn;
				113	mp_size_t vn;
				114
				115	ASSERT (n > 0);
				116	ASSERT (gn > 0);
				117	ASSERT (usize != 0);
				118
				119	size = ABS (usize);
				120	ASSERT (size <= n);
				121	ASSERT (up[size-1] > 0);
				122
				123	an = n;
				124	MPN_NORMALIZE (ap, an);
				125	ASSERT (gn <= an);
				126
				127	if (an >= size)
				128	mpn_mul (tp, ap, an, up, size);
				129	else
				130	mpn_mul (tp, up, size, ap, an);
				131
				132	size += an;
				133
				134	if (usize > 0)
				135	{
				136	/* \|v\| = -v = (u a - g) / b */
				137
				138	ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
				139	MPN_NORMALIZE (tp, size);
				140	if (size == 0)
				141	return 0;
				142	}
				143	else
				144	{ /* \|v\| = v = (g - u a) / b = (g + \|u\| a) / b. Since g <= a,
				145	(g + \|u\| a) always fits in (\|usize\| + an) limbs. */
				146
				147	ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
				148	size -= (tp[size - 1] == 0);
				149	}
				150
				151	/* Now divide t / b. There must be no remainder */
				152	bn = n;
				153	MPN_NORMALIZE (bp, bn);
				154	ASSERT (size >= bn);
				155
				156	vn = size + 1 - bn;
				157	ASSERT (vn <= n + 1);
				158
				159	mpn_divexact (vp, tp, size, bp, bn);
				160	vn -= (vp[vn-1] == 0);
				161
				162	return vn;
				163	}
				164
				165	/* Temporary storage:
				166
				167	Initial division: Quotient of at most an - n + 1 <= an limbs.
				168
				169	Storage for u0 and u1: 2(n+1).
				170
				171	Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
				172
				173	Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
				174
				175	When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
				176
				177	When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
				178
				179	For the lehmer call after the loop, Let T denote
				180	GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
				181	u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
				182	for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
				183	sufficient for both operations.
				184
				185	*/
				186
				187	/* Optimal choice of p seems difficult. In each iteration the division
				188	* of work between hgcd and the updates of u0 and u1 depends on the
				189	* current size of the u. It may be desirable to use a different
				190	* choice of p in each iteration. Also the input size seems to matter;
				191	* choosing p = n / 3 in the first iteration seems to improve
				192	* performance slightly for input size just above the threshold, but
				193	* degrade performance for larger inputs. */
				194	#define CHOOSE_P_1(n) ((n) / 2)
				195	#define CHOOSE_P_2(n) ((n) / 3)
				196
				197	mp_size_t
				198	mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
				199	mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
				200	{
				201	mp_size_t talloc;
				202	mp_size_t scratch;
				203	mp_size_t matrix_scratch;
				204	mp_size_t ualloc = n + 1;
				205
				206	struct gcdext_ctx ctx;
				207	mp_size_t un;
				208	mp_ptr u0;
				209	mp_ptr u1;
				210
				211	mp_ptr tp;
				212
				213	TMP_DECL;
				214
				215	ASSERT (an >= n);
				216	ASSERT (n > 0);
				217	ASSERT (bp[n-1] > 0);
				218
				219	TMP_MARK;
				220
				221	/* FIXME: Check for small sizes first, before setting up temporary
				222	storage etc. */
				223	talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);
				224
				225	/* For initial division */
				226	scratch = an - n + 1;
				227	if (scratch > talloc)
				228	talloc = scratch;
				229
				230	if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
				231	{
				232	/* For hgcd loop. */
				233	mp_size_t hgcd_scratch;
				234	mp_size_t update_scratch;
				235	mp_size_t p1 = CHOOSE_P_1 (n);
				236	mp_size_t p2 = CHOOSE_P_2 (n);
				237	mp_size_t min_p = MIN(p1, p2);
				238	mp_size_t max_p = MAX(p1, p2);
				239	matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
				240	hgcd_scratch = mpn_hgcd_itch (n - min_p);
				241	update_scratch = max_p + n - 1;
				242
				243	scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
				244	if (scratch > talloc)
				245	talloc = scratch;
				246
				247	/* Final mpn_gcdext_lehmer_n call. Need space for u and for
				248	copies of a and b. */
				249	scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
				250	+ 3*GCDEXT_DC_THRESHOLD;
				251
				252	if (scratch > talloc)
				253	talloc = scratch;
				254
				255	/* Cofactors u0 and u1 */
				256	talloc += 2*(n+1);
				257	}
				258
				259	tp = TMP_ALLOC_LIMBS(talloc);
				260
				261	if (an > n)
				262	{
				263	mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);
				264
				265	if (mpn_zero_p (ap, n))
				266	{
				267	MPN_COPY (gp, bp, n);
				268	*usizep = 0;
				269	TMP_FREE;
				270	return n;
				271	}
				272	}
				273
				274	if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
				275	{
				276	mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);
				277
				278	TMP_FREE;
				279	return gn;
				280	}
				281
				282	MPN_ZERO (tp, 2*ualloc);
				283	u0 = tp; tp += ualloc;
				284	u1 = tp; tp += ualloc;
				285
				286	ctx.gp = gp;
				287	ctx.up = up;
				288	ctx.usize = usizep;
				289
				290	{
				291	/* For the first hgcd call, there are no u updates, and it makes
				292	some sense to use a different choice for p. */
				293
				294	/* FIXME: We could trim use of temporary storage, since u0 and u1
				295	are not used yet. For the hgcd call, we could swap in the u0
				296	and u1 pointers for the relevant matrix elements. */
				297
				298	struct hgcd_matrix M;
				299	mp_size_t p = CHOOSE_P_1 (n);
				300	mp_size_t nn;
				301
				302	mpn_hgcd_matrix_init (&M, n - p, tp);
				303	nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
				304	if (nn > 0)
				305	{
				306	ASSERT (M.n <= (n - p - 1)/2);
				307	ASSERT (M.n + p <= (p + n - 1) / 2);
				308
				309	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
				310	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
				311
				312	MPN_COPY (u0, M.p[1][0], M.n);
				313	MPN_COPY (u1, M.p[1][1], M.n);
				314	un = M.n;
				315	while ( (u0[un-1] \| u1[un-1] ) == 0)
				316	un--;
				317	}
				318	else
				319	{
				320	/* mpn_hgcd has failed. Then either one of a or b is very
				321	small, or the difference is very small. Perform one
				322	subtraction followed by one division. */
				323	u1[0] = 1;
				324
				325	ctx.u0 = u0;
				326	ctx.u1 = u1;
				327	ctx.tp = tp + n; /* ualloc */
				328	ctx.un = 1;
				329
				330	/* Temporary storage n */
				331	n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
				332	if (n == 0)
				333	{
				334	TMP_FREE;
				335	return ctx.gn;
				336	}
				337
				338	un = ctx.un;
				339	ASSERT (un < ualloc);
				340	}
				341	}
				342
				343	while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
				344	{
				345	struct hgcd_matrix M;
				346	mp_size_t p = CHOOSE_P_2 (n);
				347	mp_size_t nn;
				348
				349	mpn_hgcd_matrix_init (&M, n - p, tp);
				350	nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
				351	if (nn > 0)
				352	{
				353	mp_ptr t0;
				354
				355	t0 = tp + matrix_scratch;
				356	ASSERT (M.n <= (n - p - 1)/2);
				357	ASSERT (M.n + p <= (p + n - 1) / 2);
				358
				359	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
				360	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);
				361
				362	/* By the same analysis as for mpn_hgcd_matrix_mul */
				363	ASSERT (M.n + un <= ualloc);
				364
				365	/* FIXME: This copying could be avoided by some swapping of
				366	* pointers. May need more temporary storage, though. */
				367	MPN_COPY (t0, u0, un);
				368
				369	/* Temporary storage ualloc */
				370	un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);
				371
				372	ASSERT (un < ualloc);
				373	ASSERT ( (u0[un-1] \| u1[un-1]) > 0);
				374	}
				375	else
				376	{
				377	/* mpn_hgcd has failed. Then either one of a or b is very
				378	small, or the difference is very small. Perform one
				379	subtraction followed by one division. */
				380	ctx.u0 = u0;
				381	ctx.u1 = u1;
				382	ctx.tp = tp + n; /* ualloc */
				383	ctx.un = un;
				384
				385	/* Temporary storage n */
				386	n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
				387	if (n == 0)
				388	{
				389	TMP_FREE;
				390	return ctx.gn;
				391	}
				392
				393	un = ctx.un;
				394	ASSERT (un < ualloc);
				395	}
				396	}
				397	/* We have A = ... a + ... b
				398	B = u0 a + u1 b
				399
				400	a = u1 A + ... B
				401	b = -u0 A + ... B
				402
				403	with bounds
				404
				405	\|u0\|, \|u1\| <= B / min(a, b)
				406
				407	We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
				408	in which case the only reduction done so far is a = A - k B for
				409	some k.
				410
				411	Compute g = u a + v b = (u u1 - v u0) A + (...) B
				412	Here, u, v are bounded by
				413
				414	\|u\| <= b,
				415	\|v\| <= a
				416	*/
				417
				418	ASSERT ( (ap[n-1] \| bp[n-1]) > 0);
				419
				420	if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
				421	{
				422	/* Must return the smallest cofactor, +u1 or -u0 */
				423	int c;
				424
				425	MPN_COPY (gp, ap, n);
				426
				427	MPN_CMP (c, u0, u1, un);
				428	/* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
				429	this case we choose the cofactor + 1, corresponding to G = A
				430	- k B, rather than -1, corresponding to G = - A + (k+1) B. */
				431	ASSERT (c != 0 \|\| (un == 1 && u0[0] == 1 && u1[0] == 1));
				432	if (c < 0)
				433	{
				434	MPN_NORMALIZE (u0, un);
				435	MPN_COPY (up, u0, un);
				436	*usizep = -un;
				437	}
				438	else
				439	{
				440	MPN_NORMALIZE_NOT_ZERO (u1, un);
				441	MPN_COPY (up, u1, un);
				442	*usizep = un;
				443	}
				444
				445	TMP_FREE;
				446	return n;
				447	}
				448	else if (UNLIKELY (u0[0] == 0) && un == 1)
				449	{
				450	mp_size_t gn;
				451	ASSERT (u1[0] == 1);
				452
				453	/* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
				454	gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);
				455
				456	TMP_FREE;
				457	return gn;
				458	}
				459	else
				460	{
				461	mp_size_t u0n;
				462	mp_size_t u1n;
				463	mp_size_t lehmer_un;
				464	mp_size_t lehmer_vn;
				465	mp_size_t gn;
				466
				467	mp_ptr lehmer_up;
				468	mp_ptr lehmer_vp;
				469	int negate;
				470
				471	lehmer_up = tp; tp += n;
				472
				473	/* Call mpn_gcdext_lehmer_n with copies of a and b. */
				474	MPN_COPY (tp, ap, n);
				475	MPN_COPY (tp + n, bp, n);
				476	gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);
				477
				478	u0n = un;
				479	MPN_NORMALIZE (u0, u0n);
				480	ASSERT (u0n > 0);
				481
				482	if (lehmer_un == 0)
				483	{
				484	/* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
				485	MPN_COPY (up, u0, u0n);
				486	*usizep = -u0n;
				487
				488	TMP_FREE;
				489	return gn;
				490	}
				491
				492	lehmer_vp = tp;
				493	/* Compute v = (g - u a) / b */
				494	lehmer_vn = compute_v (lehmer_vp,
				495	ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);
				496
				497	if (lehmer_un > 0)
				498	negate = 0;
				499	else
				500	{
				501	lehmer_un = -lehmer_un;
				502	negate = 1;
				503	}
				504
				505	u1n = un;
				506	MPN_NORMALIZE (u1, u1n);
				507	ASSERT (u1n > 0);
				508
				509	ASSERT (lehmer_un + u1n <= ualloc);
				510	ASSERT (lehmer_vn + u0n <= ualloc);
				511
				512	/* We may still have v == 0 */
				513
				514	/* Compute u u0 */
				515	if (lehmer_un <= u1n)
				516	/* Should be the common case */
				517	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
				518	else
				519	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);
				520
				521	un = u1n + lehmer_un;
				522	un -= (up[un - 1] == 0);
				523
				524	if (lehmer_vn > 0)
				525	{
				526	mp_limb_t cy;
				527
				528	/* Overwrites old u1 value */
				529	if (lehmer_vn <= u0n)
				530	/* Should be the common case */
				531	mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
				532	else
				533	mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);
				534
				535	u1n = u0n + lehmer_vn;
				536	u1n -= (u1[u1n - 1] == 0);
				537
				538	if (u1n <= un)
				539	{
				540	cy = mpn_add (up, up, un, u1, u1n);
				541	}
				542	else
				543	{
				544	cy = mpn_add (up, u1, u1n, up, un);
				545	un = u1n;
				546	}
				547	up[un] = cy;
				548	un += (cy != 0);
				549
				550	ASSERT (un < ualloc);
				551	}
				552	*usizep = negate ? -un : un;
				553
				554	TMP_FREE;
				555	return gn;
				556	}
				557	}