/* hgcd2.c

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library. If not,
see https://www.gnu.org/licenses/. */

#include "gmp-impl.h"
#include "longlong.h"

#ifndef HGCD2_DIV1_METHOD
#define HGCD2_DIV1_METHOD 3
#endif

#ifndef HGCD2_DIV2_METHOD
#define HGCD2_DIV2_METHOD 2
#endif

#if GMP_NAIL_BITS != 0
#error Nails not implemented
#endif

#if HAVE_NATIVE_mpn_div_11

#define div1 mpn_div_11
/* Single-limb division optimized for small quotients.
   Returned value holds d0 = r, d1 = q. */
mp_double_limb_t div1 (mp_limb_t, mp_limb_t);

#elif HGCD2_DIV1_METHOD == 1

static inline mp_double_limb_t
div1 (mp_limb_t n0, mp_limb_t d0)
{
  mp_double_limb_t res;
  res.d1 = n0 / d0;
  res.d0 = n0 - res.d1 * d0;

  return res;
}

#elif HGCD2_DIV1_METHOD == 2

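/* Shift-and-subtract division: the divisor is aligned with the numerator,
   then one quotient bit is produced per iteration with a branchless
   conditional subtraction. */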
static mp_double_limb_t
div1 (mp_limb_t n0, mp_limb_t d0)
{
  mp_double_limb_t res;
  int ncnt, dcnt, cnt;
  mp_limb_t q;
  mp_limb_t mask;

  ASSERT (n0 >= d0);

  count_leading_zeros (ncnt, n0);
  count_leading_zeros (dcnt, d0);
  cnt = dcnt - ncnt;

  d0 <<= cnt;

  q = -(mp_limb_t) (n0 >= d0);
  n0 -= d0 & q;
  d0 >>= 1;
  q = -q;

  while (--cnt >= 0)
    {
      mask = -(mp_limb_t) (n0 >= d0);
      n0 -= d0 & mask;
      d0 >>= 1;
      q = (q << 1) - mask;
    }

  res.d0 = n0;
  res.d1 = q;
  return res;
}

#elif HGCD2_DIV1_METHOD == 3

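/* When the quotient is known to fit in 3 bits, compute it with three
   branchless conditional subtractions; otherwise fall back to a full
   hardware division. */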
static inline mp_double_limb_t
div1 (mp_limb_t n0, mp_limb_t d0)
{
  mp_double_limb_t res;
  if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
      || UNLIKELY (n0 >= (d0 << 3)))
    {
      res.d1 = n0 / d0;
      res.d0 = n0 - res.d1 * d0;
    }
  else
    {
      mp_limb_t q, mask;

      d0 <<= 2;

      mask = -(mp_limb_t) (n0 >= d0);
      n0 -= d0 & mask;
      q = 4 & mask;

      d0 >>= 1;
      mask = -(mp_limb_t) (n0 >= d0);
      n0 -= d0 & mask;
      q += 2 & mask;

      d0 >>= 1;
      mask = -(mp_limb_t) (n0 >= d0);
      n0 -= d0 & mask;
      q -= mask;

      res.d0 = n0;
      res.d1 = q;
    }
  return res;
}

#elif HGCD2_DIV1_METHOD == 4

/* Table quotients. We extract the NBITS most significant bits of the
   numerator limb, and the corresponding bits from the divisor limb, and use
   these to form an index into the table. This method is probably only useful
   for short pipelines with slow multiplication.

   Possible improvements:

   * Perhaps extract the highest NBITS of the divisor instead of the same bits
     as from the numerator. That would require another count_leading_zeros,
     and a post-multiply shift of the quotient.

   * Compress tables? Their values are tiny, and there are lots of zero
     entries (which are never used).

   * Round the table entries more cleverly?
*/

#ifndef NBITS
#define NBITS 5
#endif

#if NBITS == 5
/* This needs full division about 13.2% of the time. */
static const unsigned char tab[512] = {
  17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
  20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
  21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
  22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
  23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
  24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
  25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
  26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
  27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
  28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
  29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
  30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
  31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
  32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
};
#elif NBITS == 6
/* This needs full division about 9.8% of the time. */
static const unsigned char tab[2048] = {
  33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
  52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
  53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
  54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
  55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
  56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
  57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
  58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
  59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
  60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
  61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
  62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
  63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
  64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
};
#else
#error No table for provided NBITS
#endif

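/* The table is indexed by (nbi << NBITS) + dbi. After normalization nbi
   always has its top bit set, i.e. nbi >= 2^(NBITS-1), so tabp pre-subtracts
   that fixed offset of 2^(NBITS-1) * 2^NBITS instead of masking the bit off.
   For NBITS == 5 this means nbi - 16 selects one of the 16 rows of 32
   entries above. */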
static const unsigned char *tabp = tab - (1 << (NBITS - 1) << NBITS);

static inline mp_double_limb_t
div1 (mp_limb_t n0, mp_limb_t d0)
{
  int ncnt;
  size_t nbi, dbi;
  mp_limb_t q0;
  mp_limb_t r0;
  mp_limb_t mask;
  mp_double_limb_t res;

  ASSERT (n0 >= d0);  /* Actually only msb position is critical. */

  count_leading_zeros (ncnt, n0);
  nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
  dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);

  q0 = tabp[(nbi << NBITS) + dbi];
  r0 = n0 - q0 * d0;
  mask = -(mp_limb_t) (r0 >= d0);
  q0 -= mask;
  r0 -= d0 & mask;

  if (UNLIKELY (r0 >= d0))
    {
      q0 = n0 / d0;
      r0 = n0 - q0 * d0;
    }

  res.d1 = q0;
  res.d0 = r0;
  return res;
}

#elif HGCD2_DIV1_METHOD == 5

/* Table inverses of divisors. We don't bother with suppressing the msb from
   the tables. We index with the NBITS most significant divisor bits,
   including the always-set highest bit, but use addressing trickery via tabp
   to suppress it.

   Possible improvements:

   * Do the first multiply using 32-bit operations on 64-bit computers. At
     least on most Arm64 cores, that uses about a third of the resources. It
     is also a win on many x86-64 processors.
*/

#ifndef NBITS
#define NBITS 7
#endif

#if NBITS == 5
/* This needs full division about 1.63% of the time. */
static const unsigned char tab[16] = {
  63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
};
static const unsigned char *tabp = tab - (1 << (NBITS - 1));
#elif NBITS == 6
/* This needs full division about 0.93% of the time. */
static const unsigned char tab[32] = {
  127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
   84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
};
static const unsigned char *tabp = tab - (1 << (NBITS - 1));
#elif NBITS == 7
/* This needs full division about 0.49% of the time. */
static const unsigned char tab[64] = {
  255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
  203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
  169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
  145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
};
static const unsigned char *tabp = tab - (1 << (NBITS - 1));
#elif NBITS == 8
/* This needs full division about 0.26% of the time. */
static const unsigned short tab[128] = {
  511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
  454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
  408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
  371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
  340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
  314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
  291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
  272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
};
static const unsigned short *tabp = tab - (1 << (NBITS - 1));
#else
#error No table for provided NBITS
#endif

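/* Approximate the quotient by multiplying with a tabulated inverse of the
   divisor's high bits and correct it by at most one with a conditional
   subtraction; the rare cases where that is still not enough fall back to a
   plain division. */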
static inline mp_double_limb_t
div1 (mp_limb_t n0, mp_limb_t d0)
{
  int ncnt, dcnt;
  size_t dbi;
  mp_limb_t inv;
  mp_limb_t q0;
  mp_limb_t r0;
  mp_limb_t mask;
  mp_double_limb_t res;

  count_leading_zeros (ncnt, n0);
  count_leading_zeros (dcnt, d0);

  dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
  inv = tabp[dbi];
  q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
  r0 = n0 - q0 * d0;
  mask = -(mp_limb_t) (r0 >= d0);
  q0 -= mask;
  r0 -= d0 & mask;

  if (UNLIKELY (r0 >= d0))
    {
      q0 = n0 / d0;
      r0 = n0 - q0 * d0;
    }

  res.d1 = q0;
  res.d0 = r0;
  return res;
}

#else
#error Unknown HGCD2_DIV1_METHOD
#endif

#if HAVE_NATIVE_mpn_div_22

#define div2 mpn_div_22
/* Two-limb division optimized for small quotients. */
mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);

#elif HGCD2_DIV2_METHOD == 1

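/* Estimate the quotient from the high limbs with div1. When that estimate is
   at most d1 it exceeds the true quotient by at most one, so a single
   conditional adjustment suffices; otherwise normalize and use udiv_qrnnd
   for a schoolbook division step. */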
static mp_limb_t
div2 (mp_ptr rp,
      mp_limb_t n1, mp_limb_t n0,
      mp_limb_t d1, mp_limb_t d0)
{
  mp_double_limb_t rq = div1 (n1, d1);
  if (UNLIKELY (rq.d1 > d1))
    {
      mp_limb_t n2, q, t1, t0;
      int c;

      /* Normalize */
      count_leading_zeros (c, d1);
      ASSERT (c > 0);

      n2 = n1 >> (GMP_LIMB_BITS - c);
      n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
      n0 <<= c;
      d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
      d0 <<= c;

      udiv_qrnnd (q, n1, n2, n1, d1);
      umul_ppmm (t1, t0, q, d0);
      if (t1 > n1 || (t1 == n1 && t0 > n0))
        {
          ASSERT (q > 0);
          q--;
          sub_ddmmss (t1, t0, t1, t0, d1, d0);
        }
      sub_ddmmss (n1, n0, n1, n0, t1, t0);

      /* Undo normalization */
      rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
      rp[1] = n1 >> c;

      return q;
    }
  else
    {
      mp_limb_t q, t1, t0;
      n1 = rq.d0;
      q = rq.d1;
      umul_ppmm (t1, t0, q, d0);
      if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
        {
          ASSERT (q > 0);
          q--;
          sub_ddmmss (t1, t0, t1, t0, d1, d0);
        }
      sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
      return q;
    }
}

#elif HGCD2_DIV2_METHOD == 2

/* Bit-wise div2. Relies on fast count_leading_zeros. */
static mp_limb_t
div2 (mp_ptr rp,
      mp_limb_t n1, mp_limb_t n0,
      mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t q = 0;
  int ncnt;
  int dcnt;

  count_leading_zeros (ncnt, n1);
  count_leading_zeros (dcnt, d1);
  dcnt -= ncnt;

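  /* Shift the divisor left so that its high limb lines up with n1; the
     double shift by 1 and (GMP_LIMB_BITS - 1 - dcnt) avoids an undefined
     full-width shift when dcnt is 0. The loop below then produces one
     quotient bit per iteration, running dcnt + 1 times. */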
  d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
  d0 <<= dcnt;

  do
    {
      mp_limb_t mask;
      q <<= 1;
      if (UNLIKELY (n1 == d1))
        mask = -(n0 >= d0);
      else
        mask = -(n1 > d1);

      q -= mask;

      sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);

      d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
      d1 = d1 >> 1;
    }
  while (dcnt--);

  rp[0] = n0;
  rp[1] = n1;

  return q;
}
#else
#error Unknown HGCD2_DIV2_METHOD
#endif

/* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs
   matrix M. Returns 1 if we make progress, i.e. can perform at least
   one subtraction. Otherwise returns zero. */

/* FIXME: Possible optimizations:

   The div2 function starts with checking the most significant bit of
   the numerator. We could maintain normalized operands here and call
   hgcd2 with normalized operands only, which should make the code
   simpler and possibly faster.

   Experiment with table lookups on the most significant bits.

   This function is also a candidate for assembler implementation.
*/
int
mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
           struct hgcd_matrix1 *M)
{
  mp_limb_t u00, u01, u10, u11;

  if (ah < 2 || bh < 2)
    return 0;

  if (ah > bh || (ah == bh && al > bl))
    {
      sub_ddmmss (ah, al, ah, al, bh, bl);
      if (ah < 2)
        return 0;

      u00 = u01 = u11 = 1;
      u10 = 0;
    }
  else
    {
      sub_ddmmss (bh, bl, bh, bl, ah, al);
      if (bh < 2)
        return 0;

      u00 = u10 = u11 = 1;
      u01 = 0;
    }

  if (ah < bh)
    goto subtract_a;

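  /* Double-limb loop: reduce a and b alternately until the high limbs fit in
     half a limb, then pack the remaining significant bits and continue in
     the single-limb loop below. After the unconditional subtraction, div2's
     quotient is one short of the full quotient, hence the q++ before the
     matrix update. */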
  for (;;)
    {
      ASSERT (ah >= bh);
      if (ah == bh)
        goto done;

      if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
        {
          ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
          bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));

          break;
        }

      /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
         1), affecting the second column of M. */
      ASSERT (ah > bh);
      sub_ddmmss (ah, al, ah, al, bh, bl);

      if (ah < 2)
        goto done;

      if (ah <= bh)
        {
          /* Use q = 1 */
          u01 += u00;
          u11 += u10;
        }
      else
        {
          mp_limb_t r[2];
          mp_limb_t q = div2 (r, ah, al, bh, bl);
          al = r[0]; ah = r[1];
          if (ah < 2)
            {
              /* A is too small, but q is correct. */
              u01 += q * u00;
              u11 += q * u10;
              goto done;
            }
          q++;
          u01 += q * u00;
          u11 += q * u10;
        }
    subtract_a:
      ASSERT (bh >= ah);
      if (ah == bh)
        goto done;

      if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
        {
          ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
          bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));

          goto subtract_a1;
        }

      /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
         1), affecting the first column of M. */
      sub_ddmmss (bh, bl, bh, bl, ah, al);

      if (bh < 2)
        goto done;

      if (bh <= ah)
        {
          /* Use q = 1 */
          u00 += u01;
          u10 += u11;
        }
      else
        {
          mp_limb_t r[2];
          mp_limb_t q = div2 (r, bh, bl, ah, al);
          bl = r[0]; bh = r[1];
          if (bh < 2)
            {
              /* B is too small, but q is correct. */
              u00 += q * u01;
              u10 += q * u11;
              goto done;
            }
          q++;
          u00 += q * u01;
          u10 += q * u11;
        }
    }

  /* NOTE: Since we discard the least significant half limb, we don't get a
     truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */
  /* Single precision loop */
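  /* Here ah and bh hold the packed middle bits of a and b (the low half of
     the high limb and the high half of the low limb); keep reducing on
     single limbs until one value drops below 2^(GMP_LIMB_BITS/2 + 1). */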
  for (;;)
    {
      ASSERT (ah >= bh);

      ah -= bh;
      if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
        break;

      if (ah <= bh)
        {
          /* Use q = 1 */
          u01 += u00;
          u11 += u10;
        }
      else
        {
          mp_double_limb_t rq = div1 (ah, bh);
          mp_limb_t q = rq.d1;
          ah = rq.d0;

          if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
            {
              /* A is too small, but q is correct. */
              u01 += q * u00;
              u11 += q * u10;
              break;
            }
          q++;
          u01 += q * u00;
          u11 += q * u10;
        }
    subtract_a1:
      ASSERT (bh >= ah);

      bh -= ah;
      if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
        break;

      if (bh <= ah)
        {
          /* Use q = 1 */
          u00 += u01;
          u10 += u11;
        }
      else
        {
          mp_double_limb_t rq = div1 (bh, ah);
          mp_limb_t q = rq.d1;
          bh = rq.d0;

          if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
            {
              /* B is too small, but q is correct. */
              u00 += q * u01;
              u10 += q * u11;
              break;
            }
          q++;
          u00 += q * u01;
          u10 += q * u11;
        }
    }

 done:
  M->u[0][0] = u00; M->u[0][1] = u01;
  M->u[1][0] = u10; M->u[1][1] = u11;

  return 1;
}

/* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vectors must
 * have space for n + 1 limbs. Uses three buffers to avoid a copy. */
mp_size_t
mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
                             mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     r = u00 * a
     r += u10 * b
     b *= u11
     b += u01 * a
  */
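  /* The order matters: r is built in a buffer distinct from a and b, so the
     first two updates can read the original b, which is then safe to
     overwrite in place; a is read-only throughout. */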

#if HAVE_NATIVE_mpn_addaddmul_1msb0
  ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
  bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
#else
  ah = mpn_mul_1 (rp, ap, n, M->u[0][0]);
  ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);

  bh = mpn_mul_1 (bp, bp, n, M->u[1][1]);
  bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
#endif
  rp[n] = ah;
  bp[n] = bh;

  n += (ah | bh) > 0;
  return n;
}