Blame - third_party/gmp/mpz/bin_uiui.c - RealtimeRoboticsGroup/test

blob: b77628fdfc23a4ec112eedd5af9c8983b184c799 [file] [log] [blame]

Austin Schuh	bb1338c	2024-06-15 19:31:16 -0700	[diff] [blame]	1	/* mpz_bin_uiui - compute n over k.
				2
				3	Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
				4
				5	Copyright 2010-2012, 2015-2018 Free Software Foundation, Inc.
				6
				7	This file is part of the GNU MP Library.
				8
				9	The GNU MP Library is free software; you can redistribute it and/or modify
				10	it under the terms of either:
				11
				12	* the GNU Lesser General Public License as published by the Free
				13	Software Foundation; either version 3 of the License, or (at your
				14	option) any later version.
				15
				16	or
				17
				18	* the GNU General Public License as published by the Free Software
				19	Foundation; either version 2 of the License, or (at your option) any
				20	later version.
				21
				22	or both in parallel, as here.
				23
				24	The GNU MP Library is distributed in the hope that it will be useful, but
				25	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
				26	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				27	for more details.
				28
				29	You should have received copies of the GNU General Public License and the
				30	GNU Lesser General Public License along with the GNU MP Library. If not,
				31	see https://www.gnu.org/licenses/. */
				32
				33	#include "gmp-impl.h"
				34	#include "longlong.h"
				35
				36	#ifndef BIN_GOETGHELUCK_THRESHOLD
				37	#define BIN_GOETGHELUCK_THRESHOLD 512
				38	#endif
				39	#ifndef BIN_UIUI_ENABLE_SMALLDC
				40	#define BIN_UIUI_ENABLE_SMALLDC 1
				41	#endif
				42	#ifndef BIN_UIUI_RECURSIVE_SMALLDC
				43	#define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32)
				44	#endif
				45
				46	/* Algorithm:
				47
				48	Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)
				49	which are then accumulated into mpn numbers. The first inner loop
				50	accumulates divisor factors, the 2nd inner loop accumulates exactly the same
				51	number of dividend factors. We avoid accumulating more for the divisor,
				52	even with its smaller factors, since we else cannot guarantee divisibility.
				53
				54	Since we know each division will yield an integer, we compute the quotient
				55	using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod
				56	2^t.
				57
				58	Improvements:
				59
				60	(1) An obvious improvement to this code would be to compute mod 2^t
				61	everywhere. Unfortunately, we cannot determine t beforehand, unless we
				62	invoke some approximation, such as Stirling's formula. Of course, we don't
				63	need t to be tight. However, it is not clear that this would help much,
				64	our numbers are kept reasonably small already.
				65
				66	(2) Compute nmax/kmax semi-accurately, without scalar division or a loop.
				67	Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,
				68	would make it both reasonably accurate and fast. (We could use a table
				69	stored into a limb, perhaps.) The table should take the removed factors of
				70	2 into account (those done on-the-fly in mulN).
				71
				72	(3) The first time in the loop we compute the odd part of a
				73	factorial in kp, we might use oddfac_1 for this task.
				74	*/
				75
				76	/* This threshold determines how large divisor to accumulate before we call
				77	bdiv. Perhaps we should never call bdiv, and accumulate all we are told,
				78	since we are just basecase code anyway? Presumably, this depends on the
				79	relative speed of the asymptotically fast code and this code. */
				80	#define SOME_THRESHOLD 20
				81
				82	/* Multiply-into-limb functions. These remove factors of 2 on-the-fly. FIXME:
				83	All versions of MAXFACS don't take this 2 removal into account now, meaning
				84	that then, shifting just adds some overhead. (We remove factors from the
				85	completed limb anyway.) */
				86
				87	static mp_limb_t
				88	mul1 (mp_limb_t m)
				89	{
				90	return m;
				91	}
				92
				93	static mp_limb_t
				94	mul2 (mp_limb_t m)
				95	{
				96	/* We need to shift before multiplying, to avoid an overflow. */
				97	mp_limb_t m01 = (m \| 1) * ((m + 1) >> 1);
				98	return m01;
				99	}
				100
				101	static mp_limb_t
				102	mul3 (mp_limb_t m)
				103	{
				104	mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
				105	mp_limb_t m2 = (m + 2);
				106	return m01 * m2;
				107	}
				108
				109	static mp_limb_t
				110	mul4 (mp_limb_t m)
				111	{
				112	mp_limb_t m03 = (m + 0) * (m + 3) >> 1;
				113	return m03 * (m03 + 1); /* mul2 (m03) ? */
				114	}
				115
				116	static mp_limb_t
				117	mul5 (mp_limb_t m)
				118	{
				119	mp_limb_t m03 = (m + 0) * (m + 3) >> 1;
				120	mp_limb_t m034 = m03 * (m + 4);
				121	return (m03 + 1) * m034;
				122	}
				123
				124	static mp_limb_t
				125	mul6 (mp_limb_t m)
				126	{
				127	mp_limb_t m05 = (m + 0) * (m + 5);
				128	mp_limb_t m1234 = (m05 + 5) * (m05 + 5) >> 3;
				129	return m1234 * (m05 >> 1);
				130	}
				131
				132	static mp_limb_t
				133	mul7 (mp_limb_t m)
				134	{
				135	mp_limb_t m05 = (m + 0) * (m + 5);
				136	mp_limb_t m1234 = (m05 + 5) * (m05 + 5) >> 3;
				137	mp_limb_t m056 = m05 * (m + 6) >> 1;
				138	return m1234 * m056;
				139	}
				140
				141	static mp_limb_t
				142	mul8 (mp_limb_t m)
				143	{
				144	mp_limb_t m07 = (m + 0) * (m + 7);
				145	mp_limb_t m0257 = m07 * (m07 + 10) >> 3;
				146	mp_limb_t m1346 = m07 + 9 + m0257;
				147	return m0257 * m1346;
				148	}
				149
				150	/*
				151	static mp_limb_t
				152	mul9 (mp_limb_t m)
				153	{
				154	return (m + 8) * mul8 (m) ;
				155	}
				156
				157	static mp_limb_t
				158	mul10 (mp_limb_t m)
				159	{
				160	mp_limb_t m09 = (m + 0) * (m + 9);
				161	mp_limb_t m18 = (m09 >> 1) + 4;
				162	mp_limb_t m0369 = m09 * (m09 + 18) >> 3;
				163	mp_limb_t m2457 = m09 * 2 + 35 + m0369;
				164	return ((m0369 * m2457) >> 1) * m18;
				165	}
				166	*/
				167
				168	typedef mp_limb_t (* mulfunc_t) (mp_limb_t);
				169
				170	static const mulfunc_t mulfunc[] = {mul1,mul2,mul3,mul4,mul5,mul6,mul7,mul8 /* ,mul9,mul10 */};
				171	#define M (numberof(mulfunc))
				172
				173	/* Number of factors-of-2 removed by the corresponding mulN function. */
				174	static const unsigned char tcnttab[] = {0, 1, 1, 2, 2, 4, 4, 6 /,6,8/};
				175
				176	#if 1
				177	/* This variant is inaccurate but share the code with other functions. */
				178	#define MAXFACS(max,l) \
				179	do { \
				180	(max) = log_n_max (l); \
				181	} while (0)
				182	#else
				183
				184	/* This variant is exact(?) but uses a loop. It takes the 2 removal
				185	of mulN into account. */
				186	static const unsigned long ftab[] =
				187	#if GMP_NUMB_BITS == 64
				188	/* 1 to 8 factors per iteration */
				189	{CNST_LIMB(0xffffffffffffffff),CNST_LIMB(0x16a09e667),0x32cbfc,0x16a08,0x24c0,0xa11,0x345,0x1ab /,0xe9,0x8e /};
				190	#endif
				191	#if GMP_NUMB_BITS == 32
				192	/* 1 to 7 factors per iteration */
				193	{0xffffffff,0x16a09,0x7ff,0x168,0x6f,0x3d,0x20 /* ,0x17 */};
				194	#endif
				195
				196	#define MAXFACS(max,l) \
				197	do { \
				198	int __i; \
				199	for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--) \
				200	; \
				201	(max) = __i + 1; \
				202	} while (0)
				203	#endif
				204
				205	/* Entry i contains (i!/2^t)^(-1) where t is chosen such that the parenthesis
				206	is an odd integer. */
				207	static const mp_limb_t facinv[] = { ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE };
				208
				209	static void
				210	mpz_bdiv_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
				211	{
				212	unsigned nmax, kmax, nmaxnow, numfac;
				213	mp_ptr np, kp;
				214	mp_size_t nn, kn, alloc;
				215	mp_limb_t i, j, t, iii, jjj, cy, dinv;
				216	int cnt;
				217	mp_size_t maxn;
				218	TMP_DECL;
				219
				220	ASSERT (k > ODD_FACTORIAL_TABLE_LIMIT);
				221	TMP_MARK;
				222
				223	maxn = 1 + n / GMP_NUMB_BITS; /* absolutely largest result size (limbs) */
				224
				225	/* FIXME: This allocation might be insufficient, but is usually way too
				226	large. */
				227	alloc = SOME_THRESHOLD - 1 + MAX (3 * maxn / 2, SOME_THRESHOLD);
				228	alloc = MIN (alloc, (mp_size_t) k) + 1;
				229	TMP_ALLOC_LIMBS_2 (np, alloc, kp, SOME_THRESHOLD + 1);
				230
				231	MAXFACS (nmax, n);
				232	ASSERT (nmax <= M);
				233	MAXFACS (kmax, k);
				234	ASSERT (kmax <= M);
				235	ASSERT (k >= M);
				236
				237	i = n - k + 1;
				238
				239	np[0] = 1; nn = 1;
				240
				241	numfac = 1;
				242	j = ODD_FACTORIAL_TABLE_LIMIT + 1;
				243	jjj = ODD_FACTORIAL_TABLE_MAX;
				244	ASSERT (__gmp_oddfac_table[ODD_FACTORIAL_TABLE_LIMIT] == ODD_FACTORIAL_TABLE_MAX);
				245
				246	while (1)
				247	{
				248	kp[0] = jjj; /* store new factors */
				249	kn = 1;
				250	t = k - j + 1;
				251	kmax = MIN (kmax, t);
				252
				253	while (kmax != 0 && kn < SOME_THRESHOLD)
				254	{
				255	jjj = mulfunc[kmax - 1] (j);
				256	j += kmax; /* number of factors used */
				257	count_trailing_zeros (cnt, jjj); /* count low zeros */
				258	jjj >>= cnt; /* remove remaining low zeros */
				259	cy = mpn_mul_1 (kp, kp, kn, jjj); /* accumulate new factors */
				260	kp[kn] = cy;
				261	kn += cy != 0;
				262	t = k - j + 1;
				263	kmax = MIN (kmax, t);
				264	}
				265	numfac = j - numfac;
				266
				267	while (numfac != 0)
				268	{
				269	nmaxnow = MIN (nmax, numfac);
				270	iii = mulfunc[nmaxnow - 1] (i);
				271	i += nmaxnow; /* number of factors used */
				272	count_trailing_zeros (cnt, iii); /* count low zeros */
				273	iii >>= cnt; /* remove remaining low zeros */
				274	cy = mpn_mul_1 (np, np, nn, iii); /* accumulate new factors */
				275	np[nn] = cy;
				276	nn += cy != 0;
				277	numfac -= nmaxnow;
				278	}
				279
				280	ASSERT (nn < alloc);
				281
				282	binvert_limb (dinv, kp[0]);
				283	nn += (np[nn - 1] >= kp[kn - 1]);
				284	nn -= kn;
				285	mpn_sbpi1_bdiv_q (np, np, nn, kp, MIN(kn,nn), -dinv);
				286	mpn_neg (np, np, nn);
				287
				288	if (kmax == 0)
				289	break;
				290	numfac = j;
				291
				292	jjj = mulfunc[kmax - 1] (j);
				293	j += kmax; /* number of factors used */
				294	count_trailing_zeros (cnt, jjj); /* count low zeros */
				295	jjj >>= cnt; /* remove remaining low zeros */
				296	}
				297
				298	/* Put back the right number of factors of 2. */
				299	popc_limb (cnt, n - k);
				300	popc_limb (j, k);
				301	cnt += j;
				302	popc_limb (j, n);
				303	cnt -= j;
				304	if (cnt != 0)
				305	{
				306	ASSERT (cnt < GMP_NUMB_BITS); /* can happen, but not for intended use */
				307	cy = mpn_lshift (np, np, nn, cnt);
				308	np[nn] = cy;
				309	nn += cy != 0;
				310	}
				311
				312	nn -= np[nn - 1] == 0; /* normalisation */
				313
				314	kp = MPZ_NEWALLOC (r, nn);
				315	SIZ(r) = nn;
				316	MPN_COPY (kp, np, nn);
				317	TMP_FREE;
				318	}
				319
				320	static void
				321	mpz_smallk_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
				322	{
				323	unsigned nmax, numfac;
				324	mp_ptr rp;
				325	mp_size_t rn, alloc;
				326	mp_limb_t i, iii, cy;
				327	unsigned i2cnt, cnt;
				328
				329	MAXFACS (nmax, n);
				330	nmax = MIN (nmax, M);
				331
				332	i = n - k + 1;
				333
				334	i2cnt = __gmp_fac2cnt_table[k / 2 - 1]; /* low zeros count */
				335	if (nmax >= k)
				336	{
				337	MPZ_NEWALLOC (r, 1) [0] = mulfunc[k - 1] (i) * facinv[k - 2] >>
				338	(i2cnt - tcnttab[k - 1]);
				339	SIZ(r) = 1;
				340	return;
				341	}
				342
				343	count_leading_zeros (cnt, (mp_limb_t) n);
				344	cnt = GMP_LIMB_BITS - cnt;
				345	alloc = cnt * k / GMP_NUMB_BITS + 3; /* FIXME: ensure rounding is enough. */
				346	rp = MPZ_NEWALLOC (r, alloc);
				347
				348	rp[0] = mulfunc[nmax - 1] (i);
				349	rn = 1;
				350	i += nmax; /* number of factors used */
				351	i2cnt -= tcnttab[nmax - 1]; /* low zeros count */
				352	numfac = k - nmax;
				353	do
				354	{
				355	nmax = MIN (nmax, numfac);
				356	iii = mulfunc[nmax - 1] (i);
				357	i += nmax; /* number of factors used */
				358	i2cnt -= tcnttab[nmax - 1]; /* update low zeros count */
				359	cy = mpn_mul_1 (rp, rp, rn, iii); /* accumulate new factors */
				360	rp[rn] = cy;
				361	rn += cy != 0;
				362	numfac -= nmax;
				363	} while (numfac != 0);
				364
				365	ASSERT (rn < alloc);
				366
				367	mpn_pi1_bdiv_q_1 (rp, rp, rn, __gmp_oddfac_table[k], facinv[k - 2], i2cnt);
				368	/* A two-fold, branch-free normalisation is possible :*/
				369	/* rn -= rp[rn - 1] == 0; */
				370	/* rn -= rp[rn - 1] == 0; */
				371	MPN_NORMALIZE_NOT_ZERO (rp, rn);
				372
				373	SIZ(r) = rn;
				374	}
				375
				376	/* Algorithm:
				377
				378	Plain and simply multiply things together.
				379
				380	We tabulate factorials (k!/2^t)^(-1) mod B (where t is chosen such
				381	that k!/2^t is odd).
				382
				383	*/
				384
				385	static mp_limb_t
				386	bc_bin_uiui (unsigned int n, unsigned int k)
				387	{
				388	return ((__gmp_oddfac_table[n] * facinv[k - 2] * facinv[n - k - 2])
				389	<< (__gmp_fac2cnt_table[n / 2 - 1] - __gmp_fac2cnt_table[k / 2 - 1] - __gmp_fac2cnt_table[(n-k) / 2 - 1]))
				390	& GMP_NUMB_MASK;
				391	}
				392
				393	/* Algorithm:
				394
				395	Recursively exploit the relation
				396	bin(n,k) = bin(n,k>>1)*bin(n-k>>1,k-k>>1)/bin(k,k>>1) .
				397
				398	Values for binomial(k,k>>1) that fit in a limb are precomputed
				399	(with inverses).
				400	*/
				401
				402	/* bin2kk[i - ODD_CENTRAL_BINOMIAL_OFFSET] =
				403	binomial(i2,i)/2^t (where t is chosen so that it is odd). /
				404	static const mp_limb_t bin2kk[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE };
				405
				406	/* bin2kkinv[i] = bin2kk[i]^-1 mod B */
				407	static const mp_limb_t bin2kkinv[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE };
				408
				409	/* bin2kk[i] = binomial((i+MIN_S)2,i+MIN_S)/2^t. This table contains the t values. /
				410	static const unsigned char fac2bin[] = { CENTRAL_BINOMIAL_2FAC_TABLE };
				411
				412	static void
				413	mpz_smallkdc_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
				414	{
				415	mp_ptr rp;
				416	mp_size_t rn;
				417	unsigned long int hk;
				418
				419	hk = k >> 1;
				420
				421	if ((! BIN_UIUI_RECURSIVE_SMALLDC) \|\| hk <= ODD_FACTORIAL_TABLE_LIMIT)
				422	mpz_smallk_bin_uiui (r, n, hk);
				423	else
				424	mpz_smallkdc_bin_uiui (r, n, hk);
				425	k -= hk;
				426	n -= hk;
				427	if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) {
				428	mp_limb_t cy;
				429	rn = SIZ (r);
				430	rp = MPZ_REALLOC (r, rn + 1);
				431	cy = mpn_mul_1 (rp, rp, rn, bc_bin_uiui (n, k));
				432	rp [rn] = cy;
				433	rn += cy != 0;
				434	} else {
				435	mp_limb_t buffer[ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3];
				436	mpz_t t;
				437
				438	ALLOC (t) = ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3;
				439	PTR (t) = buffer;
				440	if ((! BIN_UIUI_RECURSIVE_SMALLDC) \|\| k <= ODD_FACTORIAL_TABLE_LIMIT)
				441	mpz_smallk_bin_uiui (t, n, k);
				442	else
				443	mpz_smallkdc_bin_uiui (t, n, k);
				444	mpz_mul (r, r, t);
				445	rp = PTR (r);
				446	rn = SIZ (r);
				447	}
				448
				449	mpn_pi1_bdiv_q_1 (rp, rp, rn, bin2kk[k - ODD_CENTRAL_BINOMIAL_OFFSET],
				450	bin2kkinv[k - ODD_CENTRAL_BINOMIAL_OFFSET],
				451	fac2bin[k - ODD_CENTRAL_BINOMIAL_OFFSET] - (k != hk));
				452	/* A two-fold, branch-free normalisation is possible :*/
				453	/* rn -= rp[rn - 1] == 0; */
				454	/* rn -= rp[rn - 1] == 0; */
				455	MPN_NORMALIZE_NOT_ZERO (rp, rn);
				456
				457	SIZ(r) = rn;
				458	}
				459
				460	/* mpz_goetgheluck_bin_uiui(RESULT, N, K) -- Set RESULT to binomial(N,K).
				461	*
				462	* Contributed to the GNU project by Marco Bodrato.
				463	*
				464	* Implementation of the algorithm by P. Goetgheluck, "Computing
				465	* Binomial Coefficients", The American Mathematical Monthly, Vol. 94,
				466	* No. 4 (April 1987), pp. 360-365.
				467	*
				468	* Acknowledgment: Peter Luschny did spot the slowness of the previous
				469	* code and suggested the reference.
				470	*/
				471
				472	/* TODO: Remove duplicated constants / macros / static functions...
				473	*/
				474
				475	/*************************************************************/
				476	/* Section macros: common macros, for swing/fac/bin (&sieve) */
				477	/*************************************************************/
				478
				479	#define FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I) \
				480	if ((PR) > (MAX_PR)) { \
				481	(VEC)[(I)++] = (PR); \
				482	(PR) = 1; \
				483	}
				484
				485	#define FACTOR_LIST_STORE(P, PR, MAX_PR, VEC, I) \
				486	do { \
				487	if ((PR) > (MAX_PR)) { \
				488	(VEC)[(I)++] = (PR); \
				489	(PR) = (P); \
				490	} else \
				491	(PR) *= (P); \
				492	} while (0)
				493
				494	#define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve) \
				495	__max_i = (end); \
				496	\
				497	do { \
				498	++__i; \
				499	if (((sieve)[__index] & __mask) == 0) \
				500	{ \
				501	mp_limb_t prime; \
				502	prime = id_to_n(__i)
				503
				504	#define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve) \
				505	do { \
				506	mp_limb_t __mask, __index, __max_i, __i; \
				507	\
				508	__i = (start)-(off); \
				509	__index = __i / GMP_LIMB_BITS; \
				510	__mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS); \
				511	__i += (off); \
				512	\
				513	LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)
				514
				515	#define LOOP_ON_SIEVE_STOP \
				516	} \
				517	__mask = __mask << 1 \| __mask >> (GMP_LIMB_BITS-1); \
				518	__index += __mask & 1; \
				519	} while (__i <= __max_i)
				520
				521	#define LOOP_ON_SIEVE_END \
				522	LOOP_ON_SIEVE_STOP; \
				523	} while (0)
				524
				525	/*********************************************************/
				526	/* Section sieve: sieving functions and tools for primes */
				527	/*********************************************************/
				528
				529	#if WANT_ASSERT
				530	static mp_limb_t
				531	bit_to_n (mp_limb_t bit) { return (bit*3+4)\|1; }
				532	#endif
				533
				534	/* id_to_n (x) = bit_to_n (x-1) = (id3+1)\|1/
				535	static mp_limb_t
				536	id_to_n (mp_limb_t id) { return id*3+1+(id&1); }
				537
				538	/* n_to_bit (n) = ((n-1)&(-CNST_LIMB(2)))/3U-1 */
				539	static mp_limb_t
				540	n_to_bit (mp_limb_t n) { return ((n-5)\|1)/3U; }
				541
				542	static mp_size_t
				543	primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; }
				544
				545	/*********************************************************/
				546	/* Section binomial: fast binomial implementation */
				547	/*********************************************************/
				548
				549	#define COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I) \
				550	do { \
				551	mp_limb_t __a, __b, __prime, __ma,__mb; \
				552	__prime = (P); \
				553	__a = (N); __b = (K); __mb = 0; \
				554	FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I); \
				555	do { \
				556	__mb += __b % __prime; __b /= __prime; \
				557	__ma = __a % __prime; __a /= __prime; \
				558	if (__ma < __mb) { \
				559	__mb = 1; (PR) *= __prime; \
				560	} else __mb = 0; \
				561	} while (__a >= __prime); \
				562	} while (0)
				563
				564	#define SH_COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I) \
				565	do { \
				566	mp_limb_t __prime; \
				567	__prime = (P); \
				568	if (((N) % __prime) < ((K) % __prime)) { \
				569	FACTOR_LIST_STORE (__prime, PR, MAX_PR, VEC, I); \
				570	} \
				571	} while (0)
				572
				573	/* Returns an approximation of the sqare root of x.
				574	* It gives:
				575	* limb_apprsqrt (x) ^ 2 <= x < (limb_apprsqrt (x)+1) ^ 2
				576	* or
				577	* x <= limb_apprsqrt (x) ^ 2 <= x * 9/8
				578	*/
				579	static mp_limb_t
				580	limb_apprsqrt (mp_limb_t x)
				581	{
				582	int s;
				583
				584	ASSERT (x > 2);
				585	count_leading_zeros (s, x);
				586	s = (GMP_LIMB_BITS - s) >> 1;
				587	return ((CNST_LIMB(1) << s) + (x >> s)) >> 1;
				588	}
				589
				590	static void
				591	mpz_goetgheluck_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
				592	{
				593	mp_limb_t sieve, factors, count;
				594	mp_limb_t prod, max_prod;
				595	mp_size_t j;
				596	TMP_DECL;
				597
				598	ASSERT (BIN_GOETGHELUCK_THRESHOLD >= 13);
				599	ASSERT (n >= 25);
				600
				601	TMP_MARK;
				602	sieve = TMP_ALLOC_LIMBS (primesieve_size (n));
				603
				604	count = gmp_primesieve (sieve, n) + 1;
				605	factors = TMP_ALLOC_LIMBS (count / log_n_max (n) + 1);
				606
				607	max_prod = GMP_NUMB_MAX / n;
				608
				609	/* Handle primes = 2, 3 separately. */
				610	popc_limb (count, n - k);
				611	popc_limb (j, k);
				612	count += j;
				613	popc_limb (j, n);
				614	count -= j;
				615	prod = CNST_LIMB(1) << count;
				616
				617	j = 0;
				618	COUNT_A_PRIME (3, n, k, prod, max_prod, factors, j);
				619
				620	/* Accumulate prime factors from 5 to n/2 */
				621	{
				622	mp_limb_t s;
				623
				624	s = limb_apprsqrt(n);
				625	s = n_to_bit (s);
				626	ASSERT (bit_to_n (s+1) * bit_to_n (s+1) > n);
				627	ASSERT (s <= n_to_bit (n >> 1));
				628	LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (5), s, 0,sieve);
				629	COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
				630	LOOP_ON_SIEVE_STOP;
				631
				632	ASSERT (max_prod <= GMP_NUMB_MAX / 2);
				633	max_prod <<= 1;
				634
				635	LOOP_ON_SIEVE_CONTINUE (prime, n_to_bit (n >> 1),sieve);
				636	SH_COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
				637	LOOP_ON_SIEVE_END;
				638
				639	max_prod >>= 1;
				640	}
				641
				642	/* Store primes from (n-k)+1 to n */
				643	ASSERT (n_to_bit (n - k) < n_to_bit (n));
				644
				645	LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (n - k) + 1, n_to_bit (n), 0,sieve);
				646	FACTOR_LIST_STORE (prime, prod, max_prod, factors, j);
				647	LOOP_ON_SIEVE_END;
				648
				649	if (LIKELY (j != 0))
				650	{
				651	factors[j++] = prod;
				652	mpz_prodlimbs (r, factors, j);
				653	}
				654	else
				655	{
				656	MPZ_NEWALLOC (r, 1)[0] = prod;
				657	SIZ (r) = 1;
				658	}
				659	TMP_FREE;
				660	}
				661
				662	#undef COUNT_A_PRIME
				663	#undef SH_COUNT_A_PRIME
				664	#undef LOOP_ON_SIEVE_END
				665	#undef LOOP_ON_SIEVE_STOP
				666	#undef LOOP_ON_SIEVE_BEGIN
				667	#undef LOOP_ON_SIEVE_CONTINUE
				668
				669	/*********************************************************/
				670	/* End of implementation of Goetgheluck's algorithm */
				671	/*********************************************************/
				672
				673	void
				674	mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
				675	{
				676	if (UNLIKELY (n < k)) {
				677	SIZ (r) = 0;
				678	#if BITS_PER_ULONG > GMP_NUMB_BITS
				679	} else if (UNLIKELY (n > GMP_NUMB_MAX)) {
				680	mpz_t tmp;
				681
				682	mpz_init_set_ui (tmp, n);
				683	mpz_bin_ui (r, tmp, k);
				684	mpz_clear (tmp);
				685	#endif
				686	} else {
				687	ASSERT (n <= GMP_NUMB_MAX);
				688	/* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */
				689	k = MIN (k, n - k);
				690	if (k < 2) {
				691	MPZ_NEWALLOC (r, 1)[0] = k ? n : 1; /* 1 + ((-k) & (n-1)); */
				692	SIZ(r) = 1;
				693	} else if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { /* k >= 2, n >= 4 */
				694	MPZ_NEWALLOC (r, 1)[0] = bc_bin_uiui (n, k);
				695	SIZ(r) = 1;
				696	} else if (k <= ODD_FACTORIAL_TABLE_LIMIT)
				697	mpz_smallk_bin_uiui (r, n, k);
				698	else if (BIN_UIUI_ENABLE_SMALLDC &&
				699	k <= (BIN_UIUI_RECURSIVE_SMALLDC ? ODD_CENTRAL_BINOMIAL_TABLE_LIMIT : ODD_FACTORIAL_TABLE_LIMIT)* 2)
				700	mpz_smallkdc_bin_uiui (r, n, k);
				701	else if (ABOVE_THRESHOLD (k, BIN_GOETGHELUCK_THRESHOLD) &&
				702	k > (n >> 4)) /* k > ODD_FACTORIAL_TABLE_LIMIT */
				703	mpz_goetgheluck_bin_uiui (r, n, k);
				704	else
				705	mpz_bdiv_bin_uiui (r, n, k);
				706	}
				707	}