Blame - third_party/gmp/mpn/generic/perfsqr.c - RealtimeRoboticsGroup/test

blob: 38a1a91ed1d70d2ecd5319b99824e2a156091aa6 [file] [log] [blame]

Austin Schuh	dace2a6	2020-08-18 10:56:48 -0700	[diff] [blame]	1	/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
				2	zero otherwise.
				3
				4	Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software
				5	Foundation, Inc.
				6
				7	This file is part of the GNU MP Library.
				8
				9	The GNU MP Library is free software; you can redistribute it and/or modify
				10	it under the terms of either:
				11
				12	* the GNU Lesser General Public License as published by the Free
				13	Software Foundation; either version 3 of the License, or (at your
				14	option) any later version.
				15
				16	or
				17
				18	* the GNU General Public License as published by the Free Software
				19	Foundation; either version 2 of the License, or (at your option) any
				20	later version.
				21
				22	or both in parallel, as here.
				23
				24	The GNU MP Library is distributed in the hope that it will be useful, but
				25	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
				26	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				27	for more details.
				28
				29	You should have received copies of the GNU General Public License and the
				30	GNU Lesser General Public License along with the GNU MP Library. If not,
				31	see https://www.gnu.org/licenses/. */
				32
				33	#include <stdio.h> /* for NULL */
				34	#include "gmp-impl.h"
				35	#include "longlong.h"
				36
				37	#include "perfsqr.h"
				38
				39
				40	/* change this to "#define TRACE(x) x" for diagnostics */
				41	#define TRACE(x)
				42
				43
				44
				45	/* PERFSQR_MOD_* detects non-squares using residue tests.
				46
				47	A macro PERFSQR_MOD_TEST is set up by gen-psqr.c in perfsqr.h. It takes
				48	{up,usize} modulo a selected modulus to get a remainder r. For 32-bit or
				49	64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34,
				50	or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP
				51	used. PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this
				52	case too.
				53
				54	PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or
				55	PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table
				56	data indicating residues and non-residues modulo those divisors. The
				57	table data is in 1 or 2 limbs worth of bits respectively, per the size of
				58	each d.
				59
				60	A "modexact" style remainder is taken to reduce r modulo d.
				61	PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
				62	the table data. Notice there's just one multiplication by a constant
				63	"inv", for each d.
				64
				65	The modexact doesn't produce a true r%d remainder, instead idx satisfies
				66	"-(idx<<PERFSQR_MOD_BITS) == r mod d". Because d is odd, this factor
				67	-2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is
				68	accounted for by having the table data suitably permuted.
				69
				70	The remainder r fits within PERFSQR_MOD_BITS which is less than a limb.
				71	In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit
				72	each divisor d meaning the modexact multiply can take place entirely
				73	within one limb, giving the compiler the chance to optimize it, in a way
				74	that say umul_ppmm would not give.
				75
				76	There's no need for the divisors d to be prime, in fact gen-psqr.c makes
				77	a deliberate effort to combine factors so as to reduce the number of
				78	separate tests done on r. But such combining is limited to d <=
				79	2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs.
				80
				81	Alternatives:
				82
				83	It'd be possible to use bigger divisors d, and more than 2 limbs of table
				84	data, but this doesn't look like it would be of much help to the prime
				85	factors in the usual moduli 2^24-1 or 2^48-1.
				86
				87	The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're
				88	just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime
				89	factors. 2^32-1 and 2^64-1 would be equally easy to calculate, but have
				90	fewer prime factors.
				91
				92	The nails case usually ends up using mpn_mod_1, which is a lot slower
				93	than mpn_mod_34lsub1. Perhaps other such special moduli could be found
				94	for the nails case. Two-term things like 2^30-2^15-1 might be
				95	candidates. Or at worst some on-the-fly de-nailing would allow the plain
				96	2^24-1 to be used. Currently nails are too preliminary to be worried
				97	about.
				98
				99	*/
				100
				101	#define PERFSQR_MOD_MASK ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1)
				102
				103	#define MOD34_BITS (GMP_NUMB_BITS / 4 * 3)
				104	#define MOD34_MASK ((CNST_LIMB(1) << MOD34_BITS) - 1)
				105
				106	#define PERFSQR_MOD_34(r, up, usize) \
				107	do { \
				108	(r) = mpn_mod_34lsub1 (up, usize); \
				109	(r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS); \
				110	} while (0)
				111
				112	/* FIXME: The %= here isn't good, and might destroy any savings from keeping
				113	the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
				114	Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
				115	and a shift count, like mpn_preinv_divrem_1. But mod_34lsub1 is our
				116	normal case, so lets not worry too much about mod_1. */
				117	#define PERFSQR_MOD_PP(r, up, usize) \
				118	do { \
				119	if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD)) \
				120	{ \
				121	(r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM, \
				122	PERFSQR_PP_INVERTED); \
				123	(r) %= PERFSQR_PP; \
				124	} \
				125	else \
				126	{ \
				127	(r) = mpn_mod_1 (up, usize, PERFSQR_PP); \
				128	} \
				129	} while (0)
				130
				131	#define PERFSQR_MOD_IDX(idx, r, d, inv) \
				132	do { \
				133	mp_limb_t q; \
				134	ASSERT ((r) <= PERFSQR_MOD_MASK); \
				135	ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1); \
				136	ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK); \
				137	\
				138	q = ((r) * (inv)) & PERFSQR_MOD_MASK; \
				139	ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK)); \
				140	(idx) = (q * (d)) >> PERFSQR_MOD_BITS; \
				141	} while (0)
				142
				143	#define PERFSQR_MOD_1(r, d, inv, mask) \
				144	do { \
				145	unsigned idx; \
				146	ASSERT ((d) <= GMP_LIMB_BITS); \
				147	PERFSQR_MOD_IDX(idx, r, d, inv); \
				148	TRACE (printf (" PERFSQR_MOD_1 d=%u r=%lu idx=%u\n", \
				149	d, r%d, idx)); \
				150	if ((((mask) >> idx) & 1) == 0) \
				151	{ \
				152	TRACE (printf (" non-square\n")); \
				153	return 0; \
				154	} \
				155	} while (0)
				156
				157	/* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
				158	sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch. */
				159	#define PERFSQR_MOD_2(r, d, inv, mhi, mlo) \
				160	do { \
				161	mp_limb_t m; \
				162	unsigned idx; \
				163	ASSERT ((d) <= 2*GMP_LIMB_BITS); \
				164	\
				165	PERFSQR_MOD_IDX (idx, r, d, inv); \
				166	TRACE (printf (" PERFSQR_MOD_2 d=%u r=%lu idx=%u\n", \
				167	d, r%d, idx)); \
				168	m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi)); \
				169	idx %= GMP_LIMB_BITS; \
				170	if (((m >> idx) & 1) == 0) \
				171	{ \
				172	TRACE (printf (" non-square\n")); \
				173	return 0; \
				174	} \
				175	} while (0)
				176
				177
				178	int
				179	mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
				180	{
				181	ASSERT (usize >= 1);
				182
				183	TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
				184
				185	/* The first test excludes 212/256 (82.8%) of the perfect square candidates
				186	in O(1) time. */
				187	{
				188	unsigned idx = up[0] % 0x100;
				189	if (((sq_res_0x100[idx / GMP_LIMB_BITS]
				190	>> (idx % GMP_LIMB_BITS)) & 1) == 0)
				191	return 0;
				192	}
				193
				194	#if 0
				195	/* Check that we have even multiplicity of 2, and then check that the rest is
				196	a possible perfect square. Leave disabled until we can determine this
				197	really is an improvement. It it is, it could completely replace the
				198	simple probe above, since this should throw out more non-squares, but at
				199	the expense of somewhat more cycles. */
				200	{
				201	mp_limb_t lo;
				202	int cnt;
				203	lo = up[0];
				204	while (lo == 0)
				205	up++, lo = up[0], usize--;
				206	count_trailing_zeros (cnt, lo);
				207	if ((cnt & 1) != 0)
				208	return 0; /* return of not even multiplicity of 2 */
				209	lo >>= cnt; /* shift down to align lowest non-zero bit */
				210	lo >>= 1; /* shift away lowest non-zero bit */
				211	if ((lo & 3) != 0)
				212	return 0;
				213	}
				214	#endif
				215
				216
				217	/* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
				218	according to their residues modulo small primes (or powers of
				219	primes). See perfsqr.h. */
				220	PERFSQR_MOD_TEST (up, usize);
				221
				222
				223	/* For the third and last test, we finally compute the square root,
				224	to make sure we've really got a perfect square. */
				225	{
				226	mp_ptr root_ptr;
				227	int res;
				228	TMP_DECL;
				229
				230	TMP_MARK;
				231	root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2);
				232
				233	/* Iff mpn_sqrtrem returns zero, the square is perfect. */
				234	res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
				235	TMP_FREE;
				236
				237	return res;
				238	}
				239	}