blob: bc3fc81cb0a2cf652b60b2ae3367ebf6f2ff3c80 [file] [log] [blame]
Austin Schuhbb1338c2024-06-15 19:31:16 -07001/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2
3Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2020 Free Software
4Foundation, Inc.
5
6This file is part of the GNU MP Library.
7
8The GNU MP Library is free software; you can redistribute it and/or modify
9it under the terms of either:
10
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
14
15or
16
17 * the GNU General Public License as published by the Free Software
18 Foundation; either version 2 of the License, or (at your option) any
19 later version.
20
21or both in parallel, as here.
22
23The GNU MP Library is distributed in the hope that it will be useful, but
24WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26for more details.
27
28You should have received copies of the GNU General Public License and the
29GNU Lesser General Public License along with the GNU MP Library. If not,
30see https://www.gnu.org/licenses/. */
31
32/* You have to define the following before including this file:
33
34 UWtype -- An unsigned type, default type for operations (typically a "word")
35 UHWtype -- An unsigned type, at least half the size of UWtype
36 UDWtype -- An unsigned type, at least twice as large a UWtype
37 W_TYPE_SIZE -- size in bits of UWtype
38
39 SItype, USItype -- Signed and unsigned 32 bit types
40 DItype, UDItype -- Signed and unsigned 64 bit types
41
42 On a 32 bit machine UWtype should typically be USItype;
43 on a 64 bit machine, UWtype should typically be UDItype.
44
45 Optionally, define:
46
47 LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
48 NO_ASM -- Disable inline asm
49
50
51 CAUTION! Using this version of longlong.h outside of GMP is not safe. You
52 need to include gmp.h and gmp-impl.h, or certain things might not work as
53 expected.
54*/
55
56#define __BITS4 (W_TYPE_SIZE / 4)
57#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
58#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
59#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
60
61/* This is used to make sure no undesirable sharing between different libraries
62 that use this file takes place. */
63#ifndef __MPN
64#define __MPN(x) __##x
65#endif
66
67/* Define auxiliary asm macros.
68
69 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
70 UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
71 word product in HIGH_PROD and LOW_PROD.
72
73 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
74 UDWtype product. This is just a variant of umul_ppmm.
75
76 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
77 denominator) divides a UDWtype, composed by the UWtype integers
78 HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
79 in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
80 than DENOMINATOR for correct operation. If, in addition, the most
81 significant bit of DENOMINATOR must be 1, then the pre-processor symbol
82 UDIV_NEEDS_NORMALIZATION is defined to 1.
83
84 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
85 denominator). Like udiv_qrnnd but the numbers are signed. The quotient
86 is rounded towards 0.
87
88 5) count_leading_zeros(count, x) counts the number of zero-bits from the
89 msb to the first non-zero bit in the UWtype X. This is the number of
90 steps X needs to be shifted left to set the msb. Undefined for X == 0,
91 unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
92
93 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
94 from the least significant end.
95
96 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
97 high_addend_2, low_addend_2) adds two UWtype integers, composed by
98 HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
99 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
100 (i.e. carry out) is not stored anywhere, and is lost.
101
102 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
103 high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
104 composed by HIGH_MINUEND_1 and LOW_MINUEND_1, and HIGH_SUBTRAHEND_2 and
105 LOW_SUBTRAHEND_2 respectively. The result is placed in HIGH_DIFFERENCE
106 and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,
107 and is lost.
108
109 If any of these macros are left undefined for a particular CPU,
110 C macros are used.
111
112
113 Notes:
114
115 For add_ssaaaa the two high and two low addends can both commute, but
116 unfortunately gcc only supports one "%" commutative in each asm block.
117 This has always been so but is only documented in recent versions
118 (eg. pre-release 3.3). Having two or more "%"s can cause an internal
119 compiler error in certain rare circumstances.
120
121 Apparently it was only the last "%" that was ever actually respected, so
122 the code has been updated to leave just that. Clearly there's a free
123 choice whether high or low should get it, if there's a reason to favour
124 one over the other. Also obviously when the constraints on the two
125 operands are identical there's no benefit to the reloader in any "%" at
126 all.
127
128 */
129
130/* The CPUs come in alphabetical order below.
131
132 Please add support for more CPUs here, or improve the current support
133 for the CPUs below! */
134
135
136/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
137 3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
138 Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
139 __builtin_ctzll.
140
141 These builtins are only used when we check what code comes out, on some
142 chips they're merely libgcc calls, where we will instead want an inline
143 in that case (either asm or generic C).
144
145 These builtins are better than an asm block of the same insn, since an
146 asm block doesn't give gcc any information about scheduling or resource
147 usage. We keep an asm block for use on prior versions of gcc though.
148
149 For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
150 it's not used (for count_leading_zeros) because it generally gives extra
151 code to ensure the result is 0 when the input is 0, which we don't need
152 or want. */
153
154#ifdef _LONG_LONG_LIMB
155#define count_leading_zeros_gcc_clz(count,x) \
156 do { \
157 ASSERT ((x) != 0); \
158 (count) = __builtin_clzll (x); \
159 } while (0)
160#else
161#define count_leading_zeros_gcc_clz(count,x) \
162 do { \
163 ASSERT ((x) != 0); \
164 (count) = __builtin_clzl (x); \
165 } while (0)
166#endif
167
168#ifdef _LONG_LONG_LIMB
169#define count_trailing_zeros_gcc_ctz(count,x) \
170 do { \
171 ASSERT ((x) != 0); \
172 (count) = __builtin_ctzll (x); \
173 } while (0)
174#else
175#define count_trailing_zeros_gcc_ctz(count,x) \
176 do { \
177 ASSERT ((x) != 0); \
178 (count) = __builtin_ctzl (x); \
179 } while (0)
180#endif
181
182
183/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
184 don't need to be under !NO_ASM */
185#if ! defined (NO_ASM)
186
187#if defined (__alpha) && W_TYPE_SIZE == 64
188/* Most alpha-based machines, except Cray systems. */
189#if defined (__GNUC__)
190#if __GMP_GNUC_PREREQ (3,3)
191#define umul_ppmm(ph, pl, m0, m1) \
192 do { \
193 UDItype __m0 = (m0), __m1 = (m1); \
194 (ph) = __builtin_alpha_umulh (__m0, __m1); \
195 (pl) = __m0 * __m1; \
196 } while (0)
197#else
198#define umul_ppmm(ph, pl, m0, m1) \
199 do { \
200 UDItype __m0 = (m0), __m1 = (m1); \
201 __asm__ ("umulh %r1,%2,%0" \
202 : "=r" (ph) \
203 : "%rJ" (__m0), "rI" (__m1)); \
204 (pl) = __m0 * __m1; \
205 } while (0)
206#endif
207#else /* ! __GNUC__ */
208#include <machine/builtins.h>
209#define umul_ppmm(ph, pl, m0, m1) \
210 do { \
211 UDItype __m0 = (m0), __m1 = (m1); \
212 (ph) = __UMULH (__m0, __m1); \
213 (pl) = __m0 * __m1; \
214 } while (0)
215#endif
216#ifndef LONGLONG_STANDALONE
217#define udiv_qrnnd(q, r, n1, n0, d) \
218 do { UWtype __di; \
219 __di = __MPN(invert_limb) (d); \
220 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
221 } while (0)
222#define UDIV_PREINV_ALWAYS 1
223#define UDIV_NEEDS_NORMALIZATION 1
224#endif /* LONGLONG_STANDALONE */
225
226/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
227 always goes into libgmp.so, even when not actually used. */
228#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
229
230#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
231#define count_leading_zeros(COUNT,X) \
232 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
233#define count_trailing_zeros(COUNT,X) \
234 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
235#endif /* clz/ctz using cix */
236
237#if ! defined (count_leading_zeros) \
238 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
239/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
240 "$31" is written explicitly in the asm, since an "r" constraint won't
241 select reg 31. There seems no need to worry about "r31" syntax for cray,
242 since gcc itself (pre-release 3.4) emits just $31 in various places. */
243#define ALPHA_CMPBGE_0(dst, src) \
244 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
245/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
246 them, locating the highest non-zero byte. A second __clz_tab lookup
247 counts the leading zero bits in that byte, giving the result. */
248#define count_leading_zeros(count, x) \
249 do { \
250 UWtype __clz__b, __clz__c, __clz__x = (x); \
251 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \
252 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
253 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \
254 __clz__x >>= __clz__b; \
255 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \
256 __clz__b = 65 - __clz__b; \
257 (count) = __clz__b - __clz__c; \
258 } while (0)
259#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
260#endif /* clz using cmpbge */
261
262#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
263#if HAVE_ATTRIBUTE_CONST
264long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
265#else
266long __MPN(count_leading_zeros) (UDItype);
267#endif
268#define count_leading_zeros(count, x) \
269 ((count) = __MPN(count_leading_zeros) (x))
270#endif /* clz using mpn */
271#endif /* __alpha */
272
273#if defined (__AVR) && W_TYPE_SIZE == 8
274#define umul_ppmm(ph, pl, m0, m1) \
275 do { \
276 unsigned short __p = (unsigned short) (m0) * (m1); \
277 (ph) = __p >> 8; \
278 (pl) = __p; \
279 } while (0)
280#endif /* AVR */
281
282#if defined (_CRAY) && W_TYPE_SIZE == 64
283#include <intrinsics.h>
284#define UDIV_PREINV_ALWAYS 1
285#define UDIV_NEEDS_NORMALIZATION 1
286long __MPN(count_leading_zeros) (UDItype);
287#define count_leading_zeros(count, x) \
288 ((count) = _leadz ((UWtype) (x)))
289#if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */
290#define umul_ppmm(ph, pl, m0, m1) \
291 do { \
292 UDItype __m0 = (m0), __m1 = (m1); \
293 (ph) = _int_mult_upper (__m0, __m1); \
294 (pl) = __m0 * __m1; \
295 } while (0)
296#ifndef LONGLONG_STANDALONE
297#define udiv_qrnnd(q, r, n1, n0, d) \
298 do { UWtype __di; \
299 __di = __MPN(invert_limb) (d); \
300 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
301 } while (0)
302#endif /* LONGLONG_STANDALONE */
303#endif /* _CRAYIEEE */
304#endif /* _CRAY */
305
306#if defined (__ia64) && W_TYPE_SIZE == 64
307/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
308 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic
309 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
310 register, which takes an extra cycle. */
311#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
312 do { \
313 UWtype __x; \
314 __x = (al) - (bl); \
315 if ((al) < (bl)) \
316 (sh) = (ah) - (bh) - 1; \
317 else \
318 (sh) = (ah) - (bh); \
319 (sl) = __x; \
320 } while (0)
321#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
322/* Do both product parts in assembly, since that gives better code with
323 all gcc versions. Some callers will just use the upper part, and in
324 that situation we waste an instruction, but not any cycles. */
325#define umul_ppmm(ph, pl, m0, m1) \
326 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
327 : "=&f" (ph), "=f" (pl) \
328 : "f" (m0), "f" (m1))
329#define count_leading_zeros(count, x) \
330 do { \
331 UWtype _x = (x), _y, _a, _c; \
332 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
333 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
334 _c = (_a - 1) << 3; \
335 _x >>= _c; \
336 if (_x >= 1 << 4) \
337 _x >>= 4, _c += 4; \
338 if (_x >= 1 << 2) \
339 _x >>= 2, _c += 2; \
340 _c += _x >> 1; \
341 (count) = W_TYPE_SIZE - 1 - _c; \
342 } while (0)
343/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
344 based, and we don't need a special case for x==0 here */
345#define count_trailing_zeros(count, x) \
346 do { \
347 UWtype __ctz_x = (x); \
348 __asm__ ("popcnt %0 = %1" \
349 : "=r" (count) \
350 : "r" ((__ctz_x-1) & ~__ctz_x)); \
351 } while (0)
352#endif
353#if defined (__INTEL_COMPILER)
354#include <ia64intrin.h>
355#define umul_ppmm(ph, pl, m0, m1) \
356 do { \
357 UWtype __m0 = (m0), __m1 = (m1); \
358 ph = _m64_xmahu (__m0, __m1, 0); \
359 pl = __m0 * __m1; \
360 } while (0)
361#endif
362#ifndef LONGLONG_STANDALONE
363#define udiv_qrnnd(q, r, n1, n0, d) \
364 do { UWtype __di; \
365 __di = __MPN(invert_limb) (d); \
366 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
367 } while (0)
368#define UDIV_PREINV_ALWAYS 1
369#define UDIV_NEEDS_NORMALIZATION 1
370#endif
371#endif
372
373
374#if defined (__GNUC__)
375
376/* We sometimes need to clobber "cc" with gcc2, but that would not be
377 understood by gcc1. Use cpp to avoid major code duplication. */
378#if __GNUC__ < 2
379#define __CLOBBER_CC
380#define __AND_CLOBBER_CC
381#else /* __GNUC__ >= 2 */
382#define __CLOBBER_CC : "cc"
383#define __AND_CLOBBER_CC , "cc"
384#endif /* __GNUC__ < 2 */
385
386#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
387#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
388 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
389 : "=r" (sh), "=&r" (sl) \
390 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
391#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
392 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
393 : "=r" (sh), "=&r" (sl) \
394 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
395#define umul_ppmm(xh, xl, m0, m1) \
396 do { \
397 USItype __m0 = (m0), __m1 = (m1); \
398 __asm__ ("multiplu %0,%1,%2" \
399 : "=r" (xl) \
400 : "r" (__m0), "r" (__m1)); \
401 __asm__ ("multmu %0,%1,%2" \
402 : "=r" (xh) \
403 : "r" (__m0), "r" (__m1)); \
404 } while (0)
405#define udiv_qrnnd(q, r, n1, n0, d) \
406 __asm__ ("dividu %0,%3,%4" \
407 : "=r" (q), "=q" (r) \
408 : "1" (n1), "r" (n0), "r" (d))
409#define count_leading_zeros(count, x) \
410 __asm__ ("clz %0,%1" \
411 : "=r" (count) \
412 : "r" (x))
413#define COUNT_LEADING_ZEROS_0 32
414#endif /* __a29k__ */
415
416#if defined (__arc__)
417#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
418 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
419 : "=r" (sh), \
420 "=&r" (sl) \
421 : "r" ((USItype) (ah)), \
422 "rICal" ((USItype) (bh)), \
423 "%r" ((USItype) (al)), \
424 "rICal" ((USItype) (bl)))
425#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
426 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
427 : "=r" (sh), \
428 "=&r" (sl) \
429 : "r" ((USItype) (ah)), \
430 "rICal" ((USItype) (bh)), \
431 "r" ((USItype) (al)), \
432 "rICal" ((USItype) (bl)))
433#endif
434
435#if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
436 && W_TYPE_SIZE == 32
437#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
438 do { \
439 if (__builtin_constant_p (bl) && -(USItype)(bl) < 0x100) \
440 __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3" \
441 : "=r" (sh), "=&r" (sl) \
442 : "r" (ah), "rI" (bh), \
443 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC); \
444 else \
445 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
446 : "=r" (sh), "=&r" (sl) \
447 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC); \
448 } while (0)
449/* FIXME: Extend the immediate range for the low word by using both ADDS and
450 SUBS, since they set carry in the same way. Note: We need separate
451 definitions for thumb and non-thumb to to th absense of RSC under thumb. */
452#if defined (__thumb__)
453#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
454 do { \
455 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \
456 && (ah) == (bh)) \
457 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \
458 : "=r" (sh), "=r" (sl) \
459 : "r" (al), "rI" (bl) __CLOBBER_CC); \
460 else if (__builtin_constant_p (al)) \
461 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
462 : "=r" (sh), "=&r" (sl) \
463 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
464 else if (__builtin_constant_p (bl)) \
465 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
466 : "=r" (sh), "=&r" (sl) \
467 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
468 else \
469 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
470 : "=r" (sh), "=&r" (sl) \
471 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
472 } while (0)
473#else
474#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
475 do { \
476 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \
477 && (ah) == (bh)) \
478 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \
479 : "=r" (sh), "=r" (sl) \
480 : "r" (al), "rI" (bl) __CLOBBER_CC); \
481 else if (__builtin_constant_p (al)) \
482 { \
483 if (__builtin_constant_p (ah)) \
484 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
485 : "=r" (sh), "=&r" (sl) \
486 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
487 else \
488 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
489 : "=r" (sh), "=&r" (sl) \
490 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
491 } \
492 else if (__builtin_constant_p (ah)) \
493 { \
494 if (__builtin_constant_p (bl)) \
495 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
496 : "=r" (sh), "=&r" (sl) \
497 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
498 else \
499 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
500 : "=r" (sh), "=&r" (sl) \
501 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
502 } \
503 else if (__builtin_constant_p (bl)) \
504 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
505 : "=r" (sh), "=&r" (sl) \
506 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
507 else /* only bh might be a constant */ \
508 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
509 : "=r" (sh), "=&r" (sl) \
510 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
511 } while (0)
512#endif
513#if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
514 || defined (__ARM_ARCH_3__)
515#define umul_ppmm(xh, xl, a, b) \
516 do { \
517 register USItype __t0, __t1, __t2; \
518 __asm__ ("%@ Inlined umul_ppmm\n" \
519 " mov %2, %5, lsr #16\n" \
520 " mov %0, %6, lsr #16\n" \
521 " bic %3, %5, %2, lsl #16\n" \
522 " bic %4, %6, %0, lsl #16\n" \
523 " mul %1, %3, %4\n" \
524 " mul %4, %2, %4\n" \
525 " mul %3, %0, %3\n" \
526 " mul %0, %2, %0\n" \
527 " adds %3, %4, %3\n" \
528 " addcs %0, %0, #65536\n" \
529 " adds %1, %1, %3, lsl #16\n" \
530 " adc %0, %0, %3, lsr #16" \
531 : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)), \
532 "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \
533 : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC); \
534 } while (0)
535#ifndef LONGLONG_STANDALONE
536#define udiv_qrnnd(q, r, n1, n0, d) \
537 do { UWtype __r; \
538 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
539 (r) = __r; \
540 } while (0)
541extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
542#endif /* LONGLONG_STANDALONE */
543#else /* ARMv4 or newer */
544#define umul_ppmm(xh, xl, a, b) \
545 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
546#define smul_ppmm(xh, xl, a, b) \
547 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
548#ifndef LONGLONG_STANDALONE
549#define udiv_qrnnd(q, r, n1, n0, d) \
550 do { UWtype __di; \
551 __di = __MPN(invert_limb) (d); \
552 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
553 } while (0)
554#define UDIV_PREINV_ALWAYS 1
555#define UDIV_NEEDS_NORMALIZATION 1
556#endif /* LONGLONG_STANDALONE */
557#endif /* defined(__ARM_ARCH_2__) ... */
558#define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x)
559#define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x)
560#endif /* __arm__ */
561
562#if defined (__aarch64__) && W_TYPE_SIZE == 64
563/* FIXME: Extend the immediate range for the low word by using both
564 ADDS and SUBS, since they set carry in the same way. */
565#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
566 do { \
567 if (__builtin_constant_p (bl) && -(UDItype)(bl) < 0x1000) \
568 __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
569 : "=r" (sh), "=&r" (sl) \
570 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \
571 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\
572 else \
573 __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
574 : "=r" (sh), "=&r" (sl) \
575 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \
576 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\
577 } while (0)
578#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
579 do { \
580 if (__builtin_constant_p (bl) && -(UDItype)(bl) < 0x1000) \
581 __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
582 : "=r,r" (sh), "=&r,&r" (sl) \
583 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \
584 "r,Z" ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\
585 else \
586 __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
587 : "=r,r" (sh), "=&r,&r" (sl) \
588 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \
589 "r,Z" ((UDItype)(al)), "rI,r" ((UDItype)(bl)) __CLOBBER_CC);\
590 } while(0);
591#if __GMP_GNUC_PREREQ (4,9)
592#define umul_ppmm(w1, w0, u, v) \
593 do { \
594 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
595 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
596 w1 = __ll >> 64; \
597 w0 = __ll; \
598 } while (0)
599#endif
600#if !defined (umul_ppmm)
601#define umul_ppmm(ph, pl, m0, m1) \
602 do { \
603 UDItype __m0 = (m0), __m1 = (m1); \
604 __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \
605 (pl) = __m0 * __m1; \
606 } while (0)
607#endif
608#define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x)
609#define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x)
610#endif /* __aarch64__ */
611
612#if defined (__clipper__) && W_TYPE_SIZE == 32
613#define umul_ppmm(w1, w0, u, v) \
614 ({union {UDItype __ll; \
615 struct {USItype __l, __h;} __i; \
616 } __x; \
617 __asm__ ("mulwux %2,%0" \
618 : "=r" (__x.__ll) \
619 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
620 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
621#define smul_ppmm(w1, w0, u, v) \
622 ({union {DItype __ll; \
623 struct {SItype __l, __h;} __i; \
624 } __x; \
625 __asm__ ("mulwx %2,%0" \
626 : "=r" (__x.__ll) \
627 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
628 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
629#define __umulsidi3(u, v) \
630 ({UDItype __w; \
631 __asm__ ("mulwux %2,%0" \
632 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
633 __w; })
634#endif /* __clipper__ */
635
636/* Fujitsu vector computers. */
637#if defined (__uxp__) && W_TYPE_SIZE == 32
638#define umul_ppmm(ph, pl, u, v) \
639 do { \
640 union {UDItype __ll; \
641 struct {USItype __h, __l;} __i; \
642 } __x; \
643 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
644 (ph) = __x.__i.__h; \
645 (pl) = __x.__i.__l; \
646 } while (0)
647#define smul_ppmm(ph, pl, u, v) \
648 do { \
649 union {UDItype __ll; \
650 struct {USItype __h, __l;} __i; \
651 } __x; \
652 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
653 (ph) = __x.__i.__h; \
654 (pl) = __x.__i.__l; \
655 } while (0)
656#endif
657
658#if defined (__gmicro__) && W_TYPE_SIZE == 32
659#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
660 __asm__ ("add.w %5,%1\n\taddx %3,%0" \
661 : "=g" (sh), "=&g" (sl) \
662 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
663 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
664#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
665 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
666 : "=g" (sh), "=&g" (sl) \
667 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
668 "1" ((USItype)(al)), "g" ((USItype)(bl)))
669#define umul_ppmm(ph, pl, m0, m1) \
670 __asm__ ("mulx %3,%0,%1" \
671 : "=g" (ph), "=r" (pl) \
672 : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
673#define udiv_qrnnd(q, r, nh, nl, d) \
674 __asm__ ("divx %4,%0,%1" \
675 : "=g" (q), "=r" (r) \
676 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
677#define count_leading_zeros(count, x) \
678 __asm__ ("bsch/1 %1,%0" \
679 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
680#endif
681
682#if defined (__hppa) && W_TYPE_SIZE == 32
683#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
684 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
685 : "=r" (sh), "=&r" (sl) \
686 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
687#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
688 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
689 : "=r" (sh), "=&r" (sl) \
690 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
691#if defined (_PA_RISC1_1)
692#define umul_ppmm(wh, wl, u, v) \
693 do { \
694 union {UDItype __ll; \
695 struct {USItype __h, __l;} __i; \
696 } __x; \
697 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
698 (wh) = __x.__i.__h; \
699 (wl) = __x.__i.__l; \
700 } while (0)
701#endif
702#define count_leading_zeros(count, x) \
703 do { \
704 USItype __tmp; \
705 __asm__ ( \
706 "ldi 1,%0\n" \
707" extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
708" extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
709" ldo 16(%0),%0 ; Yes. Perform add.\n" \
710" extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
711" extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
712" ldo 8(%0),%0 ; Yes. Perform add.\n" \
713" extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
714" extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
715" ldo 4(%0),%0 ; Yes. Perform add.\n" \
716" extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
717" extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
718" ldo 2(%0),%0 ; Yes. Perform add.\n" \
719" extru %1,30,1,%1 ; Extract bit 1.\n" \
720" sub %0,%1,%0 ; Subtract it.\n" \
721 : "=r" (count), "=r" (__tmp) : "1" (x)); \
722 } while (0)
723#endif /* hppa */
724
725/* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC
726 (3.2) puts longlong into two adjacent 32-bit registers. Presumably this
727 is just a case of no direct support for 2.0n but treating it like 1.0. */
728#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
729#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
730 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
731 : "=r" (sh), "=&r" (sl) \
732 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
733#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
734 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
735 : "=r" (sh), "=&r" (sl) \
736 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
737#endif /* hppa */
738
739#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
740#if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
741#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
742 do { \
743/* if (__builtin_constant_p (bl)) \
744 __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \
745 : "=r" (sh), "=&r" (sl) \
746 : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
747 else \
748*/ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \
749 : "=r" (sh), "=&r" (sl) \
750 : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
751 } while (0)
752#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
753 do { \
754/* if (__builtin_constant_p (bl)) \
755 __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \
756 : "=r" (sh), "=&r" (sl) \
757 : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \
758 else \
759*/ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \
760 : "=r" (sh), "=&r" (sl) \
761 : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \
762 } while (0)
763#if __GMP_GNUC_PREREQ (4,5)
764#define umul_ppmm(xh, xl, m0, m1) \
765 do { \
766 union {UDItype __ll; \
767 struct {USItype __h, __l;} __i; \
768 } __x; \
769 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \
770 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
771 } while (0)
772#else
773#if 0
774/* FIXME: this fails if gcc knows about the 64-bit registers. Use only
775 with a new enough processor pretending we have 32-bit registers. */
776#define umul_ppmm(xh, xl, m0, m1) \
777 do { \
778 union {UDItype __ll; \
779 struct {USItype __h, __l;} __i; \
780 } __x; \
781 __asm__ ("mlr\t%0,%2" \
782 : "=r" (__x.__ll) \
783 : "%0" (m0), "r" (m1)); \
784 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
785 } while (0)
786#else
787#define umul_ppmm(xh, xl, m0, m1) \
788 do { \
789 /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
790 DImode for the product, since that would be allocated to a single 64-bit
791 register, whereas mlr uses the low 32-bits of an even-odd register pair.
792 */ \
793 register USItype __r0 __asm__ ("0"); \
794 register USItype __r1 __asm__ ("1") = (m0); \
795 __asm__ ("mlr\t%0,%3" \
796 : "=r" (__r0), "=r" (__r1) \
797 : "r" (__r1), "r" (m1)); \
798 (xh) = __r0; (xl) = __r1; \
799 } while (0)
800#endif /* if 0 */
801#endif
802#if 0
803/* FIXME: this fails if gcc knows about the 64-bit registers. Use only
804 with a new enough processor pretending we have 32-bit registers. */
805#define udiv_qrnnd(q, r, n1, n0, d) \
806 do { \
807 union {UDItype __ll; \
808 struct {USItype __h, __l;} __i; \
809 } __x; \
810 __x.__i.__h = n1; __x.__i.__l = n0; \
811 __asm__ ("dlr\t%0,%2" \
812 : "=r" (__x.__ll) \
813 : "0" (__x.__ll), "r" (d)); \
814 (q) = __x.__i.__l; (r) = __x.__i.__h; \
815 } while (0)
816#else
817#define udiv_qrnnd(q, r, n1, n0, d) \
818 do { \
819 register USItype __r0 __asm__ ("0") = (n1); \
820 register USItype __r1 __asm__ ("1") = (n0); \
821 __asm__ ("dlr\t%0,%4" \
822 : "=r" (__r0), "=r" (__r1) \
823 : "r" (__r0), "r" (__r1), "r" (d)); \
824 (q) = __r1; (r) = __r0; \
825 } while (0)
826#endif /* if 0 */
827#else /* if __zarch__ */
828/* FIXME: this fails if gcc knows about the 64-bit registers. */
829#define smul_ppmm(xh, xl, m0, m1) \
830 do { \
831 union {DItype __ll; \
832 struct {USItype __h, __l;} __i; \
833 } __x; \
834 __asm__ ("mr\t%0,%2" \
835 : "=r" (__x.__ll) \
836 : "%0" (m0), "r" (m1)); \
837 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
838 } while (0)
839/* FIXME: this fails if gcc knows about the 64-bit registers. */
840#define sdiv_qrnnd(q, r, n1, n0, d) \
841 do { \
842 union {DItype __ll; \
843 struct {USItype __h, __l;} __i; \
844 } __x; \
845 __x.__i.__h = n1; __x.__i.__l = n0; \
846 __asm__ ("dr\t%0,%2" \
847 : "=r" (__x.__ll) \
848 : "0" (__x.__ll), "r" (d)); \
849 (q) = __x.__i.__l; (r) = __x.__i.__h; \
850 } while (0)
851#endif /* if __zarch__ */
852#endif
853
854#if defined (__s390x__) && W_TYPE_SIZE == 64
855/* We need to cast operands with register constraints, otherwise their types
856 will be assumed to be SImode by gcc. For these machines, such operations
857 will insert a value into the low 32 bits, and leave the high 32 bits with
858 garbage. */
859#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
860 do { \
861 __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \
862 : "=r" (sh), "=&r" (sl) \
863 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
864 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
865 } while (0)
866#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
867 do { \
868 __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \
869 : "=r" (sh), "=&r" (sl) \
870 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
871 "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
872 } while (0)
873#define umul_ppmm(xh, xl, m0, m1) \
874 do { \
875 union {unsigned int __attribute__ ((mode(TI))) __ll; \
876 struct {UDItype __h, __l;} __i; \
877 } __x; \
878 __asm__ ("mlgr\t%0,%2" \
879 : "=r" (__x.__ll) \
880 : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \
881 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
882 } while (0)
883#define udiv_qrnnd(q, r, n1, n0, d) \
884 do { \
885 union {unsigned int __attribute__ ((mode(TI))) __ll; \
886 struct {UDItype __h, __l;} __i; \
887 } __x; \
888 __x.__i.__h = n1; __x.__i.__l = n0; \
889 __asm__ ("dlgr\t%0,%2" \
890 : "=r" (__x.__ll) \
891 : "0" (__x.__ll), "r" ((UDItype)(d))); \
892 (q) = __x.__i.__l; (r) = __x.__i.__h; \
893 } while (0)
894#if 0 /* FIXME: Enable for z10 (?) */
895#define count_leading_zeros(cnt, x) \
896 do { \
897 union {unsigned int __attribute__ ((mode(TI))) __ll; \
898 struct {UDItype __h, __l;} __i; \
899 } __clr_cnt; \
900 __asm__ ("flogr\t%0,%1" \
901 : "=r" (__clr_cnt.__ll) \
902 : "r" (x) __CLOBBER_CC); \
903 (cnt) = __clr_cnt.__i.__h; \
904 } while (0)
905#endif
906#endif
907
908/* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
909 so we don't need __CLOBBER_CC. */
910#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
911#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
912 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
913 : "=r" (sh), "=&r" (sl) \
914 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
915 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
916#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
917 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
918 : "=r" (sh), "=&r" (sl) \
919 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
920 "1" ((USItype)(al)), "g" ((USItype)(bl)))
921#define umul_ppmm(w1, w0, u, v) \
922 __asm__ ("mull %3" \
923 : "=a" (w0), "=d" (w1) \
924 : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
925#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
926 __asm__ ("divl %4" /* stringification in K&R C */ \
927 : "=a" (q), "=d" (r) \
928 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
929
930#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
931/* Pentium bsrl takes between 10 and 72 cycles depending where the most
932 significant 1 bit is, hence the use of the following alternatives. bsfl
933 is slow too, between 18 and 42 depending where the least significant 1
934 bit is, so let the generic count_trailing_zeros below make use of the
935 count_leading_zeros here too. */
936
937#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
938/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
939 cache miss reading from __clz_tab. For P55 it's favoured over the float
940 below so as to avoid mixing MMX and x87, since the penalty for switching
941 between the two is about 100 cycles.
942
943 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
944 16, -1 for 8, or 0 otherwise. This could be written equivalently as
945 follows, but as of gcc 2.95.2 it results in conditional jumps.
946
947 __shift = -(__n < 0x1000000);
948 __shift -= (__n < 0x10000);
949 __shift -= (__n < 0x100);
950
951 The middle two sbbl and cmpl's pair, and with luck something gcc
952 generates might pair with the first cmpl and the last sbbl. The "32+1"
953 constant could be folded into __clz_tab[], but it doesn't seem worth
954 making a different table just for that. */
955
956#define count_leading_zeros(c,n) \
957 do { \
958 USItype __n = (n); \
959 USItype __shift; \
960 __asm__ ("cmpl $0x1000000, %1\n" \
961 "sbbl %0, %0\n" \
962 "cmpl $0x10000, %1\n" \
963 "sbbl $0, %0\n" \
964 "cmpl $0x100, %1\n" \
965 "sbbl $0, %0\n" \
966 : "=&r" (__shift) : "r" (__n)); \
967 __shift = __shift*8 + 24 + 1; \
968 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
969 } while (0)
970#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
971#define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
972
973#else /* ! pentiummmx || LONGLONG_STANDALONE */
974/* The following should be a fixed 14 cycles or so. Some scheduling
975 opportunities should be available between the float load/store too. This
976 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
977 apparently suggested by the Intel optimizing manual (don't know exactly
978 where). gcc 2.95 or up will be best for this, so the "double" is
979 correctly aligned on the stack. */
980#define count_leading_zeros(c,n) \
981 do { \
982 union { \
983 double d; \
984 unsigned a[2]; \
985 } __u; \
986 __u.d = (UWtype) (n); \
987 (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
988 } while (0)
989#define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
990#endif /* pentiummx */
991
992#else /* ! pentium */
993
994#if __GMP_GNUC_PREREQ (3,4) /* using bsrl */
995#define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x)
996#endif /* gcc clz */
997
998/* On P6, gcc prior to 3.0 generates a partial register stall for
999 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
1000 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the
1001 cost of one extra instruction. Do this for "i386" too, since that means
1002 generic x86. */
1003#if ! defined (count_leading_zeros) && __GNUC__ < 3 \
1004 && (HAVE_HOST_CPU_i386 \
1005 || HAVE_HOST_CPU_i686 \
1006 || HAVE_HOST_CPU_pentiumpro \
1007 || HAVE_HOST_CPU_pentium2 \
1008 || HAVE_HOST_CPU_pentium3)
1009#define count_leading_zeros(count, x) \
1010 do { \
1011 USItype __cbtmp; \
1012 ASSERT ((x) != 0); \
1013 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
1014 (count) = 31 - __cbtmp; \
1015 } while (0)
1016#endif /* gcc<3 asm bsrl */
1017
1018#ifndef count_leading_zeros
1019#define count_leading_zeros(count, x) \
1020 do { \
1021 USItype __cbtmp; \
1022 ASSERT ((x) != 0); \
1023 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
1024 (count) = __cbtmp ^ 31; \
1025 } while (0)
1026#endif /* asm bsrl */
1027
1028#if __GMP_GNUC_PREREQ (3,4) /* using bsfl */
1029#define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x)
1030#endif /* gcc ctz */
1031
1032#ifndef count_trailing_zeros
1033#define count_trailing_zeros(count, x) \
1034 do { \
1035 ASSERT ((x) != 0); \
1036 __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \
1037 } while (0)
1038#endif /* asm bsfl */
1039
1040#endif /* ! pentium */
1041
1042#endif /* 80x86 */
1043
1044#if defined (__amd64__) && W_TYPE_SIZE == 64
1045#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1046 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
1047 : "=r" (sh), "=&r" (sl) \
1048 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1049 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1050#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1051 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
1052 : "=r" (sh), "=&r" (sl) \
1053 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1054 "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1055#if X86_ASM_MULX \
1056 && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \
1057 || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen)
1058#define umul_ppmm(w1, w0, u, v) \
1059 __asm__ ("mulx\t%3, %0, %1" \
1060 : "=r" (w0), "=r" (w1) \
1061 : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
1062#else
1063#define umul_ppmm(w1, w0, u, v) \
1064 __asm__ ("mulq\t%3" \
1065 : "=a" (w0), "=d" (w1) \
1066 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1067#endif
1068#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1069 __asm__ ("divq %4" /* stringification in K&R C */ \
1070 : "=a" (q), "=d" (r) \
1071 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1072
1073#if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
1074 || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2 \
1075 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen \
1076 || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
1077#define count_leading_zeros(count, x) \
1078 do { \
1079 /* This is lzcnt, spelled for older assemblers. Destination and */ \
1080 /* source must be a 64-bit registers, hence cast and %q. */ \
1081 __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1082 } while (0)
1083#define COUNT_LEADING_ZEROS_0 64
1084#else
1085#define count_leading_zeros(count, x) \
1086 do { \
1087 UDItype __cbtmp; \
1088 ASSERT ((x) != 0); \
1089 __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
1090 (count) = __cbtmp ^ 63; \
1091 } while (0)
1092#endif
1093
1094#if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
1095 || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
1096#define count_trailing_zeros(count, x) \
1097 do { \
1098 /* This is tzcnt, spelled for older assemblers. Destination and */ \
1099 /* source must be a 64-bit registers, hence cast and %q. */ \
1100 __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1101 } while (0)
1102#define COUNT_TRAILING_ZEROS_0 64
1103#else
1104#define count_trailing_zeros(count, x) \
1105 do { \
1106 ASSERT ((x) != 0); \
1107 __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1108 } while (0)
1109#endif
1110#endif /* __amd64__ */
1111
1112#if defined (__i860__) && W_TYPE_SIZE == 32
1113#define rshift_rhlc(r,h,l,c) \
1114 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
1115 "=r" (r) : "r" (h), "r" (l), "rn" (c))
1116#endif /* i860 */
1117
1118#if defined (__i960__) && W_TYPE_SIZE == 32
1119#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1120 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
1121 : "=r" (sh), "=&r" (sl) \
1122 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1123#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1124 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
1125 : "=r" (sh), "=&r" (sl) \
1126 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1127#define umul_ppmm(w1, w0, u, v) \
1128 ({union {UDItype __ll; \
1129 struct {USItype __l, __h;} __i; \
1130 } __x; \
1131 __asm__ ("emul %2,%1,%0" \
1132 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
1133 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1134#define __umulsidi3(u, v) \
1135 ({UDItype __w; \
1136 __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
1137 __w; })
1138#define udiv_qrnnd(q, r, nh, nl, d) \
1139 do { \
1140 union {UDItype __ll; \
1141 struct {USItype __l, __h;} __i; \
1142 } __nn; \
1143 __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
1144 __asm__ ("ediv %d,%n,%0" \
1145 : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
1146 (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
1147 } while (0)
1148#define count_leading_zeros(count, x) \
1149 do { \
1150 USItype __cbtmp; \
1151 __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
1152 (count) = __cbtmp ^ 31; \
1153 } while (0)
1154#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1155#if defined (__i960mx) /* what is the proper symbol to test??? */
1156#define rshift_rhlc(r,h,l,c) \
1157 do { \
1158 union {UDItype __ll; \
1159 struct {USItype __l, __h;} __i; \
1160 } __nn; \
1161 __nn.__i.__h = (h); __nn.__i.__l = (l); \
1162 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
1163 }
1164#endif /* i960mx */
1165#endif /* i960 */
1166
1167#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1168 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1169 || defined (__mc5307__)) && W_TYPE_SIZE == 32
1170#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1171 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
1172 : "=d" (sh), "=&d" (sl) \
1173 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1174 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1175#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1176 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
1177 : "=d" (sh), "=&d" (sl) \
1178 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1179 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1180/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */
1181#if defined (__mc68020__) || defined(mc68020) \
1182 || defined (__mc68030__) || defined (mc68030) \
1183 || defined (__mc68040__) || defined (mc68040) \
1184 || defined (__mcpu32__) || defined (mcpu32) \
1185 || defined (__NeXT__)
1186#define umul_ppmm(w1, w0, u, v) \
1187 __asm__ ("mulu%.l %3,%1:%0" \
1188 : "=d" (w0), "=d" (w1) \
1189 : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1190#define udiv_qrnnd(q, r, n1, n0, d) \
1191 __asm__ ("divu%.l %4,%1:%0" \
1192 : "=d" (q), "=d" (r) \
1193 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1194#define sdiv_qrnnd(q, r, n1, n0, d) \
1195 __asm__ ("divs%.l %4,%1:%0" \
1196 : "=d" (q), "=d" (r) \
1197 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1198#else /* for other 68k family members use 16x16->32 multiplication */
1199#define umul_ppmm(xh, xl, a, b) \
1200 do { USItype __umul_tmp1, __umul_tmp2; \
1201 __asm__ ("| Inlined umul_ppmm\n" \
1202" move%.l %5,%3\n" \
1203" move%.l %2,%0\n" \
1204" move%.w %3,%1\n" \
1205" swap %3\n" \
1206" swap %0\n" \
1207" mulu%.w %2,%1\n" \
1208" mulu%.w %3,%0\n" \
1209" mulu%.w %2,%3\n" \
1210" swap %2\n" \
1211" mulu%.w %5,%2\n" \
1212" add%.l %3,%2\n" \
1213" jcc 1f\n" \
1214" add%.l %#0x10000,%0\n" \
1215"1: move%.l %2,%3\n" \
1216" clr%.w %2\n" \
1217" swap %2\n" \
1218" swap %3\n" \
1219" clr%.w %3\n" \
1220" add%.l %3,%1\n" \
1221" addx%.l %2,%0\n" \
1222" | End inlined umul_ppmm" \
1223 : "=&d" (xh), "=&d" (xl), \
1224 "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
1225 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
1226 } while (0)
1227#endif /* not mc68020 */
1228/* The '020, '030, '040 and '060 have bitfield insns.
1229 GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
1230 exclude bfffo on that chip (bitfield insns not available). */
1231#if (defined (__mc68020__) || defined (mc68020) \
1232 || defined (__mc68030__) || defined (mc68030) \
1233 || defined (__mc68040__) || defined (mc68040) \
1234 || defined (__mc68060__) || defined (mc68060) \
1235 || defined (__NeXT__)) \
1236 && ! defined (__mcpu32__)
1237#define count_leading_zeros(count, x) \
1238 __asm__ ("bfffo %1{%b2:%b2},%0" \
1239 : "=d" (count) \
1240 : "od" ((USItype) (x)), "n" (0))
1241#define COUNT_LEADING_ZEROS_0 32
1242#endif
1243#endif /* mc68000 */
1244
1245#if defined (__m88000__) && W_TYPE_SIZE == 32
1246#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1247 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
1248 : "=r" (sh), "=&r" (sl) \
1249 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1250#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1251 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
1252 : "=r" (sh), "=&r" (sl) \
1253 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1254#define count_leading_zeros(count, x) \
1255 do { \
1256 USItype __cbtmp; \
1257 __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
1258 (count) = __cbtmp ^ 31; \
1259 } while (0)
1260#define COUNT_LEADING_ZEROS_0 63 /* sic */
1261#if defined (__m88110__)
1262#define umul_ppmm(wh, wl, u, v) \
1263 do { \
1264 union {UDItype __ll; \
1265 struct {USItype __h, __l;} __i; \
1266 } __x; \
1267 __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
1268 (wh) = __x.__i.__h; \
1269 (wl) = __x.__i.__l; \
1270 } while (0)
1271#define udiv_qrnnd(q, r, n1, n0, d) \
1272 ({union {UDItype __ll; \
1273 struct {USItype __h, __l;} __i; \
1274 } __x, __q; \
1275 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1276 __asm__ ("divu.d %0,%1,%2" \
1277 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
1278 (r) = (n0) - __q.__l * (d); (q) = __q.__l; })
1279#endif /* __m88110__ */
1280#endif /* __m88000__ */
1281
1282#if defined (__mips) && W_TYPE_SIZE == 32
1283#if __GMP_GNUC_PREREQ (4,4)
1284#define umul_ppmm(w1, w0, u, v) \
1285 do { \
1286 UDItype __ll = (UDItype)(u) * (v); \
1287 w1 = __ll >> 32; \
1288 w0 = __ll; \
1289 } while (0)
1290#endif
1291#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1292#define umul_ppmm(w1, w0, u, v) \
1293 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1294#endif
1295#if !defined (umul_ppmm)
1296#define umul_ppmm(w1, w0, u, v) \
1297 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
1298 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1299#endif
1300#endif /* __mips */
1301
1302#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1303#if defined (_MIPS_ARCH_MIPS64R6)
1304#define umul_ppmm(w1, w0, u, v) \
1305 do { \
1306 UDItype __m0 = (u), __m1 = (v); \
1307 (w0) = __m0 * __m1; \
1308 __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1)); \
1309 } while (0)
1310#endif
1311#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (4,4)
1312#define umul_ppmm(w1, w0, u, v) \
1313 do { \
1314 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1315 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1316 w1 = __ll >> 64; \
1317 w0 = __ll; \
1318 } while (0)
1319#endif
1320#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1321#define umul_ppmm(w1, w0, u, v) \
1322 __asm__ ("dmultu %2,%3" \
1323 : "=l" (w0), "=h" (w1) \
1324 : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1325#endif
1326#if !defined (umul_ppmm)
1327#define umul_ppmm(w1, w0, u, v) \
1328 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
1329 : "=d" (w0), "=d" (w1) \
1330 : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1331#endif
1332#endif /* __mips */
1333
1334#if defined (__mmix__) && W_TYPE_SIZE == 64
1335#define umul_ppmm(w1, w0, u, v) \
1336 __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1337#endif
1338
1339#if defined (__ns32000__) && W_TYPE_SIZE == 32
1340#define umul_ppmm(w1, w0, u, v) \
1341 ({union {UDItype __ll; \
1342 struct {USItype __l, __h;} __i; \
1343 } __x; \
1344 __asm__ ("meid %2,%0" \
1345 : "=g" (__x.__ll) \
1346 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1347 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1348#define __umulsidi3(u, v) \
1349 ({UDItype __w; \
1350 __asm__ ("meid %2,%0" \
1351 : "=g" (__w) \
1352 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1353 __w; })
1354#define udiv_qrnnd(q, r, n1, n0, d) \
1355 ({union {UDItype __ll; \
1356 struct {USItype __l, __h;} __i; \
1357 } __x; \
1358 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1359 __asm__ ("deid %2,%0" \
1360 : "=g" (__x.__ll) \
1361 : "0" (__x.__ll), "g" ((USItype)(d))); \
1362 (r) = __x.__i.__l; (q) = __x.__i.__h; })
1363#define count_trailing_zeros(count,x) \
1364 do { \
1365 __asm__ ("ffsd %2,%0" \
1366 : "=r" (count) \
1367 : "0" ((USItype) 0), "r" ((USItype) (x))); \
1368 } while (0)
1369#endif /* __ns32000__ */
1370
1371/* In the past we had a block of various #defines tested
1372 _ARCH_PPC - AIX
1373 _ARCH_PWR - AIX
1374 __powerpc__ - gcc
1375 __POWERPC__ - BEOS
1376 __ppc__ - Darwin
1377 PPC - old gcc, GNU/Linux, SysV
1378 The plain PPC test was not good for vxWorks, since PPC is defined on all
1379 CPUs there (eg. m68k too), as a constant one is expected to compare
1380 CPU_FAMILY against.
1381
1382 At any rate, this was pretty unattractive and a bit fragile. The use of
1383 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1384 getting the desired effect.
1385
1386 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1387 the system vendor compilers. (Is that vendor compilers with inline asm,
1388 or what?) */
1389
1390#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
1391 && W_TYPE_SIZE == 32
1392#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1393 do { \
1394 if (__builtin_constant_p (bh) && (bh) == 0) \
1395 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
1396 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
1397 __CLOBBER_CC); \
1398 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1399 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
1400 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
1401 __CLOBBER_CC); \
1402 else \
1403 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
1404 : "=r" (sh), "=&r" (sl) \
1405 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl) \
1406 __CLOBBER_CC); \
1407 } while (0)
1408#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1409 do { \
1410 if (__builtin_constant_p (ah) && (ah) == 0) \
1411 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
1412 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \
1413 __CLOBBER_CC); \
1414 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
1415 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
1416 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \
1417 __CLOBBER_CC); \
1418 else if (__builtin_constant_p (bh) && (bh) == 0) \
1419 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
1420 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \
1421 __CLOBBER_CC); \
1422 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1423 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
1424 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \
1425 __CLOBBER_CC); \
1426 else \
1427 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
1428 : "=r" (sh), "=&r" (sl) \
1429 : "r" (ah), "r" (bh), "rI" (al), "r" (bl) \
1430 __CLOBBER_CC); \
1431 } while (0)
1432#define count_leading_zeros(count, x) \
1433 __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1434#define COUNT_LEADING_ZEROS_0 32
1435#if HAVE_HOST_CPU_FAMILY_powerpc
1436#if __GMP_GNUC_PREREQ (4,4)
1437#define umul_ppmm(w1, w0, u, v) \
1438 do { \
1439 UDItype __ll = (UDItype)(u) * (v); \
1440 w1 = __ll >> 32; \
1441 w0 = __ll; \
1442 } while (0)
1443#endif
1444#if !defined (umul_ppmm)
1445#define umul_ppmm(ph, pl, m0, m1) \
1446 do { \
1447 USItype __m0 = (m0), __m1 = (m1); \
1448 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1449 (pl) = __m0 * __m1; \
1450 } while (0)
1451#endif
1452#define smul_ppmm(ph, pl, m0, m1) \
1453 do { \
1454 SItype __m0 = (m0), __m1 = (m1); \
1455 __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1456 (pl) = __m0 * __m1; \
1457 } while (0)
1458#else
1459#define smul_ppmm(xh, xl, m0, m1) \
1460 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1461#define sdiv_qrnnd(q, r, nh, nl, d) \
1462 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1463#endif
1464#endif /* 32-bit POWER architecture variants. */
1465
1466/* We should test _IBMR2 here when we add assembly support for the system
1467 vendor compilers. */
1468#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1469#if !defined (_LONG_LONG_LIMB)
1470/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So
1471 use adde etc only when not _LONG_LONG_LIMB. */
1472#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1473 do { \
1474 if (__builtin_constant_p (bh) && (bh) == 0) \
1475 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
1476 : "=r" (sh), "=&r" (sl) \
1477 : "r" ((UDItype)(ah)), \
1478 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
1479 __CLOBBER_CC); \
1480 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1481 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
1482 : "=r" (sh), "=&r" (sl) \
1483 : "r" ((UDItype)(ah)), \
1484 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
1485 __CLOBBER_CC); \
1486 else \
1487 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
1488 : "=r" (sh), "=&r" (sl) \
1489 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1490 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
1491 __CLOBBER_CC); \
1492 } while (0)
1493/* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1494 This might seem strange, but gcc folds away the dead code late. */
1495#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1496 do { \
1497 if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) { \
1498 if (__builtin_constant_p (ah) && (ah) == 0) \
1499 __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2" \
1500 : "=r" (sh), "=&r" (sl) \
1501 : "r" ((UDItype)(bh)), \
1502 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1503 __CLOBBER_CC); \
1504 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1505 __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2" \
1506 : "=r" (sh), "=&r" (sl) \
1507 : "r" ((UDItype)(bh)), \
1508 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1509 __CLOBBER_CC); \
1510 else if (__builtin_constant_p (bh) && (bh) == 0) \
1511 __asm__ ("addic %1,%3,%4\n\taddme %0,%2" \
1512 : "=r" (sh), "=&r" (sl) \
1513 : "r" ((UDItype)(ah)), \
1514 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1515 __CLOBBER_CC); \
1516 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1517 __asm__ ("addic %1,%3,%4\n\taddze %0,%2" \
1518 : "=r" (sh), "=&r" (sl) \
1519 : "r" ((UDItype)(ah)), \
1520 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1521 __CLOBBER_CC); \
1522 else \
1523 __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2" \
1524 : "=r" (sh), "=&r" (sl) \
1525 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1526 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1527 __CLOBBER_CC); \
1528 } else { \
1529 if (__builtin_constant_p (ah) && (ah) == 0) \
1530 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
1531 : "=r" (sh), "=&r" (sl) \
1532 : "r" ((UDItype)(bh)), \
1533 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1534 __CLOBBER_CC); \
1535 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1536 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
1537 : "=r" (sh), "=&r" (sl) \
1538 : "r" ((UDItype)(bh)), \
1539 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1540 __CLOBBER_CC); \
1541 else if (__builtin_constant_p (bh) && (bh) == 0) \
1542 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
1543 : "=r" (sh), "=&r" (sl) \
1544 : "r" ((UDItype)(ah)), \
1545 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1546 __CLOBBER_CC); \
1547 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1548 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
1549 : "=r" (sh), "=&r" (sl) \
1550 : "r" ((UDItype)(ah)), \
1551 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1552 __CLOBBER_CC); \
1553 else \
1554 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
1555 : "=r" (sh), "=&r" (sl) \
1556 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1557 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1558 __CLOBBER_CC); \
1559 } \
1560 } while (0)
1561#endif /* ! _LONG_LONG_LIMB */
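/* For reference, the special cases in add_ssaaaa above come from
   single-instruction identities for the high word once the low-word carry is
   in CA:

     bh == 0:              sh = ah + 0  + CA               -> addze
     bh == ~(UDItype) 0:   sh = ah + ~0 + CA = ah - 1 + CA -> addme

   so when __builtin_constant_p shows that bh has one of these values, the
   three-operand adde is unnecessary.  The sub_ddmmss cases follow
   analogously, with CA holding the borrow.  */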
1562#define count_leading_zeros(count, x) \
1563 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1564#define COUNT_LEADING_ZEROS_0 64
1565#if __GMP_GNUC_PREREQ (4,8)
1566#define umul_ppmm(w1, w0, u, v) \
1567 do { \
1568 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1569 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1570 w1 = __ll >> 64; \
1571 w0 = __ll; \
1572 } while (0)
1573#endif
1574#if !defined (umul_ppmm)
1575#define umul_ppmm(ph, pl, m0, m1) \
1576 do { \
1577 UDItype __m0 = (m0), __m1 = (m1); \
1578 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1579 (pl) = __m0 * __m1; \
1580 } while (0)
1581#endif
1582#define smul_ppmm(ph, pl, m0, m1) \
1583 do { \
1584 DItype __m0 = (m0), __m1 = (m1); \
1585 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1586 (pl) = __m0 * __m1; \
1587 } while (0)
1588#endif /* 64-bit PowerPC. */
1589
1590#if defined (__pyr__) && W_TYPE_SIZE == 32
1591#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1592 __asm__ ("addw %5,%1\n\taddwc %3,%0" \
1593 : "=r" (sh), "=&r" (sl) \
1594 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1595 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1596#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1597 __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
1598 : "=r" (sh), "=&r" (sl) \
1599 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1600 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1601/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
1602#define umul_ppmm(w1, w0, u, v) \
1603 ({union {UDItype __ll; \
1604 struct {USItype __h, __l;} __i; \
1605 } __x; \
1606 __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
1607 : "=&r" (__x.__ll) \
1608 : "g" ((USItype) (u)), "g" ((USItype)(v))); \
1609 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1610#endif /* __pyr__ */
1611
1612#if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
1613#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1614 __asm__ ("a %1,%5\n\tae %0,%3" \
1615 : "=r" (sh), "=&r" (sl) \
1616 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1617 "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1618#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1619 __asm__ ("s %1,%5\n\tse %0,%3" \
1620 : "=r" (sh), "=&r" (sl) \
1621 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1622 "1" ((USItype)(al)), "r" ((USItype)(bl)))
1623#define smul_ppmm(ph, pl, m0, m1) \
1624 __asm__ ( \
1625 "s r2,r2\n" \
1626" mts r10,%2\n" \
1627" m r2,%3\n" \
1628" m r2,%3\n" \
1629" m r2,%3\n" \
1630" m r2,%3\n" \
1631" m r2,%3\n" \
1632" m r2,%3\n" \
1633" m r2,%3\n" \
1634" m r2,%3\n" \
1635" m r2,%3\n" \
1636" m r2,%3\n" \
1637" m r2,%3\n" \
1638" m r2,%3\n" \
1639" m r2,%3\n" \
1640" m r2,%3\n" \
1641" m r2,%3\n" \
1642" m r2,%3\n" \
1643" cas %0,r2,r0\n" \
1644" mfs r10,%1" \
1645 : "=r" (ph), "=r" (pl) \
1646 : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
1647 : "r2")
1648#define count_leading_zeros(count, x) \
1649 do { \
1650 if ((x) >= 0x10000) \
1651 __asm__ ("clz %0,%1" \
1652 : "=r" (count) : "r" ((USItype)(x) >> 16)); \
1653 else \
1654 { \
1655 __asm__ ("clz %0,%1" \
1656 : "=r" (count) : "r" ((USItype)(x))); \
1657 (count) += 16; \
1658 } \
1659 } while (0)
1660#endif /* RT/ROMP */
1661
1662#if defined (__riscv64) && W_TYPE_SIZE == 64
1663#define umul_ppmm(ph, pl, u, v) \
1664 do { \
1665 UDItype __u = (u), __v = (v); \
1666 (pl) = __u * __v; \
    __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v)); \
1668 } while (0)
1669#endif
1670
1671#if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1672#define umul_ppmm(w1, w0, u, v) \
1673 __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
1674 : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1675#endif
1676
1677#if defined (__sparc__) && W_TYPE_SIZE == 32
1678#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1679 __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
1680 : "=r" (sh), "=&r" (sl) \
1681 : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
1682 __CLOBBER_CC)
1683#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1684 __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
1685 : "=r" (sh), "=&r" (sl) \
1686 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1687 __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us; it only sets __sparcv8.  */
1690#if defined (__sparc_v9__) || defined (__sparcv9)
1691/* Perhaps we should use floating-point operations here? */
1692#if 0
/* Triggers a bug that makes mpz/tests/t-gcd.c fail.
   Perhaps we simply need to zero-extend the inputs explicitly?  */
1695#define umul_ppmm(w1, w0, u, v) \
1696 __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
1697 "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1698#else
/* Use the v8 umul until the above bug is fixed.  */
1700#define umul_ppmm(w1, w0, u, v) \
1701 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1702#endif
1703/* Use a plain v8 divide for v9. */
1704#define udiv_qrnnd(q, r, n1, n0, d) \
1705 do { \
1706 USItype __q; \
1707 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1708 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1709 (r) = (n0) - __q * (d); \
1710 (q) = __q; \
1711 } while (0)
1712#else
1713#if defined (__sparc_v8__) /* gcc normal */ \
1714 || defined (__sparcv8) /* gcc solaris */ \
1715 || HAVE_HOST_CPU_supersparc
/* Don't match an immediate range because (1) it is not often useful, and
   (2) the 'I' constraint treats the range as a 13-bit signed interval,
   while we want to match a 13-bit interval, sign-extended to 32 bits
   but INTERPRETED AS UNSIGNED.  */
1720#define umul_ppmm(w1, w0, u, v) \
1721 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1722
#if ! HAVE_HOST_CPU_supersparc
1725/* Don't use this on SuperSPARC because its udiv only handles 53 bit
1726 dividends and will trap to the kernel for the rest. */
1727#define udiv_qrnnd(q, r, n1, n0, d) \
1728 do { \
1729 USItype __q; \
1730 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1731 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1732 (r) = (n0) - __q * (d); \
1733 (q) = __q; \
1734 } while (0)
1735#endif /* HAVE_HOST_CPU_supersparc */
1736
1737#else /* ! __sparc_v8__ */
1738#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions, scan (ffs from the high bit) and divscc.  */
1741#define umul_ppmm(w1, w0, u, v) \
1742 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1743#define udiv_qrnnd(q, r, n1, n0, d) \
1744 __asm__ ("! Inlined udiv_qrnnd\n" \
1745" wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
1746" tst %%g0\n" \
1747" divscc %3,%4,%%g1\n" \
1748" divscc %%g1,%4,%%g1\n" \
1749" divscc %%g1,%4,%%g1\n" \
1750" divscc %%g1,%4,%%g1\n" \
1751" divscc %%g1,%4,%%g1\n" \
1752" divscc %%g1,%4,%%g1\n" \
1753" divscc %%g1,%4,%%g1\n" \
1754" divscc %%g1,%4,%%g1\n" \
1755" divscc %%g1,%4,%%g1\n" \
1756" divscc %%g1,%4,%%g1\n" \
1757" divscc %%g1,%4,%%g1\n" \
1758" divscc %%g1,%4,%%g1\n" \
1759" divscc %%g1,%4,%%g1\n" \
1760" divscc %%g1,%4,%%g1\n" \
1761" divscc %%g1,%4,%%g1\n" \
1762" divscc %%g1,%4,%%g1\n" \
1763" divscc %%g1,%4,%%g1\n" \
1764" divscc %%g1,%4,%%g1\n" \
1765" divscc %%g1,%4,%%g1\n" \
1766" divscc %%g1,%4,%%g1\n" \
1767" divscc %%g1,%4,%%g1\n" \
1768" divscc %%g1,%4,%%g1\n" \
1769" divscc %%g1,%4,%%g1\n" \
1770" divscc %%g1,%4,%%g1\n" \
1771" divscc %%g1,%4,%%g1\n" \
1772" divscc %%g1,%4,%%g1\n" \
1773" divscc %%g1,%4,%%g1\n" \
1774" divscc %%g1,%4,%%g1\n" \
1775" divscc %%g1,%4,%%g1\n" \
1776" divscc %%g1,%4,%%g1\n" \
1777" divscc %%g1,%4,%%g1\n" \
1778" divscc %%g1,%4,%0\n" \
1779" rd %%y,%1\n" \
1780" bl,a 1f\n" \
1781" add %1,%4,%1\n" \
1782"1: ! End of inline udiv_qrnnd" \
1783 : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
1784 : "%g1" __AND_CLOBBER_CC)
1785#define count_leading_zeros(count, x) \
1786 __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but the documentation
   warns that future implementations might change this.  Therefore, leave
   COUNT_LEADING_ZEROS_0 undefined.  */
1790#endif /* __sparclite__ */
1791#endif /* __sparc_v8__ */
1792#endif /* __sparc_v9__ */
1793/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
1794#ifndef umul_ppmm
1795#define umul_ppmm(w1, w0, u, v) \
1796 __asm__ ("! Inlined umul_ppmm\n" \
1797" wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
1798" sra %3,31,%%g2 ! Don't move this insn\n" \
1799" and %2,%%g2,%%g2 ! Don't move this insn\n" \
1800" andcc %%g0,0,%%g1 ! Don't move this insn\n" \
1801" mulscc %%g1,%3,%%g1\n" \
1802" mulscc %%g1,%3,%%g1\n" \
1803" mulscc %%g1,%3,%%g1\n" \
1804" mulscc %%g1,%3,%%g1\n" \
1805" mulscc %%g1,%3,%%g1\n" \
1806" mulscc %%g1,%3,%%g1\n" \
1807" mulscc %%g1,%3,%%g1\n" \
1808" mulscc %%g1,%3,%%g1\n" \
1809" mulscc %%g1,%3,%%g1\n" \
1810" mulscc %%g1,%3,%%g1\n" \
1811" mulscc %%g1,%3,%%g1\n" \
1812" mulscc %%g1,%3,%%g1\n" \
1813" mulscc %%g1,%3,%%g1\n" \
1814" mulscc %%g1,%3,%%g1\n" \
1815" mulscc %%g1,%3,%%g1\n" \
1816" mulscc %%g1,%3,%%g1\n" \
1817" mulscc %%g1,%3,%%g1\n" \
1818" mulscc %%g1,%3,%%g1\n" \
1819" mulscc %%g1,%3,%%g1\n" \
1820" mulscc %%g1,%3,%%g1\n" \
1821" mulscc %%g1,%3,%%g1\n" \
1822" mulscc %%g1,%3,%%g1\n" \
1823" mulscc %%g1,%3,%%g1\n" \
1824" mulscc %%g1,%3,%%g1\n" \
1825" mulscc %%g1,%3,%%g1\n" \
1826" mulscc %%g1,%3,%%g1\n" \
1827" mulscc %%g1,%3,%%g1\n" \
1828" mulscc %%g1,%3,%%g1\n" \
1829" mulscc %%g1,%3,%%g1\n" \
1830" mulscc %%g1,%3,%%g1\n" \
1831" mulscc %%g1,%3,%%g1\n" \
1832" mulscc %%g1,%3,%%g1\n" \
1833" mulscc %%g1,0,%%g1\n" \
1834" add %%g1,%%g2,%0\n" \
1835" rd %%y,%1" \
1836 : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
1837 : "%g1", "%g2" __AND_CLOBBER_CC)
1838#endif
1839#ifndef udiv_qrnnd
1840#ifndef LONGLONG_STANDALONE
1841#define udiv_qrnnd(q, r, n1, n0, d) \
1842 do { UWtype __r; \
1843 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
1844 (r) = __r; \
1845 } while (0)
1846extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1847#endif /* LONGLONG_STANDALONE */
1848#endif /* udiv_qrnnd */
1849#endif /* __sparc__ */
1850
1851#if defined (__sparc__) && W_TYPE_SIZE == 64
1852#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1853 __asm__ ( \
1854 "addcc %r4,%5,%1\n" \
1855 " addccc %r6,%7,%%g0\n" \
1856 " addc %r2,%3,%0" \
1857 : "=r" (sh), "=&r" (sl) \
1858 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
1859 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
1860 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
1861 __CLOBBER_CC)
1862#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1863 __asm__ ( \
1864 "subcc %r4,%5,%1\n" \
1865 " subccc %r6,%7,%%g0\n" \
1866 " subc %r2,%3,%0" \
1867 : "=r" (sh), "=&r" (sl) \
1868 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
1869 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
1870 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
1871 __CLOBBER_CC)
1872#if __VIS__ >= 0x300
1873#undef add_ssaaaa
1874#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1875 __asm__ ( \
1876 "addcc %r4, %5, %1\n" \
1877 " addxc %r2, %r3, %0" \
1878 : "=r" (sh), "=&r" (sl) \
1879 : "rJ" ((UDItype)(ah)), "rJ" ((UDItype)(bh)), \
1880 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
1881#define umul_ppmm(ph, pl, m0, m1) \
1882 do { \
1883 UDItype __m0 = (m0), __m1 = (m1); \
1884 (pl) = __m0 * __m1; \
1885 __asm__ ("umulxhi\t%2, %1, %0" \
1886 : "=r" (ph) \
1887 : "%r" (__m0), "r" (__m1)); \
1888 } while (0)
1889#define count_leading_zeros(count, x) \
1890 __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1891/* Needed by count_leading_zeros_32 in sparc64.h. */
1892#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1893#endif
1894#endif
1895
1896#if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
1897#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1898 __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
1899 : "=g" (sh), "=&g" (sl) \
1900 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1901 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1902#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1903 __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
1904 : "=g" (sh), "=&g" (sl) \
1905 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1906 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1907#define smul_ppmm(xh, xl, m0, m1) \
1908 do { \
1909 union {UDItype __ll; \
1910 struct {USItype __l, __h;} __i; \
1911 } __x; \
1912 USItype __m0 = (m0), __m1 = (m1); \
1913 __asm__ ("emul %1,%2,$0,%0" \
1914 : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
1915 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1916 } while (0)
1917#define sdiv_qrnnd(q, r, n1, n0, d) \
1918 do { \
1919 union {DItype __ll; \
1920 struct {SItype __l, __h;} __i; \
1921 } __x; \
1922 __x.__i.__h = n1; __x.__i.__l = n0; \
1923 __asm__ ("ediv %3,%2,%0,%1" \
1924 : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
1925 } while (0)
1926#if 0
1927/* FIXME: This instruction appears to be unimplemented on some systems (vax
1928 8800 maybe). */
1929#define count_trailing_zeros(count,x) \
1930 do { \
1931 __asm__ ("ffs 0, 31, %1, %0" \
1932 : "=g" (count) \
1933 : "g" ((USItype) (x))); \
1934 } while (0)
1935#endif
1936#endif /* vax */
1937
1938#if defined (__z8000__) && W_TYPE_SIZE == 16
1939#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1940 __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
1941 : "=r" (sh), "=&r" (sl) \
1942 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1943 "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1944#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1945 __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
1946 : "=r" (sh), "=&r" (sl) \
1947 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1948 "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1949#define umul_ppmm(xh, xl, m0, m1) \
1950 do { \
1951 union {long int __ll; \
1952 struct {unsigned int __h, __l;} __i; \
1953 } __x; \
1954 unsigned int __m0 = (m0), __m1 = (m1); \
1955 __asm__ ("mult %S0,%H3" \
1956 : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
1957 : "%1" (m0), "rQR" (m1)); \
1958 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1959 (xh) += ((((signed int) __m0 >> 15) & __m1) \
1960 + (((signed int) __m1 >> 15) & __m0)); \
1961 } while (0)
1962#endif /* __z8000__ */
1963
1964#endif /* __GNUC__ */
1965
1966#endif /* NO_ASM */
1967
1968
/* FIXME: "sidi" here is highly doubtful; it should sometimes be "diti".  */
1970#if !defined (umul_ppmm) && defined (__umulsidi3)
1971#define umul_ppmm(ph, pl, m0, m1) \
1972 do { \
1973 UDWtype __ll = __umulsidi3 (m0, m1); \
1974 ph = (UWtype) (__ll >> W_TYPE_SIZE); \
1975 pl = (UWtype) __ll; \
1976 } while (0)
1977#endif
1978
1979#if !defined (__umulsidi3)
1980#define __umulsidi3(u, v) \
1981 ({UWtype __hi, __lo; \
1982 umul_ppmm (__hi, __lo, u, v); \
1983 ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1984#endif
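/* For illustration only (hypothetical helper, not part of the GMP interface):
   __umulsidi3 and umul_ppmm are two views of the same full
   W_TYPE_SIZE x W_TYPE_SIZE -> 2*W_TYPE_SIZE multiplication, so a caller can
   split the UDWtype result back into two words like this.  Kept in "#if 0"
   so that it cannot affect users of this header.  */
#if 0
static void
__longlong_h_example_umulsidi3 (UWtype u, UWtype v, UWtype *hi, UWtype *lo)
{
  UDWtype __p = __umulsidi3 (u, v);     /* full double-word product */
  *hi = (UWtype) (__p >> W_TYPE_SIZE);  /* high word */
  *lo = (UWtype) __p;                   /* low word */
}
#endif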
1985
1986
1987#if defined (__cplusplus)
1988#define __longlong_h_C "C"
1989#else
1990#define __longlong_h_C
1991#endif
1992
1993/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist. The "_r"
1994 forms have "reversed" arguments, meaning the pointer is last, which
1995 sometimes allows better parameter passing, in particular on 64-bit
1996 hppa. */
1997
1998#define mpn_umul_ppmm __MPN(umul_ppmm)
1999extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
2000
2001#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
2002 && ! defined (LONGLONG_STANDALONE)
2003#define umul_ppmm(wh, wl, u, v) \
2004 do { \
2005 UWtype __umul_ppmm__p0; \
2006 (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
2007 (wl) = __umul_ppmm__p0; \
2008 } while (0)
2009#endif
2010
2011#define mpn_umul_ppmm_r __MPN(umul_ppmm_r)
2012extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
2013
2014#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
2015 && ! defined (LONGLONG_STANDALONE)
2016#define umul_ppmm(wh, wl, u, v) \
2017 do { \
2018 UWtype __umul_p0; \
2019 (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0); \
2020 (wl) = __umul_p0; \
2021 } while (0)
2022#endif
2023
2024#define mpn_udiv_qrnnd __MPN(udiv_qrnnd)
2025extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
2026
2027#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
2028 && ! defined (LONGLONG_STANDALONE)
2029#define udiv_qrnnd(q, r, n1, n0, d) \
2030 do { \
2031 UWtype __udiv_qrnnd_r; \
2032 (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r, \
                          (UWtype) (n1), (UWtype) (n0), (UWtype) (d)); \
2034 (r) = __udiv_qrnnd_r; \
2035 } while (0)
2036#endif
2037
2038#define mpn_udiv_qrnnd_r __MPN(udiv_qrnnd_r)
2039extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
2040
2041#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
2042 && ! defined (LONGLONG_STANDALONE)
2043#define udiv_qrnnd(q, r, n1, n0, d) \
2044 do { \
2045 UWtype __udiv_qrnnd_r; \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d), \
2047 &__udiv_qrnnd_r); \
2048 (r) = __udiv_qrnnd_r; \
2049 } while (0)
2050#endif
2051
2052
2053/* If this machine has no inline assembler, use C macros. */
2054
2055#if !defined (add_ssaaaa)
2056#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
2057 do { \
2058 UWtype __x; \
2059 __x = (al) + (bl); \
2060 (sh) = (ah) + (bh) + (__x < (al)); \
2061 (sl) = __x; \
2062 } while (0)
2063#endif
2064
2065#if !defined (sub_ddmmss)
2066#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
2067 do { \
2068 UWtype __x; \
2069 __x = (al) - (bl); \
2070 (sh) = (ah) - (bh) - ((al) < (bl)); \
2071 (sl) = __x; \
2072 } while (0)
2073#endif
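/* For illustration only (hypothetical helper, not part of the GMP interface):
   a typical use of the two macros above is double-word arithmetic on numbers
   stored as {high, low} UWtype pairs.  Kept in "#if 0" so that it cannot
   affect users of this header.  */
#if 0
static void
__longlong_h_example_double_word (UWtype ah, UWtype al, UWtype bh, UWtype bl,
                                  UWtype *sh, UWtype *sl,
                                  UWtype *dh, UWtype *dl)
{
  /* {*sh,*sl} = {ah,al} + {bh,bl} and {*dh,*dl} = {ah,al} - {bh,bl},
     both modulo 2^(2*W_TYPE_SIZE); the carry/borrow out is discarded.  */
  add_ssaaaa (*sh, *sl, ah, al, bh, bl);
  sub_ddmmss (*dh, *dl, ah, al, bh, bl);
}
#endif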
2074
2075/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
2076 smul_ppmm. */
2077#if !defined (umul_ppmm) && defined (smul_ppmm)
2078#define umul_ppmm(w1, w0, u, v) \
2079 do { \
2080 UWtype __w1; \
2081 UWtype __xm0 = (u), __xm1 = (v); \
2082 smul_ppmm (__w1, w0, __xm0, __xm1); \
2083 (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2084 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2085 } while (0)
2086#endif
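/* For reference, the correction above comes from the identity relating the
   unsigned and signed readings of the same W-bit words.  With W = W_TYPE_SIZE
   and mu, mv the top bits of u and v, the unsigned values are su + mu*2^W and
   sv + mv*2^W, where su, sv are the signed values, so modulo 2^(2W)

     u * v = su*sv + (mu*v + mv*u) * 2^W

   i.e. the unsigned high word is the signed high word plus mu*v + mv*u
   (mod 2^W).  The expression -(x >> (W-1)) & y is just "top bit of x times y"
   in branch-free form.  A 4-bit check: u = 0xF (-1 signed), v = 0x3 gives
   unsigned product 0x2D and signed product -3 = 0xFD; the high words 0x2 and
   0xF indeed differ by v = 0x3 (mod 0x10).  */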
2087
2088/* If we still don't have umul_ppmm, define it using plain C.
2089
   For reference, when this code is used for squaring (i.e. u and v are
   identical expressions), gcc recognises that __x1 and __x2 are the same and
   generates 3 multiplies, not 4.  The subsequent additions could be optimized
   a bit, but the only place GMP currently uses such a square is
   mpn_sqr_basecase, and chips obliged to use this generic C umul will have
   plenty of worse performance problems than a couple of extra instructions on
   the diagonal of sqr_basecase.  */
2097
2098#if !defined (umul_ppmm)
2099#define umul_ppmm(w1, w0, u, v) \
2100 do { \
2101 UWtype __x0, __x1, __x2, __x3; \
2102 UHWtype __ul, __vl, __uh, __vh; \
2103 UWtype __u = (u), __v = (v); \
2104 \
2105 __ul = __ll_lowpart (__u); \
2106 __uh = __ll_highpart (__u); \
2107 __vl = __ll_lowpart (__v); \
2108 __vh = __ll_highpart (__v); \
2109 \
2110 __x0 = (UWtype) __ul * __vl; \
2111 __x1 = (UWtype) __ul * __vh; \
2112 __x2 = (UWtype) __uh * __vl; \
2113 __x3 = (UWtype) __uh * __vh; \
2114 \
2115 __x1 += __ll_highpart (__x0);/* this can't give carry */ \
2116 __x1 += __x2; /* but this indeed can */ \
2117 if (__x1 < __x2) /* did we get it? */ \
2118 __x3 += __ll_B; /* yes, add it in the proper pos. */ \
2119 \
2120 (w1) = __x3 + __ll_highpart (__x1); \
2121 (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
2122 } while (0)
2123#endif
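/* For reference, a worked 32-bit example of the macro above, with
   B = __ll_B = 2^16 and u = v = 0xFFFFFFFF:

     __x0 = __x1 = __x2 = __x3 = 0xFFFF * 0xFFFF = 0xFFFE0001
     __x1 += __ll_highpart (__x0) = 0xFFFE   ->  __x1 = 0xFFFEFFFF
     __x1 += __x2                            ->  wraps to 0xFFFD0000 < __x2,
                                                 so __x3 += __ll_B = 0xFFFF0001
     w1 = __x3 + __ll_highpart (__x1) = 0xFFFF0001 + 0xFFFD = 0xFFFFFFFE
     w0 = (__x1 << 16) + __ll_lowpart (__x0) = 0x00000000 + 1 = 0x00000001

   which is the correct product 0xFFFFFFFE00000001.  The wraparound in __x1
   loses 2^32 in __x1's units (units of B), which is __ll_B in __x3's units
   (units of B^2), which is why the carry branch adds __ll_B to __x3.  */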
2124
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
2127#if !defined (smul_ppmm)
2128#define smul_ppmm(w1, w0, u, v) \
2129 do { \
2130 UWtype __w1; \
2131 UWtype __xm0 = (u), __xm1 = (v); \
2132 umul_ppmm (__w1, w0, __xm0, __xm1); \
2133 (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2134 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2135 } while (0)
2136#endif
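/* For reference, this is the identity used for the umul_ppmm-from-smul_ppmm
   case above, run in the other direction: the signed high word is the
   unsigned high word minus (mu*v + mv*u) (mod 2^W), with mu, mv the top bits
   of u and v.  */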
2137
2138/* Define this unconditionally, so it can be used for debugging. */
2139#define __udiv_qrnnd_c(q, r, n1, n0, d) \
2140 do { \
2141 UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
2142 \
2143 ASSERT ((d) != 0); \
2144 ASSERT ((n1) < (d)); \
2145 \
2146 __d1 = __ll_highpart (d); \
2147 __d0 = __ll_lowpart (d); \
2148 \
2149 __q1 = (n1) / __d1; \
2150 __r1 = (n1) - __q1 * __d1; \
2151 __m = __q1 * __d0; \
2152 __r1 = __r1 * __ll_B | __ll_highpart (n0); \
2153 if (__r1 < __m) \
2154 { \
2155 __q1--, __r1 += (d); \
2156 if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2157 if (__r1 < __m) \
2158 __q1--, __r1 += (d); \
2159 } \
2160 __r1 -= __m; \
2161 \
2162 __q0 = __r1 / __d1; \
2163 __r0 = __r1 - __q0 * __d1; \
2164 __m = __q0 * __d0; \
2165 __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
2166 if (__r0 < __m) \
2167 { \
2168 __q0--, __r0 += (d); \
2169 if (__r0 >= (d)) \
2170 if (__r0 < __m) \
2171 __q0--, __r0 += (d); \
2172 } \
2173 __r0 -= __m; \
2174 \
2175 (q) = __q1 * __ll_B | __q0; \
2176 (r) = __r0; \
2177 } while (0)
2178
/* If the processor has no udiv_qrnnd but does have sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
2181#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
2182 && ! defined (LONGLONG_STANDALONE)
2183#define udiv_qrnnd(q, r, nh, nl, d) \
2184 do { \
2185 UWtype __r; \
2186 (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
2187 (r) = __r; \
2188 } while (0)
2189__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2190#endif
2191
2192/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
2193#if !defined (udiv_qrnnd)
2194#define UDIV_NEEDS_NORMALIZATION 1
2195#define udiv_qrnnd __udiv_qrnnd_c
2196#endif
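/* For illustration only (hypothetical helper, not part of the GMP interface):
   when UDIV_NEEDS_NORMALIZATION is 1, as it is for the __udiv_qrnnd_c
   fallback above, a caller shifts the divisor until its most significant bit
   is set, shifts the two-word numerator by the same amount, and shifts the
   remainder back afterwards; the quotient needs no adjustment.  It assumes
   d != 0 and n1 < d, as udiv_qrnnd itself does, and is kept in "#if 0" so
   that it cannot affect users of this header.  */
#if 0
static void
__longlong_h_example_udiv (UWtype *qp, UWtype *rp,
                           UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r;
  int cnt = 0;

  if (UDIV_NEEDS_NORMALIZATION)
    {
      count_leading_zeros (cnt, d);
      if (cnt != 0)
        {
          n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
          n0 <<= cnt;
          d <<= cnt;
        }
    }
  udiv_qrnnd (q, r, n1, n0, d);
  *qp = q;
  *rp = r >> cnt;               /* undo the normalization on the remainder */
}
#endif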
2197
2198#if !defined (count_leading_zeros)
2199#define count_leading_zeros(count, x) \
2200 do { \
2201 UWtype __xr = (x); \
2202 UWtype __a; \
2203 \
2204 if (W_TYPE_SIZE == 32) \
2205 { \
2206 __a = __xr < ((UWtype) 1 << 2*__BITS4) \
2207 ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
2208 : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
2209 : 3*__BITS4 + 1); \
2210 } \
2211 else \
2212 { \
2213 for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
2214 if (((__xr >> __a) & 0xff) != 0) \
2215 break; \
2216 ++__a; \
2217 } \
2218 \
2219 (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
2220 } while (0)
2221/* This version gives a well-defined value for zero. */
2222#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2223#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2224#define COUNT_LEADING_ZEROS_SLOW
2225#endif
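/* For reference, a worked example of the generic count_leading_zeros above,
   with W_TYPE_SIZE == 64 and x = 0x12345 (highest set bit 16, so the answer
   should be 63 - 16 = 47): the byte loop stops at __a = 16, where
   (x >> 16) & 0xff = 0x01, then ++__a makes __a = 17, x >> 17 = 0, and with
   __clz_tab[0] = 1 the formula gives 64 + 1 - 17 - 1 = 47.  Each entry
   __clz_tab[i] is one more than the bit length of i, which is what makes the
   formula come out right for every position of the leading byte.  */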
2226
2227/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2228#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2229#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2230#endif
2231
2232#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2233extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
2234#endif
2235
2236#if !defined (count_trailing_zeros)
2237#if !defined (COUNT_LEADING_ZEROS_SLOW)
2238/* Define count_trailing_zeros using an asm count_leading_zeros. */
2239#define count_trailing_zeros(count, x) \
2240 do { \
2241 UWtype __ctz_x = (x); \
2242 UWtype __ctz_c; \
2243 ASSERT (__ctz_x != 0); \
2244 count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
2245 (count) = W_TYPE_SIZE - 1 - __ctz_c; \
2246 } while (0)
2247#else
/* Define count_trailing_zeros in plain C, assuming small counts are common.
   We use __clz_tab directly, since the C count_leading_zeros above will have
   pulled it in.  */
2251#define count_trailing_zeros(count, x) \
2252 do { \
2253 UWtype __ctz_x = (x); \
2254 int __ctz_c; \
2255 \
2256 if (LIKELY ((__ctz_x & 0xff) != 0)) \
2257 (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \
2258 else \
2259 { \
2260 for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \
2261 { \
2262 __ctz_x >>= 8; \
2263 if (LIKELY ((__ctz_x & 0xff) != 0)) \
2264 break; \
2265 } \
2266 \
2267 (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \
2268 } \
2269 } while (0)
2270#endif
2271#endif
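/* For reference, both variants above rely on x & -x isolating the lowest set
   bit: if that bit is bit k, then x & -x = 2^k, so

     count_leading_zeros gives W_TYPE_SIZE - 1 - k, hence the first form, and
     __clz_tab[2^k] = k + 2 for k <= 7, hence the "- 2" in the second form.

   The second form then walks up 8 bits at a time until a nonzero byte is
   found, adding 8 to the eventual count for each skipped byte.  */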
2272
2273#ifndef UDIV_NEEDS_NORMALIZATION
2274#define UDIV_NEEDS_NORMALIZATION 0
2275#endif
2276
/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   hence whether the latter should always be used.  */
2279#ifndef UDIV_PREINV_ALWAYS
2280#define UDIV_PREINV_ALWAYS 0
2281#endif