blob: bc3fc81cb0a2cf652b60b2ae3367ebf6f2ff3c80 [file] [log] [blame]
Austin Schuhbb1338c2024-06-15 19:31:16 -07001/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2
3Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2020 Free Software
4Foundation, Inc.
5
6This file is part of the GNU MP Library.
7
8The GNU MP Library is free software; you can redistribute it and/or modify
9it under the terms of either:
10
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
14
15or
16
17 * the GNU General Public License as published by the Free Software
18 Foundation; either version 2 of the License, or (at your option) any
19 later version.
20
21or both in parallel, as here.
22
23The GNU MP Library is distributed in the hope that it will be useful, but
24WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26for more details.
27
28You should have received copies of the GNU General Public License and the
29GNU Lesser General Public License along with the GNU MP Library. If not,
30see https://www.gnu.org/licenses/. */
31
32/* You have to define the following before including this file:
33
34 UWtype -- An unsigned type, default type for operations (typically a "word")
35 UHWtype -- An unsigned type, at least half the size of UWtype
36 UDWtype -- An unsigned type, at least twice as large a UWtype
37 W_TYPE_SIZE -- size in bits of UWtype
38
39 SItype, USItype -- Signed and unsigned 32 bit types
40 DItype, UDItype -- Signed and unsigned 64 bit types
41
42 On a 32 bit machine UWtype should typically be USItype;
43 on a 64 bit machine, UWtype should typically be UDItype.
44
45 Optionally, define:
46
47 LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
48 NO_ASM -- Disable inline asm
49
50
51 CAUTION! Using this version of longlong.h outside of GMP is not safe. You
52 need to include gmp.h and gmp-impl.h, or certain things might not work as
53 expected.
54*/
55
56#define __BITS4 (W_TYPE_SIZE / 4)
57#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
58#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
59#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
60
61/* This is used to make sure no undesirable sharing between different libraries
62 that use this file takes place. */
63#ifndef __MPN
64#define __MPN(x) __##x
65#endif
66
67/* Define auxiliary asm macros.
68
69 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
70 UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
71 word product in HIGH_PROD and LOW_PROD.
72
73 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
74 UDWtype product. This is just a variant of umul_ppmm.
75
76 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
77 denominator) divides a UDWtype, composed by the UWtype integers
78 HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
79 in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
80 than DENOMINATOR for correct operation. If, in addition, the most
81 significant bit of DENOMINATOR must be 1, then the pre-processor symbol
82 UDIV_NEEDS_NORMALIZATION is defined to 1.
83
84 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
85 denominator). Like udiv_qrnnd but the numbers are signed. The quotient
86 is rounded towards 0.
87
88 5) count_leading_zeros(count, x) counts the number of zero-bits from the
89 msb to the first non-zero bit in the UWtype X. This is the number of
90 steps X needs to be shifted left to set the msb. Undefined for X == 0,
91 unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
92
93 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
94 from the least significant end.
95
96 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
97 high_addend_2, low_addend_2) adds two UWtype integers, composed by
98 HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
99 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
100 (i.e. carry out) is not stored anywhere, and is lost.
101
102 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
103 high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
104 composed by HIGH_MINUEND_1 and LOW_MINUEND_1, and HIGH_SUBTRAHEND_2 and
105 LOW_SUBTRAHEND_2 respectively. The result is placed in HIGH_DIFFERENCE
106 and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,
107 and is lost.
108
109 If any of these macros are left undefined for a particular CPU,
110 C macros are used.
111
112
113 Notes:
114
115 For add_ssaaaa the two high and two low addends can both commute, but
116 unfortunately gcc only supports one "%" commutative in each asm block.
117 This has always been so but is only documented in recent versions
118 (eg. pre-release 3.3). Having two or more "%"s can cause an internal
119 compiler error in certain rare circumstances.
120
121 Apparently it was only the last "%" that was ever actually respected, so
122 the code has been updated to leave just that. Clearly there's a free
123 choice whether high or low should get it, if there's a reason to favour
124 one over the other. Also obviously when the constraints on the two
125 operands are identical there's no benefit to the reloader in any "%" at
126 all.
127
128 */
129
130/* The CPUs come in alphabetical order below.
131
132 Please add support for more CPUs here, or improve the current support
133 for the CPUs below! */
134
135
136/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
137 3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
138 Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
139 __builtin_ctzll.
140
141 These builtins are only used when we check what code comes out, on some
142 chips they're merely libgcc calls, where we will instead want an inline
143 in that case (either asm or generic C).
144
145 These builtins are better than an asm block of the same insn, since an
146 asm block doesn't give gcc any information about scheduling or resource
147 usage. We keep an asm block for use on prior versions of gcc though.
148
149 For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
150 it's not used (for count_leading_zeros) because it generally gives extra
151 code to ensure the result is 0 when the input is 0, which we don't need
152 or want. */
153
154#ifdef _LONG_LONG_LIMB
155#define count_leading_zeros_gcc_clz(count,x) \
156 do { \
157 ASSERT ((x) != 0); \
158 (count) = __builtin_clzll (x); \
159 } while (0)
160#else
161#define count_leading_zeros_gcc_clz(count,x) \
162 do { \
163 ASSERT ((x) != 0); \
164 (count) = __builtin_clzl (x); \
165 } while (0)
166#endif
167
168#ifdef _LONG_LONG_LIMB
169#define count_trailing_zeros_gcc_ctz(count,x) \
170 do { \
171 ASSERT ((x) != 0); \
172 (count) = __builtin_ctzll (x); \
173 } while (0)
174#else
175#define count_trailing_zeros_gcc_ctz(count,x) \
176 do { \
177 ASSERT ((x) != 0); \
178 (count) = __builtin_ctzl (x); \
179 } while (0)
180#endif
181
182
183/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
184 don't need to be under !NO_ASM */
185#if ! defined (NO_ASM)
186
187#if defined (__alpha) && W_TYPE_SIZE == 64
188/* Most alpha-based machines, except Cray systems. */
189#if defined (__GNUC__)
190#if __GMP_GNUC_PREREQ (3,3)
191#define umul_ppmm(ph, pl, m0, m1) \
192 do { \
193 UDItype __m0 = (m0), __m1 = (m1); \
194 (ph) = __builtin_alpha_umulh (__m0, __m1); \
195 (pl) = __m0 * __m1; \
196 } while (0)
197#else
198#define umul_ppmm(ph, pl, m0, m1) \
199 do { \
200 UDItype __m0 = (m0), __m1 = (m1); \
201 __asm__ ("umulh %r1,%2,%0" \
202 : "=r" (ph) \
203 : "%rJ" (__m0), "rI" (__m1)); \
204 (pl) = __m0 * __m1; \
205 } while (0)
206#endif
207#else /* ! __GNUC__ */
208#include <machine/builtins.h>
209#define umul_ppmm(ph, pl, m0, m1) \
210 do { \
211 UDItype __m0 = (m0), __m1 = (m1); \
212 (ph) = __UMULH (__m0, __m1); \
213 (pl) = __m0 * __m1; \
214 } while (0)
215#endif
216#ifndef LONGLONG_STANDALONE
217#define udiv_qrnnd(q, r, n1, n0, d) \
218 do { UWtype __di; \
219 __di = __MPN(invert_limb) (d); \
220 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
221 } while (0)
222#define UDIV_PREINV_ALWAYS 1
223#define UDIV_NEEDS_NORMALIZATION 1
224#endif /* LONGLONG_STANDALONE */
225
226/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
227 always goes into libgmp.so, even when not actually used. */
228#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
229
230#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
231#define count_leading_zeros(COUNT,X) \
232 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
233#define count_trailing_zeros(COUNT,X) \
234 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
235#endif /* clz/ctz using cix */
236
237#if ! defined (count_leading_zeros) \
238 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
239/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
240 "$31" is written explicitly in the asm, since an "r" constraint won't
241 select reg 31. There seems no need to worry about "r31" syntax for cray,
242 since gcc itself (pre-release 3.4) emits just $31 in various places. */
243#define ALPHA_CMPBGE_0(dst, src) \
244 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
245/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
246 them, locating the highest non-zero byte. A second __clz_tab lookup
247 counts the leading zero bits in that byte, giving the result. */
248#define count_leading_zeros(count, x) \
249 do { \
250 UWtype __clz__b, __clz__c, __clz__x = (x); \
251 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \
252 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
253 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \
254 __clz__x >>= __clz__b; \
255 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \
256 __clz__b = 65 - __clz__b; \
257 (count) = __clz__b - __clz__c; \
258 } while (0)
259#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
260#endif /* clz using cmpbge */
261
262#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
263#if HAVE_ATTRIBUTE_CONST
264long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
265#else
266long __MPN(count_leading_zeros) (UDItype);
267#endif
268#define count_leading_zeros(count, x) \
269 ((count) = __MPN(count_leading_zeros) (x))
270#endif /* clz using mpn */
271#endif /* __alpha */
272
273#if defined (__AVR) && W_TYPE_SIZE == 8
274#define umul_ppmm(ph, pl, m0, m1) \
275 do { \
276 unsigned short __p = (unsigned short) (m0) * (m1); \
277 (ph) = __p >> 8; \
278 (pl) = __p; \
279 } while (0)
280#endif /* AVR */
281
282#if defined (_CRAY) && W_TYPE_SIZE == 64
283#include <intrinsics.h>
284#define UDIV_PREINV_ALWAYS 1
285#define UDIV_NEEDS_NORMALIZATION 1
286long __MPN(count_leading_zeros) (UDItype);
287#define count_leading_zeros(count, x) \
288 ((count) = _leadz ((UWtype) (x)))
289#if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */
290#define umul_ppmm(ph, pl, m0, m1) \
291 do { \
292 UDItype __m0 = (m0), __m1 = (m1); \
293 (ph) = _int_mult_upper (__m0, __m1); \
294 (pl) = __m0 * __m1; \
295 } while (0)
296#ifndef LONGLONG_STANDALONE
297#define udiv_qrnnd(q, r, n1, n0, d) \
298 do { UWtype __di; \
299 __di = __MPN(invert_limb) (d); \
300 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
301 } while (0)
302#endif /* LONGLONG_STANDALONE */
303#endif /* _CRAYIEEE */
304#endif /* _CRAY */
305
306#if defined (__ia64) && W_TYPE_SIZE == 64
307/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
308 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic
309 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
310 register, which takes an extra cycle. */
311#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
312 do { \
313 UWtype __x; \
314 __x = (al) - (bl); \
315 if ((al) < (bl)) \
316 (sh) = (ah) - (bh) - 1; \
317 else \
318 (sh) = (ah) - (bh); \
319 (sl) = __x; \
320 } while (0)
321#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
322/* Do both product parts in assembly, since that gives better code with
323 all gcc versions. Some callers will just use the upper part, and in
324 that situation we waste an instruction, but not any cycles. */
325#define umul_ppmm(ph, pl, m0, m1) \
326 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
327 : "=&f" (ph), "=f" (pl) \
328 : "f" (m0), "f" (m1))
329#define count_leading_zeros(count, x) \
330 do { \
331 UWtype _x = (x), _y, _a, _c; \
332 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
333 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
334 _c = (_a - 1) << 3; \
335 _x >>= _c; \
336 if (_x >= 1 << 4) \
337 _x >>= 4, _c += 4; \
338 if (_x >= 1 << 2) \
339 _x >>= 2, _c += 2; \
340 _c += _x >> 1; \
341 (count) = W_TYPE_SIZE - 1 - _c; \
342 } while (0)
343/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
344 based, and we don't need a special case for x==0 here */
345#define count_trailing_zeros(count, x) \
346 do { \
347 UWtype __ctz_x = (x); \
348 __asm__ ("popcnt %0 = %1" \
349 : "=r" (count) \
350 : "r" ((__ctz_x-1) & ~__ctz_x)); \
351 } while (0)
352#endif
353#if defined (__INTEL_COMPILER)
354#include <ia64intrin.h>
355#define umul_ppmm(ph, pl, m0, m1) \
356 do { \
357 UWtype __m0 = (m0), __m1 = (m1); \
358 ph = _m64_xmahu (__m0, __m1, 0); \
359 pl = __m0 * __m1; \
360 } while (0)
361#endif
362#ifndef LONGLONG_STANDALONE
363#define udiv_qrnnd(q, r, n1, n0, d) \
364 do { UWtype __di; \
365 __di = __MPN(invert_limb) (d); \
366 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
367 } while (0)
368#define UDIV_PREINV_ALWAYS 1
369#define UDIV_NEEDS_NORMALIZATION 1
370#endif
371#endif
372
373
374#if defined (__GNUC__)
375
376/* We sometimes need to clobber "cc" with gcc2, but that would not be
377 understood by gcc1. Use cpp to avoid major code duplication. */
378#if __GNUC__ < 2
379#define __CLOBBER_CC
380#define __AND_CLOBBER_CC
381#else /* __GNUC__ >= 2 */
382#define __CLOBBER_CC : "cc"
383#define __AND_CLOBBER_CC , "cc"
384#endif /* __GNUC__ < 2 */
385
386#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
387#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
388 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
389 : "=r" (sh), "=&r" (sl) \
390 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
391#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
392 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
393 : "=r" (sh), "=&r" (sl) \
394 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
395#define umul_ppmm(xh, xl, m0, m1) \
396 do { \
397 USItype __m0 = (m0), __m1 = (m1); \
398 __asm__ ("multiplu %0,%1,%2" \
399 : "=r" (xl) \
400 : "r" (__m0), "r" (__m1)); \
401 __asm__ ("multmu %0,%1,%2" \
402 : "=r" (xh) \
403 : "r" (__m0), "r" (__m1)); \
404 } while (0)
405#define udiv_qrnnd(q, r, n1, n0, d) \
406 __asm__ ("dividu %0,%3,%4" \
407 : "=r" (q), "=q" (r) \
408 : "1" (n1), "r" (n0), "r" (d))
409#define count_leading_zeros(count, x) \
410 __asm__ ("clz %0,%1" \
411 : "=r" (count) \
412 : "r" (x))
413#define COUNT_LEADING_ZEROS_0 32
414#endif /* __a29k__ */
415
416#if defined (__arc__)
417#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
418 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
419 : "=r" (sh), \
420 "=&r" (sl) \
421 : "r" ((USItype) (ah)), \
422 "rICal" ((USItype) (bh)), \
423 "%r" ((USItype) (al)), \
424 "rICal" ((USItype) (bl)))
425#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
426 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
427 : "=r" (sh), \
428 "=&r" (sl) \
429 : "r" ((USItype) (ah)), \
430 "rICal" ((USItype) (bh)), \
431 "r" ((USItype) (al)), \
432 "rICal" ((USItype) (bl)))
433#endif
434
435#if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
436 && W_TYPE_SIZE == 32
437#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
438 do { \
439 if (__builtin_constant_p (bl) && -(USItype)(bl) < 0x100) \
440 __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3" \
441 : "=r" (sh), "=&r" (sl) \
442 : "r" (ah), "rI" (bh), \
443 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC); \
444 else \
445 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
446 : "=r" (sh), "=&r" (sl) \
447 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC); \
448 } while (0)
449/* FIXME: Extend the immediate range for the low word by using both ADDS and
450 SUBS, since they set carry in the same way. Note: We need separate
451 definitions for thumb and non-thumb to to th absense of RSC under thumb. */
452#if defined (__thumb__)
453#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
454 do { \
455 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \
456 && (ah) == (bh)) \
457 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \
458 : "=r" (sh), "=r" (sl) \
459 : "r" (al), "rI" (bl) __CLOBBER_CC); \
460 else if (__builtin_constant_p (al)) \
461 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
462 : "=r" (sh), "=&r" (sl) \
463 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
464 else if (__builtin_constant_p (bl)) \
465 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
466 : "=r" (sh), "=&r" (sl) \
467 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
468 else \
469 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
470 : "=r" (sh), "=&r" (sl) \
471 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
472 } while (0)
473#else
474#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
475 do { \
476 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \
477 && (ah) == (bh)) \
478 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \
479 : "=r" (sh), "=r" (sl) \
480 : "r" (al), "rI" (bl) __CLOBBER_CC); \
481 else if (__builtin_constant_p (al)) \
482 { \
483 if (__builtin_constant_p (ah)) \
484 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
485 : "=r" (sh), "=&r" (sl) \
486 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
487 else \
488 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
489 : "=r" (sh), "=&r" (sl) \
490 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
491 } \
492 else if (__builtin_constant_p (ah)) \
493 { \
494 if (__builtin_constant_p (bl)) \
495 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
496 : "=r" (sh), "=&r" (sl) \
497 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
498 else \
499 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
500 : "=r" (sh), "=&r" (sl) \
501 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
502 } \
503 else if (__builtin_constant_p (bl)) \
504 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
505 : "=r" (sh), "=&r" (sl) \
506 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
507 else /* only bh might be a constant */ \
508 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
509 : "=r" (sh), "=&r" (sl) \
510 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
511 } while (0)
512#endif
513#if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
514 || defined (__ARM_ARCH_3__)
515#define umul_ppmm(xh, xl, a, b) \
516 do { \
517 register USItype __t0, __t1, __t2; \
518 __asm__ ("%@ Inlined umul_ppmm\n" \
519 " mov %2, %5, lsr #16\n" \
520 " mov %0, %6, lsr #16\n" \
521 " bic %3, %5, %2, lsl #16\n" \
522 " bic %4, %6, %0, lsl #16\n" \
523 " mul %1, %3, %4\n" \
524 " mul %4, %2, %4\n" \
525 " mul %3, %0, %3\n" \
526 " mul %0, %2, %0\n" \
527 " adds %3, %4, %3\n" \
528 " addcs %0, %0, #65536\n" \
529 " adds %1, %1, %3, lsl #16\n" \
530 " adc %0, %0, %3, lsr #16" \
531 : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)), \
532 "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \
533 : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC); \
534 } while (0)
535#ifndef LONGLONG_STANDALONE
536#define udiv_qrnnd(q, r, n1, n0, d) \
537 do { UWtype __r; \
538 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
539 (r) = __r; \
540 } while (0)
541extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
542#endif /* LONGLONG_STANDALONE */
543#else /* ARMv4 or newer */
544#define umul_ppmm(xh, xl, a, b) \
545 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
546#define smul_ppmm(xh, xl, a, b) \
547 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
548#ifndef LONGLONG_STANDALONE
549#define udiv_qrnnd(q, r, n1, n0, d) \
550 do { UWtype __di; \
551 __di = __MPN(invert_limb) (d); \
552 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
553 } while (0)
554#define UDIV_PREINV_ALWAYS 1
555#define UDIV_NEEDS_NORMALIZATION 1
556#endif /* LONGLONG_STANDALONE */
557#endif /* defined(__ARM_ARCH_2__) ... */
558#define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x)
559#define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x)
560#endif /* __arm__ */
561
562#if defined (__aarch64__) && W_TYPE_SIZE == 64
563/* FIXME: Extend the immediate range for the low word by using both
564 ADDS and SUBS, since they set carry in the same way. */
565#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
566 do { \
567 if (__builtin_constant_p (bl) && -(UDItype)(bl) < 0x1000) \
568 __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
569 : "=r" (sh), "=&r" (sl) \
570 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \
571 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\
572 else \
573 __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
574 : "=r" (sh), "=&r" (sl) \
575 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \
576 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\
577 } while (0)
578#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
579 do { \
580 if (__builtin_constant_p (bl) && -(UDItype)(bl) < 0x1000) \
581 __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
582 : "=r,r" (sh), "=&r,&r" (sl) \
583 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \
584 "r,Z" ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\
585 else \
586 __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
587 : "=r,r" (sh), "=&r,&r" (sl) \
588 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \
589 "r,Z" ((UDItype)(al)), "rI,r" ((UDItype)(bl)) __CLOBBER_CC);\
590 } while(0);
591#if __GMP_GNUC_PREREQ (4,9)
592#define umul_ppmm(w1, w0, u, v) \
593 do { \
594 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
595 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
596 w1 = __ll >> 64; \
597 w0 = __ll; \
598 } while (0)
599#endif
600#if !defined (umul_ppmm)
601#define umul_ppmm(ph, pl, m0, m1) \
602 do { \
603 UDItype __m0 = (m0), __m1 = (m1); \
604 __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \
605 (pl) = __m0 * __m1; \
606 } while (0)
607#endif
608#define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x)
609#define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x)
610#endif /* __aarch64__ */
611
612#if defined (__clipper__) && W_TYPE_SIZE == 32
613#define umul_ppmm(w1, w0, u, v) \
614 ({union {UDItype __ll; \
615 struct {USItype __l, __h;} __i; \
616 } __x; \
617 __asm__ ("mulwux %2,%0" \
618 : "=r" (__x.__ll) \
619 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
620 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
621#define smul_ppmm(w1, w0, u, v) \
622 ({union {DItype __ll; \
623 struct {SItype __l, __h;} __i; \
624 } __x; \
625 __asm__ ("mulwx %2,%0" \
626 : "=r" (__x.__ll) \
627 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
628 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
629#define __umulsidi3(u, v) \
630 ({UDItype __w; \
631 __asm__ ("mulwux %2,%0" \
632 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
633 __w; })
634#endif /* __clipper__ */
635
636/* Fujitsu vector computers. */
637#if defined (__uxp__) && W_TYPE_SIZE == 32
638#define umul_ppmm(ph, pl, u, v) \
639 do { \
640 union {UDItype __ll; \
641 struct {USItype __h, __l;} __i; \
642 } __x; \
643 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
644 (ph) = __x.__i.__h; \
645 (pl) = __x.__i.__l; \
646 } while (0)
647#define smul_ppmm(ph, pl, u, v) \
648 do { \
649 union {UDItype __ll; \
650 struct {USItype __h, __l;} __i; \
651 } __x; \
652 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
653 (ph) = __x.__i.__h; \
654 (pl) = __x.__i.__l; \
655 } while (0)
656#endif
657
658#if defined (__gmicro__) && W_TYPE_SIZE == 32
659#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
660 __asm__ ("add.w %5,%1\n\taddx %3,%0" \
661 : "=g" (sh), "=&g" (sl) \
662 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
663 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
664#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
665 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
666 : "=g" (sh), "=&g" (sl) \
667 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
668 "1" ((USItype)(al)), "g" ((USItype)(bl)))
669#define umul_ppmm(ph, pl, m0, m1) \
670 __asm__ ("mulx %3,%0,%1" \
671 : "=g" (ph), "=r" (pl) \
672 : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
673#define udiv_qrnnd(q, r, nh, nl, d) \
674 __asm__ ("divx %4,%0,%1" \
675 : "=g" (q), "=r" (r) \
676 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
677#define count_leading_zeros(count, x) \
678 __asm__ ("bsch/1 %1,%0" \
679 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
680#endif
681
682#if defined (__hppa) && W_TYPE_SIZE == 32
683#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
684 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
685 : "=r" (sh), "=&r" (sl) \
686 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
687#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
688 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
689 : "=r" (sh), "=&r" (sl) \
690 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
691#if defined (_PA_RISC1_1)
692#define umul_ppmm(wh, wl, u, v) \
693 do { \
694 union {UDItype __ll; \
695 struct {USItype __h, __l;} __i; \
696 } __x; \
697 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
698 (wh) = __x.__i.__h; \
699 (wl) = __x.__i.__l; \
700 } while (0)
701#endif
702#define count_leading_zeros(count, x) \
703 do { \
704 USItype __tmp; \
705 __asm__ ( \
706 "ldi 1,%0\n" \
707" extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
708" extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
709" ldo 16(%0),%0 ; Yes. Perform add.\n" \
710" extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
711" extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
712" ldo 8(%0),%0 ; Yes. Perform add.\n" \
713" extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
714" extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
715" ldo 4(%0),%0 ; Yes. Perform add.\n" \
716" extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
717" extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
718" ldo 2(%0),%0 ; Yes. Perform add.\n" \
719" extru %1,30,1,%1 ; Extract bit 1.\n" \
720" sub %0,%1,%0 ; Subtract it.\n" \
721 : "=r" (count), "=r" (__tmp) : "1" (x)); \
722 } while (0)
723#endif /* hppa */
724
725/* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC
726 (3.2) puts longlong into two adjacent 32-bit registers. Presumably this
727 is just a case of no direct support for 2.0n but treating it like 1.0. */
728#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
729#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
730 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
731 : "=r" (sh), "=&r" (sl) \
732 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
733#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
734 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
735 : "=r" (sh), "=&r" (sl) \
736 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
737#endif /* hppa */
738
739#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
740#if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
741#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
742 do { \
743/* if (__builtin_constant_p (bl)) \
744 __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \
745 : "=r" (sh), "=&r" (sl) \
746 : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
747 else \
748*/ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \
749 : "=r" (sh), "=&r" (sl) \
750 : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
751 } while (0)
752#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
753 do { \
754/* if (__builtin_constant_p (bl)) \
755 __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \
756 : "=r" (sh), "=&r" (sl) \
757 : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \
758 else \
759*/ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \
760 : "=r" (sh), "=&r" (sl) \
761 : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \
762 } while (0)
763#if __GMP_GNUC_PREREQ (4,5)
764#define umul_ppmm(xh, xl, m0, m1) \
765 do { \
766 union {UDItype __ll; \
767 struct {USItype __h, __l;} __i; \
768 } __x; \
769 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \
770 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
771 } while (0)
772#else
773#if 0
774/* FIXME: this fails if gcc knows about the 64-bit registers. Use only
775 with a new enough processor pretending we have 32-bit registers. */
776#define umul_ppmm(xh, xl, m0, m1) \
777 do { \
778 union {UDItype __ll; \
779 struct {USItype __h, __l;} __i; \
780 } __x; \
781 __asm__ ("mlr\t%0,%2" \
782 : "=r" (__x.__ll) \
783 : "%0" (m0), "r" (m1)); \
784 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
785 } while (0)
786#else
787#define umul_ppmm(xh, xl, m0, m1) \
788 do { \
789 /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
790 DImode for the product, since that would be allocated to a single 64-bit
791 register, whereas mlr uses the low 32-bits of an even-odd register pair.
792 */ \
793 register USItype __r0 __asm__ ("0"); \
794 register USItype __r1 __asm__ ("1") = (m0); \
795 __asm__ ("mlr\t%0,%3" \
796 : "=r" (__r0), "=r" (__r1) \
797 : "r" (__r1), "r" (m1)); \
798 (xh) = __r0; (xl) = __r1; \
799 } while (0)
800#endif /* if 0 */
801#endif
802#if 0
803/* FIXME: this fails if gcc knows about the 64-bit registers. Use only
804 with a new enough processor pretending we have 32-bit registers. */
805#define udiv_qrnnd(q, r, n1, n0, d) \
806 do { \
807 union {UDItype __ll; \
808 struct {USItype __h, __l;} __i; \
809 } __x; \
810 __x.__i.__h = n1; __x.__i.__l = n0; \
811 __asm__ ("dlr\t%0,%2" \
812 : "=r" (__x.__ll) \
813 : "0" (__x.__ll), "r" (d)); \
814 (q) = __x.__i.__l; (r) = __x.__i.__h; \
815 } while (0)
816#else
817#define udiv_qrnnd(q, r, n1, n0, d) \
818 do { \
819 register USItype __r0 __asm__ ("0") = (n1); \
820 register USItype __r1 __asm__ ("1") = (n0); \
821 __asm__ ("dlr\t%0,%4" \
822 : "=r" (__r0), "=r" (__r1) \
823 : "r" (__r0), "r" (__r1), "r" (d)); \
824 (q) = __r1; (r) = __r0; \
825 } while (0)
826#endif /* if 0 */
827#else /* if __zarch__ */
828/* FIXME: this fails if gcc knows about the 64-bit registers. */
829#define smul_ppmm(xh, xl, m0, m1) \
830 do { \
831 union {DItype __ll; \
832 struct {USItype __h, __l;} __i; \
833 } __x; \
834 __asm__ ("mr\t%0,%2" \
835 : "=r" (__x.__ll) \
836 : "%0" (m0), "r" (m1)); \
837 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
838 } while (0)
839/* FIXME: this fails if gcc knows about the 64-bit registers. */
840#define sdiv_qrnnd(q, r, n1, n0, d) \
841 do { \
842 union {DItype __ll; \
843 struct {USItype __h, __l;} __i; \
844 } __x; \
845 __x.__i.__h = n1; __x.__i.__l = n0; \
846 __asm__ ("dr\t%0,%2" \
847 : "=r" (__x.__ll) \
848 : "0" (__x.__ll), "r" (d)); \
849 (q) = __x.__i.__l; (r) = __x.__i.__h; \
850 } while (0)
851#endif /* if __zarch__ */
852#endif
853
854#if defined (__s390x__) && W_TYPE_SIZE == 64
855/* We need to cast operands with register constraints, otherwise their types
856 will be assumed to be SImode by gcc. For these machines, such operations
857 will insert a value into the low 32 bits, and leave the high 32 bits with
858 garbage. */
859#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
860 do { \
861 __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \
862 : "=r" (sh), "=&r" (sl) \
863 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
864 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
865 } while (0)
866#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
867 do { \
868 __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \
869 : "=r" (sh), "=&r" (sl) \
870 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
871 "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
872 } while (0)
873#define umul_ppmm(xh, xl, m0, m1) \
874 do { \
875 union {unsigned int __attribute__ ((mode(TI))) __ll; \
876 struct {UDItype __h, __l;} __i; \
877 } __x; \
878 __asm__ ("mlgr\t%0,%2" \
879 : "=r" (__x.__ll) \
880 : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \
881 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
882 } while (0)
883#define udiv_qrnnd(q, r, n1, n0, d) \
884 do { \
885 union {unsigned int __attribute__ ((mode(TI))) __ll; \
886 struct {UDItype __h, __l;} __i; \
887 } __x; \
888 __x.__i.__h = n1; __x.__i.__l = n0; \
889 __asm__ ("dlgr\t%0,%2" \
890 : "=r" (__x.__ll) \
891 : "0" (__x.__ll), "r" ((UDItype)(d))); \
892 (q) = __x.__i.__l; (r) = __x.__i.__h; \
893 } while (0)
894#if 0 /* FIXME: Enable for z10 (?) */
895#define count_leading_zeros(cnt, x) \
896 do { \
897 union {unsigned int __attribute__ ((mode(TI))) __ll; \
898 struct {UDItype __h, __l;} __i; \
899 } __clr_cnt; \
900 __asm__ ("flogr\t%0,%1" \
901 : "=r" (__clr_cnt.__ll) \
902 : "r" (x) __CLOBBER_CC); \
903 (cnt) = __clr_cnt.__i.__h; \
904 } while (0)
905#endif
906#endif
907
908/* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
909 so we don't need __CLOBBER_CC. */
910#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
911#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
912 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
913 : "=r" (sh), "=&r" (sl) \
914 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
915 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
916#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
917 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
918 : "=r" (sh), "=&r" (sl) \
919 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
920 "1" ((USItype)(al)), "g" ((USItype)(bl)))
921#define umul_ppmm(w1, w0, u, v) \
922 __asm__ ("mull %3" \
923 : "=a" (w0), "=d" (w1) \
924 : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
925#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
926 __asm__ ("divl %4" /* stringification in K&R C */ \
927 : "=a" (q), "=d" (r) \
928 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
929
930#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
931/* Pentium bsrl takes between 10 and 72 cycles depending where the most
932 significant 1 bit is, hence the use of the following alternatives. bsfl
933 is slow too, between 18 and 42 depending where the least significant 1
934 bit is, so let the generic count_trailing_zeros below make use of the
935 count_leading_zeros here too. */
936
937#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
938/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
939 cache miss reading from __clz_tab. For P55 it's favoured over the float
940 below so as to avoid mixing MMX and x87, since the penalty for switching
941 between the two is about 100 cycles.
942
943 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
944 16, -1 for 8, or 0 otherwise. This could be written equivalently as
945 follows, but as of gcc 2.95.2 it results in conditional jumps.
946
947 __shift = -(__n < 0x1000000);
948 __shift -= (__n < 0x10000);
949 __shift -= (__n < 0x100);
950
951 The middle two sbbl and cmpl's pair, and with luck something gcc
952 generates might pair with the first cmpl and the last sbbl. The "32+1"
953 constant could be folded into __clz_tab[], but it doesn't seem worth
954 making a different table just for that. */
955
956#define count_leading_zeros(c,n) \
957 do { \
958 USItype __n = (n); \
959 USItype __shift; \
960 __asm__ ("cmpl $0x1000000, %1\n" \
961 "sbbl %0, %0\n" \
962 "cmpl $0x10000, %1\n" \
963 "sbbl $0, %0\n" \
964 "cmpl $0x100, %1\n" \
965 "sbbl $0, %0\n" \
966 : "=&r" (__shift) : "r" (__n)); \
967 __shift = __shift*8 + 24 + 1; \
968 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
969 } while (0)
970#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
971#define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
972
973#else /* ! pentiummmx || LONGLONG_STANDALONE */
974/* The following should be a fixed 14 cycles or so. Some scheduling
975 opportunities should be available between the float load/store too. This
976 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
977 apparently suggested by the Intel optimizing manual (don't know exactly
978 where). gcc 2.95 or up will be best for this, so the "double" is
979 correctly aligned on the stack. */
980#define count_leading_zeros(c,n) \
981 do { \
982 union { \
983 double d; \
984 unsigned a[2]; \
985 } __u; \
986 __u.d = (UWtype) (n); \
987 (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
988 } while (0)
989#define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
990#endif /* pentiummx */
991
992#else /* ! pentium */
993
994#if __GMP_GNUC_PREREQ (3,4) /* using bsrl */
995#define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x)
996#endif /* gcc clz */
997
998/* On P6, gcc prior to 3.0 generates a partial register stall for
999 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
1000 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the
1001 cost of one extra instruction. Do this for "i386" too, since that means
1002 generic x86. */
1003#if ! defined (count_leading_zeros) && __GNUC__ < 3 \
1004 && (HAVE_HOST_CPU_i386 \
1005 || HAVE_HOST_CPU_i686 \
1006 || HAVE_HOST_CPU_pentiumpro \
1007 || HAVE_HOST_CPU_pentium2 \
1008 || HAVE_HOST_CPU_pentium3)
1009#define count_leading_zeros(count, x) \
1010 do { \
1011 USItype __cbtmp; \
1012 ASSERT ((x) != 0); \
1013 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
1014 (count) = 31 - __cbtmp; \
1015 } while (0)
1016#endif /* gcc<3 asm bsrl */
1017
1018#ifndef count_leading_zeros
1019#define count_leading_zeros(count, x) \
1020 do { \
1021 USItype __cbtmp; \
1022 ASSERT ((x) != 0); \
1023 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
1024 (count) = __cbtmp ^ 31; \
1025 } while (0)
1026#endif /* asm bsrl */
1027
1028#if __GMP_GNUC_PREREQ (3,4) /* using bsfl */
1029#define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x)
1030#endif /* gcc ctz */
1031
1032#ifndef count_trailing_zeros
1033#define count_trailing_zeros(count, x) \
1034 do { \
1035 ASSERT ((x) != 0); \
1036 __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \
1037 } while (0)
1038#endif /* asm bsfl */
1039
1040#endif /* ! pentium */
1041
1042#endif /* 80x86 */
1043
1044#if defined (__amd64__) && W_TYPE_SIZE == 64
1045#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1046 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
1047 : "=r" (sh), "=&r" (sl) \
1048 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1049 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1050#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1051 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
1052 : "=r" (sh), "=&r" (sl) \
1053 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1054 "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1055#if X86_ASM_MULX \
1056 && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \
1057 || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen)
1058#define umul_ppmm(w1, w0, u, v) \
1059 __asm__ ("mulx\t%3, %0, %1" \
1060 : "=r" (w0), "=r" (w1) \
1061 : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
1062#else
1063#define umul_ppmm(w1, w0, u, v) \
1064 __asm__ ("mulq\t%3" \
1065 : "=a" (w0), "=d" (w1) \
1066 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1067#endif
1068#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1069 __asm__ ("divq %4" /* stringification in K&R C */ \
1070 : "=a" (q), "=d" (r) \
1071 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1072
1073#if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
1074 || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2 \
1075 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen \
1076 || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
1077#define count_leading_zeros(count, x) \
1078 do { \
1079 /* This is lzcnt, spelled for older assemblers. Destination and */ \
1080 /* source must be a 64-bit registers, hence cast and %q. */ \
1081 __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1082 } while (0)
1083#define COUNT_LEADING_ZEROS_0 64
1084#else
1085#define count_leading_zeros(count, x) \
1086 do { \
1087 UDItype __cbtmp; \
1088 ASSERT ((x) != 0); \
1089 __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
1090 (count) = __cbtmp ^ 63; \
1091 } while (0)
1092#endif
1093
1094#if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
1095 || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
1096#define count_trailing_zeros(count, x) \
1097 do { \
1098 /* This is tzcnt, spelled for older assemblers. Destination and */ \
1099 /* source must be a 64-bit registers, hence cast and %q. */ \
1100 __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1101 } while (0)
1102#define COUNT_TRAILING_ZEROS_0 64
1103#else
1104#define count_trailing_zeros(count, x) \
1105 do { \
1106 ASSERT ((x) != 0); \
1107 __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1108 } while (0)
1109#endif
1110#endif /* __amd64__ */
1111
1112#if defined (__i860__) && W_TYPE_SIZE == 32
1113#define rshift_rhlc(r,h,l,c) \
1114 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
1115 "=r" (r) : "r" (h), "r" (l), "rn" (c))
1116#endif /* i860 */
1117
1118#if defined (__i960__) && W_TYPE_SIZE == 32
1119#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1120 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
1121 : "=r" (sh), "=&r" (sl) \
1122 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1123#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1124 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
1125 : "=r" (sh), "=&r" (sl) \
1126 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1127#define umul_ppmm(w1, w0, u, v) \
1128 ({union {UDItype __ll; \
1129 struct {USItype __l, __h;} __i; \
1130 } __x; \
1131 __asm__ ("emul %2,%1,%0" \
1132 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
1133 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1134#define __umulsidi3(u, v) \
1135 ({UDItype __w; \
1136 __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
1137 __w; })
1138#define udiv_qrnnd(q, r, nh, nl, d) \
1139 do { \
1140 union {UDItype __ll; \
1141 struct {USItype __l, __h;} __i; \
1142 } __nn; \
1143 __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
1144 __asm__ ("ediv %d,%n,%0" \
1145 : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
1146 (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
1147 } while (0)
1148#define count_leading_zeros(count, x) \
1149 do { \
1150 USItype __cbtmp; \
1151 __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
1152 (count) = __cbtmp ^ 31; \
1153 } while (0)
1154#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1155#if defined (__i960mx) /* what is the proper symbol to test??? */
1156#define rshift_rhlc(r,h,l,c) \
1157 do { \
1158 union {UDItype __ll; \
1159 struct {USItype __l, __h;} __i; \
1160 } __nn; \
1161 __nn.__i.__h = (h); __nn.__i.__l = (l); \
1162 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
1163 }
1164#endif /* i960mx */
1165#endif /* i960 */
1166
1167#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1168 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1169 || defined (__mc5307__)) && W_TYPE_SIZE == 32
1170#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1171 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
1172 : "=d" (sh), "=&d" (sl) \
1173 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1174 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1175#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1176 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
1177 : "=d" (sh), "=&d" (sl) \
1178 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1179 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1180/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */
1181#if defined (__mc68020__) || defined(mc68020) \
1182 || defined (__mc68030__) || defined (mc68030) \
1183 || defined (__mc68040__) || defined (mc68040) \
1184 || defined (__mcpu32__) || defined (mcpu32) \
1185 || defined (__NeXT__)
1186#define umul_ppmm(w1, w0, u, v) \
1187 __asm__ ("mulu%.l %3,%1:%0" \
1188 : "=d" (w0), "=d" (w1) \
1189 : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1190#define udiv_qrnnd(q, r, n1, n0, d) \
1191 __asm__ ("divu%.l %4,%1:%0" \
1192 : "=d" (q), "=d" (r) \
1193 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1194#define sdiv_qrnnd(q, r, n1, n0, d) \
1195 __asm__ ("divs%.l %4,%1:%0" \
1196 : "=d" (q), "=d" (r) \
1197 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1198#else /* for other 68k family members use 16x16->32 multiplication */
1199#define umul_ppmm(xh, xl, a, b) \
1200 do { USItype __umul_tmp1, __umul_tmp2; \
1201 __asm__ ("| Inlined umul_ppmm\n" \
1202" move%.l %5,%3\n" \
1203" move%.l %2,%0\n" \
1204" move%.w %3,%1\n" \
1205" swap %3\n" \
1206" swap %0\n" \
1207" mulu%.w %2,%1\n" \
1208" mulu%.w %3,%0\n" \
1209" mulu%.w %2,%3\n" \
1210" swap %2\n" \
1211" mulu%.w %5,%2\n" \
1212" add%.l %3,%2\n" \
1213" jcc 1f\n" \
1214" add%.l %#0x10000,%0\n" \
1215"1: move%.l %2,%3\n" \
1216" clr%.w %2\n" \
1217" swap %2\n" \
1218" swap %3\n" \
1219" clr%.w %3\n" \
1220" add%.l %3,%1\n" \
1221" addx%.l %2,%0\n" \
1222" | End inlined umul_ppmm" \
1223 : "=&d" (xh), "=&d" (xl), \
1224 "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
1225 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
1226 } while (0)
1227#endif /* not mc68020 */
1228/* The '020, '030, '040 and '060 have bitfield insns.
1229 GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
1230 exclude bfffo on that chip (bitfield insns not available). */
1231#if (defined (__mc68020__) || defined (mc68020) \
1232 || defined (__mc68030__) || defined (mc68030) \
1233 || defined (__mc68040__) || defined (mc68040) \
1234 || defined (__mc68060__) || defined (mc68060) \
1235 || defined (__NeXT__)) \
1236 && ! defined (__mcpu32__)
1237#define count_leading_zeros(count, x) \
1238 __asm__ ("bfffo %1{%b2:%b2},%0" \
1239 : "=d" (count) \
1240 : "od" ((USItype) (x)), "n" (0))
1241#define COUNT_LEADING_ZEROS_0 32
1242#endif
1243#endif /* mc68000 */
1244
1245#if defined (__m88000__) && W_TYPE_SIZE == 32
1246#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1247 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
1248 : "=r" (sh), "=&r" (sl) \
1249 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1250#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1251 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
1252 : "=r" (sh), "=&r" (sl) \
1253 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1254#define count_leading_zeros(count, x) \
1255 do { \
1256 USItype __cbtmp; \
1257 __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
1258 (count) = __cbtmp ^ 31; \
1259 } while (0)
1260#define COUNT_LEADING_ZEROS_0 63 /* sic */
1261#if defined (__m88110__)
1262#define umul_ppmm(wh, wl, u, v) \
1263 do { \
1264 union {UDItype __ll; \
1265 struct {USItype __h, __l;} __i; \
1266 } __x; \
1267 __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
1268 (wh) = __x.__i.__h; \
1269 (wl) = __x.__i.__l; \
1270 } while (0)
1271#define udiv_qrnnd(q, r, n1, n0, d) \
1272 ({union {UDItype __ll; \
1273 struct {USItype __h, __l;} __i; \
1274 } __x, __q; \
1275 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1276 __asm__ ("divu.d %0,%1,%2" \
1277 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
1278 (r) = (n0) - __q.__l * (d); (q) = __q.__l; })
1279#endif /* __m88110__ */
1280#endif /* __m88000__ */
1281
1282#if defined (__mips) && W_TYPE_SIZE == 32
1283#if __GMP_GNUC_PREREQ (4,4)
1284#define umul_ppmm(w1, w0, u, v) \
1285 do { \
1286 UDItype __ll = (UDItype)(u) * (v); \
1287 w1 = __ll >> 32; \
1288 w0 = __ll; \
1289 } while (0)
1290#endif
1291#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1292#define umul_ppmm(w1, w0, u, v) \
1293 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1294#endif
1295#if !defined (umul_ppmm)
1296#define umul_ppmm(w1, w0, u, v) \
1297 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
1298 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1299#endif
1300#endif /* __mips */
1301
1302#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1303#if defined (_MIPS_ARCH_MIPS64R6)
1304#define umul_ppmm(w1, w0, u, v) \
1305 do { \
1306 UDItype __m0 = (u), __m1 = (v); \
1307 (w0) = __m0 * __m1; \
1308 __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1)); \
1309 } while (0)
1310#endif
1311#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (4,4)
1312#define umul_ppmm(w1, w0, u, v) \
1313 do { \
1314 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1315 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1316 w1 = __ll >> 64; \
1317 w0 = __ll; \
1318 } while (0)
1319#endif
1320#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1321#define umul_ppmm(w1, w0, u, v) \
1322 __asm__ ("dmultu %2,%3" \
1323 : "=l" (w0), "=h" (w1) \
1324 : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1325#endif
1326#if !defined (umul_ppmm)
1327#define umul_ppmm(w1, w0, u, v) \
1328 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
1329 : "=d" (w0), "=d" (w1) \
1330 : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1331#endif
1332#endif /* __mips */
1333
1334#if defined (__mmix__) && W_TYPE_SIZE == 64
1335#define umul_ppmm(w1, w0, u, v) \
1336 __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1337#endif
1338
1339#if defined (__ns32000__) && W_TYPE_SIZE == 32
1340#define umul_ppmm(w1, w0, u, v) \
1341 ({union {UDItype __ll; \
1342 struct {USItype __l, __h;} __i; \
1343 } __x; \
1344 __asm__ ("meid %2,%0" \
1345 : "=g" (__x.__ll) \
1346 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1347 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1348#define __umulsidi3(u, v) \
1349 ({UDItype __w; \
1350 __asm__ ("meid %2,%0" \
1351 : "=g" (__w) \
1352 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1353 __w; })
1354#define udiv_qrnnd(q, r, n1, n0, d) \
1355 ({union {UDItype __ll; \
1356 struct {USItype __l, __h;} __i; \
1357 } __x; \
1358 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1359 __asm__ ("deid %2,%0" \
1360 : "=g" (__x.__ll) \
1361 : "0" (__x.__ll), "g" ((USItype)(d))); \
1362 (r) = __x.__i.__l; (q) = __x.__i.__h; })
1363#define count_trailing_zeros(count,x) \
1364 do { \
1365 __asm__ ("ffsd %2,%0" \
1366 : "=r" (count) \
1367 : "0" ((USItype) 0), "r" ((USItype) (x))); \
1368 } while (0)
1369#endif /* __ns32000__ */
1370
1371/* In the past we had a block of various #defines tested
1372 _ARCH_PPC - AIX
1373 _ARCH_PWR - AIX
1374 __powerpc__ - gcc
1375 __POWERPC__ - BEOS
1376 __ppc__ - Darwin
1377 PPC - old gcc, GNU/Linux, SysV
1378 The plain PPC test was not good for vxWorks, since PPC is defined on all
1379 CPUs there (eg. m68k too), as a constant one is expected to compare
1380 CPU_FAMILY against.
1381
1382 At any rate, this was pretty unattractive and a bit fragile. The use of
1383 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1384 getting the desired effect.
1385
1386 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1387 the system vendor compilers. (Is that vendor compilers with inline asm,
1388 or what?) */
1389
1390#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
1391 && W_TYPE_SIZE == 32
1392#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1393 do { \
1394 if (__builtin_constant_p (bh) && (bh) == 0) \
1395 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
1396 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
1397 __CLOBBER_CC); \
1398 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1399 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
1400 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
1401 __CLOBBER_CC); \
1402 else \
1403 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
1404 : "=r" (sh), "=&r" (sl) \
1405 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl) \
1406 __CLOBBER_CC); \
1407 } while (0)
1408#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1409 do { \
1410 if (__builtin_constant_p (ah) && (ah) == 0) \
1411 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
1412 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \
1413 __CLOBBER_CC); \
1414 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
1415 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
1416 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \
1417 __CLOBBER_CC); \
1418 else if (__builtin_constant_p (bh) && (bh) == 0) \
1419 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
1420 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \
1421 __CLOBBER_CC); \
1422 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1423 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
1424 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \
1425 __CLOBBER_CC); \
1426 else \
1427 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
1428 : "=r" (sh), "=&r" (sl) \
1429 : "r" (ah), "r" (bh), "rI" (al), "r" (bl) \
1430 __CLOBBER_CC); \
1431 } while (0)
1432#define count_leading_zeros(count, x) \
1433 __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1434#define COUNT_LEADING_ZEROS_0 32
1435#if HAVE_HOST_CPU_FAMILY_powerpc
1436#if __GMP_GNUC_PREREQ (4,4)
1437#define umul_ppmm(w1, w0, u, v) \
1438 do { \
1439 UDItype __ll = (UDItype)(u) * (v); \
1440 w1 = __ll >> 32; \
1441 w0 = __ll; \
1442 } while (0)
1443#endif
1444#if !defined (umul_ppmm)
1445#define umul_ppmm(ph, pl, m0, m1) \
1446 do { \
1447 USItype __m0 = (m0), __m1 = (m1); \
1448 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1449 (pl) = __m0 * __m1; \
1450 } while (0)
1451#endif
1452#define smul_ppmm(ph, pl, m0, m1) \
1453 do { \
1454 SItype __m0 = (m0), __m1 = (m1); \
1455 __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1456 (pl) = __m0 * __m1; \
1457 } while (0)
1458#else
1459#define smul_ppmm(xh, xl, m0, m1) \
1460 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1461#define sdiv_qrnnd(q, r, nh, nl, d) \
1462 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1463#endif
1464#endif /* 32-bit POWER architecture variants. */
1465
1466/* We should test _IBMR2 here when we add assembly support for the system
1467 vendor compilers. */
1468#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1469#if !defined (_LONG_LONG_LIMB)
1470/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So
1471 use adde etc only when not _LONG_LONG_LIMB. */
1472#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1473 do { \
1474 if (__builtin_constant_p (bh) && (bh) == 0) \
1475 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
1476 : "=r" (sh), "=&r" (sl) \
1477 : "r" ((UDItype)(ah)), \
1478 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
1479 __CLOBBER_CC); \
1480 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1481 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
1482 : "=r" (sh), "=&r" (sl) \
1483 : "r" ((UDItype)(ah)), \
1484 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
1485 __CLOBBER_CC); \
1486 else \
1487 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
1488 : "=r" (sh), "=&r" (sl) \
1489 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1490 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
1491 __CLOBBER_CC); \
1492 } while (0)
1493/* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1494 This might seem strange, but gcc folds away the dead code late. */
1495#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1496 do { \
1497 if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) { \
1498 if (__builtin_constant_p (ah) && (ah) == 0) \
1499 __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2" \
1500 : "=r" (sh), "=&r" (sl) \
1501 : "r" ((UDItype)(bh)), \
1502 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1503 __CLOBBER_CC); \
1504 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1505 __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2" \
1506 : "=r" (sh), "=&r" (sl) \
1507 : "r" ((UDItype)(bh)), \
1508 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1509 __CLOBBER_CC); \
1510 else if (__builtin_constant_p (bh) && (bh) == 0) \
1511 __asm__ ("addic %1,%3,%4\n\taddme %0,%2" \
1512 : "=r" (sh), "=&r" (sl) \
1513 : "r" ((UDItype)(ah)), \
1514 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1515 __CLOBBER_CC); \
1516 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1517 __asm__ ("addic %1,%3,%4\n\taddze %0,%2" \
1518 : "=r" (sh), "=&r" (sl) \
1519 : "r" ((UDItype)(ah)), \
1520 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1521 __CLOBBER_CC); \
1522 else \
1523 __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2" \
1524 : "=r" (sh), "=&r" (sl) \
1525 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1526 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1527 __CLOBBER_CC); \
1528 } else { \
1529 if (__builtin_constant_p (ah) && (ah) == 0) \
1530 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
1531 : "=r" (sh), "=&r" (sl) \
1532 : "r" ((UDItype)(bh)), \
1533 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1534 __CLOBBER_CC); \
1535 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1536 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
1537 : "=r" (sh), "=&r" (sl) \
1538 : "r" ((UDItype)(bh)), \
1539 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1540 __CLOBBER_CC); \
1541 else if (__builtin_constant_p (bh) && (bh) == 0) \
1542 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
1543 : "=r" (sh), "=&r" (sl) \
1544 : "r" ((UDItype)(ah)), \
1545 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1546 __CLOBBER_CC); \
1547 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1548 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
1549 : "=r" (sh), "=&r" (sl) \
1550 : "r" ((UDItype)(ah)), \
1551 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1552 __CLOBBER_CC); \
1553 else \
1554 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
1555 : "=r" (sh), "=&r" (sl) \
1556 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1557 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1558 __CLOBBER_CC); \
1559 } \
1560 } while (0)
1561#endif /* ! _LONG_LONG_LIMB */
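/* For reference, the special cases in add_ssaaaa above come from
   single-instruction identities for the high word once the low-word carry is
   in CA:

     bh == 0:              sh = ah + 0  + CA               -> addze
     bh == ~(UDItype) 0:   sh = ah + ~0 + CA = ah - 1 + CA -> addme

   so when __builtin_constant_p shows that bh has one of these values, the
   three-operand adde is unnecessary.  The sub_ddmmss cases follow
   analogously, with CA holding the borrow.  */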
1562#define count_leading_zeros(count, x) \
1563 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1564#define COUNT_LEADING_ZEROS_0 64
1565#if __GMP_GNUC_PREREQ (4,8)
1566#define umul_ppmm(w1, w0, u, v) \
1567 do { \
1568 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1569 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1570 w1 = __ll >> 64; \
1571 w0 = __ll; \
1572 } while (0)
1573#endif
1574#if !defined (umul_ppmm)
1575#define umul_ppmm(ph, pl, m0, m1) \
1576 do { \
1577 UDItype __m0 = (m0), __m1 = (m1); \
1578 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1579 (pl) = __m0 * __m1; \
1580 } while (0)
1581#endif
1582#define smul_ppmm(ph, pl, m0, m1) \
1583 do { \
1584 DItype __m0 = (m0), __m1 = (m1); \
1585 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1586 (pl) = __m0 * __m1; \
1587 } while (0)
1588#endif /* 64-bit PowerPC. */
1589
1590#if defined (__pyr__) && W_TYPE_SIZE == 32
1591#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1592 __asm__ ("addw %5,%1\n\taddwc %3,%0" \
1593 : "=r" (sh), "=&r" (sl) \
1594 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1595 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1596#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1597 __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
1598 : "=r" (sh), "=&r" (sl) \
1599 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1600 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1601/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
1602#define umul_ppmm(w1, w0, u, v) \
1603 ({union {UDItype __ll; \
1604 struct {USItype __h, __l;} __i; \
1605 } __x; \
1606 __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
1607 : "=&r" (__x.__ll) \
1608 : "g" ((USItype) (u)), "g" ((USItype)(v))); \
1609 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1610#endif /* __pyr__ */
1611
1612#if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
1613#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1614 __asm__ ("a %1,%5\n\tae %0,%3" \
1615 : "=r" (sh), "=&r" (sl) \
1616 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1617 "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1618#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1619 __asm__ ("s %1,%5\n\tse %0,%3" \
1620 : "=r" (sh), "=&r" (sl) \
1621 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1622 "1" ((USItype)(al)), "r" ((USItype)(bl)))
1623#define smul_ppmm(ph, pl, m0, m1) \
1624 __asm__ ( \
1625 "s r2,r2\n" \
1626" mts r10,%2\n" \
1627" m r2,%3\n" \
1628" m r2,%3\n" \
1629" m r2,%3\n" \
1630" m r2,%3\n" \
1631" m r2,%3\n" \
1632" m r2,%3\n" \
1633" m r2,%3\n" \
1634" m r2,%3\n" \
1635" m r2,%3\n" \
1636" m r2,%3\n" \
1637" m r2,%3\n" \
1638" m r2,%3\n" \
1639" m r2,%3\n" \
1640" m r2,%3\n" \
1641" m r2,%3\n" \
1642" m r2,%3\n" \
1643" cas %0,r2,r0\n" \
1644" mfs r10,%1" \
1645 : "=r" (ph), "=r" (pl) \
1646 : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
1647 : "r2")
1648#define count_leading_zeros(count, x) \
1649 do { \
1650 if ((x) >= 0x10000) \
1651 __asm__ ("clz %0,%1" \
1652 : "=r" (count) : "r" ((USItype)(x) >> 16)); \
1653 else \
1654 { \
1655 __asm__ ("clz %0,%1" \
1656 : "=r" (count) : "r" ((USItype)(x))); \
1657 (count) += 16; \
1658 } \
1659 } while (0)
1660#endif /* RT/ROMP */
1661
1662#if defined (__riscv64) && W_TYPE_SIZE == 64
1663#define umul_ppmm(ph, pl, u, v) \
1664 do { \
1665 UDItype __u = (u), __v = (v); \
1666 (pl) = __u * __v; \
    __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v)); \
1668 } while (0)
1669#endif
1670
1671#if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1672#define umul_ppmm(w1, w0, u, v) \
1673 __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
1674 : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1675#endif
1676
1677#if defined (__sparc__) && W_TYPE_SIZE == 32
1678#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1679 __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
1680 : "=r" (sh), "=&r" (sl) \
1681 : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
1682 __CLOBBER_CC)
1683#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1684 __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
1685 : "=r" (sh), "=&r" (sl) \
1686 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1687 __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us; it only sets __sparcv8.  */
1690#if defined (__sparc_v9__) || defined (__sparcv9)
1691/* Perhaps we should use floating-point operations here? */
1692#if 0
/* Triggers a bug that makes mpz/tests/t-gcd.c fail.
   Perhaps we simply need to zero-extend the inputs explicitly?  */
1695#define umul_ppmm(w1, w0, u, v) \
1696 __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
1697 "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1698#else
/* Use the v8 umul until the above bug is fixed.  */
1700#define umul_ppmm(w1, w0, u, v) \
1701 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1702#endif
1703/* Use a plain v8 divide for v9. */
1704#define udiv_qrnnd(q, r, n1, n0, d) \
1705 do { \
1706 USItype __q; \
1707 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1708 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1709 (r) = (n0) - __q * (d); \
1710 (q) = __q; \
1711 } while (0)
1712#else
1713#if defined (__sparc_v8__) /* gcc normal */ \
1714 || defined (__sparcv8) /* gcc solaris */ \
1715 || HAVE_HOST_CPU_supersparc
/* Don't match an immediate range because (1) it is not often useful, and
   (2) the 'I' constraint treats the range as a 13-bit signed interval,
   while we want to match a 13-bit interval, sign-extended to 32 bits
   but INTERPRETED AS UNSIGNED.  */
1720#define umul_ppmm(w1, w0, u, v) \
1721 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1722
#if ! HAVE_HOST_CPU_supersparc
1725/* Don't use this on SuperSPARC because its udiv only handles 53 bit
1726 dividends and will trap to the kernel for the rest. */
1727#define udiv_qrnnd(q, r, n1, n0, d) \
1728 do { \
1729 USItype __q; \
1730 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1731 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1732 (r) = (n0) - __q * (d); \
1733 (q) = __q; \
1734 } while (0)
1735#endif /* HAVE_HOST_CPU_supersparc */
1736
1737#else /* ! __sparc_v8__ */
1738#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions, scan (ffs from the high bit) and divscc.  */
1741#define umul_ppmm(w1, w0, u, v) \
1742 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1743#define udiv_qrnnd(q, r, n1, n0, d) \
1744 __asm__ ("! Inlined udiv_qrnnd\n" \
1745" wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
1746" tst %%g0\n" \
1747" divscc %3,%4,%%g1\n" \
1748" divscc %%g1,%4,%%g1\n" \
1749" divscc %%g1,%4,%%g1\n" \
1750" divscc %%g1,%4,%%g1\n" \
1751" divscc %%g1,%4,%%g1\n" \
1752" divscc %%g1,%4,%%g1\n" \
1753" divscc %%g1,%4,%%g1\n" \
1754" divscc %%g1,%4,%%g1\n" \
1755" divscc %%g1,%4,%%g1\n" \
1756" divscc %%g1,%4,%%g1\n" \
1757" divscc %%g1,%4,%%g1\n" \
1758" divscc %%g1,%4,%%g1\n" \
1759" divscc %%g1,%4,%%g1\n" \
1760" divscc %%g1,%4,%%g1\n" \
1761" divscc %%g1,%4,%%g1\n" \
1762" divscc %%g1,%4,%%g1\n" \
1763" divscc %%g1,%4,%%g1\n" \
1764" divscc %%g1,%4,%%g1\n" \
1765" divscc %%g1,%4,%%g1\n" \
1766" divscc %%g1,%4,%%g1\n" \
1767" divscc %%g1,%4,%%g1\n" \
1768" divscc %%g1,%4,%%g1\n" \
1769" divscc %%g1,%4,%%g1\n" \
1770" divscc %%g1,%4,%%g1\n" \
1771" divscc %%g1,%4,%%g1\n" \
1772" divscc %%g1,%4,%%g1\n" \
1773" divscc %%g1,%4,%%g1\n" \
1774" divscc %%g1,%4,%%g1\n" \
1775" divscc %%g1,%4,%%g1\n" \
1776" divscc %%g1,%4,%%g1\n" \
1777" divscc %%g1,%4,%%g1\n" \
1778" divscc %%g1,%4,%0\n" \
1779" rd %%y,%1\n" \
1780" bl,a 1f\n" \
1781" add %1,%4,%1\n" \
1782"1: ! End of inline udiv_qrnnd" \
1783 : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
1784 : "%g1" __AND_CLOBBER_CC)
1785#define count_leading_zeros(count, x) \
1786 __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but the documentation
   warns that future implementations might change this.  Therefore, leave
   COUNT_LEADING_ZEROS_0 undefined.  */
1790#endif /* __sparclite__ */
1791#endif /* __sparc_v8__ */
1792#endif /* __sparc_v9__ */
1793/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
1794#ifndef umul_ppmm
1795#define umul_ppmm(w1, w0, u, v) \
1796 __asm__ ("! Inlined umul_ppmm\n" \
1797" wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
1798" sra %3,31,%%g2 ! Don't move this insn\n" \
1799" and %2,%%g2,%%g2 ! Don't move this insn\n" \
1800" andcc %%g0,0,%%g1 ! Don't move this insn\n" \
1801" mulscc %%g1,%3,%%g1\n" \
1802" mulscc %%g1,%3,%%g1\n" \
1803" mulscc %%g1,%3,%%g1\n" \
1804" mulscc %%g1,%3,%%g1\n" \
1805" mulscc %%g1,%3,%%g1\n" \
1806" mulscc %%g1,%3,%%g1\n" \
1807" mulscc %%g1,%3,%%g1\n" \
1808" mulscc %%g1,%3,%%g1\n" \
1809" mulscc %%g1,%3,%%g1\n" \
1810" mulscc %%g1,%3,%%g1\n" \
1811" mulscc %%g1,%3,%%g1\n" \
1812" mulscc %%g1,%3,%%g1\n" \
1813" mulscc %%g1,%3,%%g1\n" \
1814" mulscc %%g1,%3,%%g1\n" \
1815" mulscc %%g1,%3,%%g1\n" \
1816" mulscc %%g1,%3,%%g1\n" \
1817" mulscc %%g1,%3,%%g1\n" \
1818" mulscc %%g1,%3,%%g1\n" \
1819" mulscc %%g1,%3,%%g1\n" \
1820" mulscc %%g1,%3,%%g1\n" \
1821" mulscc %%g1,%3,%%g1\n" \
1822" mulscc %%g1,%3,%%g1\n" \
1823" mulscc %%g1,%3,%%g1\n" \
1824" mulscc %%g1,%3,%%g1\n" \
1825" mulscc %%g1,%3,%%g1\n" \
1826" mulscc %%g1,%3,%%g1\n" \
1827" mulscc %%g1,%3,%%g1\n" \
1828" mulscc %%g1,%3,%%g1\n" \
1829" mulscc %%g1,%3,%%g1\n" \
1830" mulscc %%g1,%3,%%g1\n" \
1831" mulscc %%g1,%3,%%g1\n" \
1832" mulscc %%g1,%3,%%g1\n" \
1833" mulscc %%g1,0,%%g1\n" \
1834" add %%g1,%%g2,%0\n" \
1835" rd %%y,%1" \
1836 : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
1837 : "%g1", "%g2" __AND_CLOBBER_CC)
1838#endif
1839#ifndef udiv_qrnnd
1840#ifndef LONGLONG_STANDALONE
1841#define udiv_qrnnd(q, r, n1, n0, d) \
1842 do { UWtype __r; \
1843 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
1844 (r) = __r; \
1845 } while (0)
1846extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1847#endif /* LONGLONG_STANDALONE */
1848#endif /* udiv_qrnnd */
1849#endif /* __sparc__ */
1850
1851#if defined (__sparc__) && W_TYPE_SIZE == 64
1852#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1853 __asm__ ( \
1854 "addcc %r4,%5,%1\n" \
1855 " addccc %r6,%7,%%g0\n" \
1856 " addc %r2,%3,%0" \
1857 : "=r" (sh), "=&r" (sl) \
1858 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
1859 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
1860 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
1861 __CLOBBER_CC)
1862#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1863 __asm__ ( \
1864 "subcc %r4,%5,%1\n" \
1865 " subccc %r6,%7,%%g0\n" \
1866 " subc %r2,%3,%0" \
1867 : "=r" (sh), "=&r" (sl) \
1868 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
1869 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
1870 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
1871 __CLOBBER_CC)
1872#if __VIS__ >= 0x300
1873#undef add_ssaaaa
1874#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1875 __asm__ ( \
1876 "addcc %r4, %5, %1\n" \
1877 " addxc %r2, %r3, %0" \
1878 : "=r" (sh), "=&r" (sl) \
1879 : "rJ" ((UDItype)(ah)), "rJ" ((UDItype)(bh)), \
1880 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
1881#define umul_ppmm(ph, pl, m0, m1) \
1882 do { \
1883 UDItype __m0 = (m0), __m1 = (m1); \
1884 (pl) = __m0 * __m1; \
1885 __asm__ ("umulxhi\t%2, %1, %0" \
1886 : "=r" (ph) \
1887 : "%r" (__m0), "r" (__m1)); \
1888 } while (0)
1889#define count_leading_zeros(count, x) \
1890 __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1891/* Needed by count_leading_zeros_32 in sparc64.h. */
1892#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1893#endif
1894#endif
1895
1896#if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
1897#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1898 __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
1899 : "=g" (sh), "=&g" (sl) \
1900 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1901 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1902#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1903 __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
1904 : "=g" (sh), "=&g" (sl) \
1905 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1906 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1907#define smul_ppmm(xh, xl, m0, m1) \
1908 do { \
1909 union {UDItype __ll; \
1910 struct {USItype __l, __h;} __i; \
1911 } __x; \
1912 USItype __m0 = (m0), __m1 = (m1); \
1913 __asm__ ("emul %1,%2,$0,%0" \
1914 : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
1915 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1916 } while (0)
1917#define sdiv_qrnnd(q, r, n1, n0, d) \
1918 do { \
1919 union {DItype __ll; \
1920 struct {SItype __l, __h;} __i; \
1921 } __x; \
1922 __x.__i.__h = n1; __x.__i.__l = n0; \
1923 __asm__ ("ediv %3,%2,%0,%1" \
1924 : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
1925 } while (0)
1926#if 0
1927/* FIXME: This instruction appears to be unimplemented on some systems (vax
1928 8800 maybe). */
1929#define count_trailing_zeros(count,x) \
1930 do { \
1931 __asm__ ("ffs 0, 31, %1, %0" \
1932 : "=g" (count) \
1933 : "g" ((USItype) (x))); \
1934 } while (0)
1935#endif
1936#endif /* vax */
1937
1938#if defined (__z8000__) && W_TYPE_SIZE == 16
1939#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1940 __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
1941 : "=r" (sh), "=&r" (sl) \
1942 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1943 "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1944#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1945 __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
1946 : "=r" (sh), "=&r" (sl) \
1947 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1948 "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1949#define umul_ppmm(xh, xl, m0, m1) \
1950 do { \
1951 union {long int __ll; \
1952 struct {unsigned int __h, __l;} __i; \
1953 } __x; \
1954 unsigned int __m0 = (m0), __m1 = (m1); \
1955 __asm__ ("mult %S0,%H3" \
1956 : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
1957 : "%1" (m0), "rQR" (m1)); \
1958 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1959 (xh) += ((((signed int) __m0 >> 15) & __m1) \
1960 + (((signed int) __m1 >> 15) & __m0)); \
1961 } while (0)
1962#endif /* __z8000__ */
1963
1964#endif /* __GNUC__ */
1965
1966#endif /* NO_ASM */
1967
1968
/* FIXME: "sidi" here is highly doubtful; it should sometimes be "diti".  */
1970#if !defined (umul_ppmm) && defined (__umulsidi3)
1971#define umul_ppmm(ph, pl, m0, m1) \
1972 do { \
1973 UDWtype __ll = __umulsidi3 (m0, m1); \
1974 ph = (UWtype) (__ll >> W_TYPE_SIZE); \
1975 pl = (UWtype) __ll; \
1976 } while (0)
1977#endif
1978
1979#if !defined (__umulsidi3)
1980#define __umulsidi3(u, v) \
1981 ({UWtype __hi, __lo; \
1982 umul_ppmm (__hi, __lo, u, v); \
1983 ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1984#endif
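/* For illustration only (hypothetical helper, not part of the GMP interface):
   __umulsidi3 and umul_ppmm are two views of the same full
   W_TYPE_SIZE x W_TYPE_SIZE -> 2*W_TYPE_SIZE multiplication, so a caller can
   split the UDWtype result back into two words like this.  Kept in "#if 0"
   so that it cannot affect users of this header.  */
#if 0
static void
__longlong_h_example_umulsidi3 (UWtype u, UWtype v, UWtype *hi, UWtype *lo)
{
  UDWtype __p = __umulsidi3 (u, v);     /* full double-word product */
  *hi = (UWtype) (__p >> W_TYPE_SIZE);  /* high word */
  *lo = (UWtype) __p;                   /* low word */
}
#endif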
1985
1986
1987#if defined (__cplusplus)
1988#define __longlong_h_C "C"
1989#else
1990#define __longlong_h_C
1991#endif
1992
1993/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist. The "_r"
1994 forms have "reversed" arguments, meaning the pointer is last, which
1995 sometimes allows better parameter passing, in particular on 64-bit
1996 hppa. */
1997
1998#define mpn_umul_ppmm __MPN(umul_ppmm)
1999extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
2000
2001#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
2002 && ! defined (LONGLONG_STANDALONE)
2003#define umul_ppmm(wh, wl, u, v) \
2004 do { \
2005 UWtype __umul_ppmm__p0; \
2006 (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
2007 (wl) = __umul_ppmm__p0; \
2008 } while (0)
2009#endif
2010
2011#define mpn_umul_ppmm_r __MPN(umul_ppmm_r)
2012extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
2013
2014#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
2015 && ! defined (LONGLONG_STANDALONE)
2016#define umul_ppmm(wh, wl, u, v) \
2017 do { \
2018 UWtype __umul_p0; \
2019 (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0); \
2020 (wl) = __umul_p0; \
2021 } while (0)
2022#endif
2023
2024#define mpn_udiv_qrnnd __MPN(udiv_qrnnd)
2025extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
2026
2027#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
2028 && ! defined (LONGLONG_STANDALONE)
2029#define udiv_qrnnd(q, r, n1, n0, d) \
2030 do { \
2031 UWtype __udiv_qrnnd_r; \
2032 (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r, \
                          (UWtype) (n1), (UWtype) (n0), (UWtype) (d)); \
2034 (r) = __udiv_qrnnd_r; \
2035 } while (0)
2036#endif
2037
2038#define mpn_udiv_qrnnd_r __MPN(udiv_qrnnd_r)
2039extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
2040
2041#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
2042 && ! defined (LONGLONG_STANDALONE)
2043#define udiv_qrnnd(q, r, n1, n0, d) \
2044 do { \
2045 UWtype __udiv_qrnnd_r; \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d), \
2047 &__udiv_qrnnd_r); \
2048 (r) = __udiv_qrnnd_r; \
2049 } while (0)
2050#endif
2051
2052
2053/* If this machine has no inline assembler, use C macros. */
2054
2055#if !defined (add_ssaaaa)
2056#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
2057 do { \
2058 UWtype __x; \
2059 __x = (al) + (bl); \
2060 (sh) = (ah) + (bh) + (__x < (al)); \
2061 (sl) = __x; \
2062 } while (0)
2063#endif
2064
2065#if !defined (sub_ddmmss)
2066#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
2067 do { \
2068 UWtype __x; \
2069 __x = (al) - (bl); \
2070 (sh) = (ah) - (bh) - ((al) < (bl)); \
2071 (sl) = __x; \
2072 } while (0)
2073#endif
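/* For illustration only (hypothetical helper, not part of the GMP interface):
   a typical use of the two macros above is double-word arithmetic on numbers
   stored as {high, low} UWtype pairs.  Kept in "#if 0" so that it cannot
   affect users of this header.  */
#if 0
static void
__longlong_h_example_double_word (UWtype ah, UWtype al, UWtype bh, UWtype bl,
                                  UWtype *sh, UWtype *sl,
                                  UWtype *dh, UWtype *dl)
{
  /* {*sh,*sl} = {ah,al} + {bh,bl} and {*dh,*dl} = {ah,al} - {bh,bl},
     both modulo 2^(2*W_TYPE_SIZE); the carry/borrow out is discarded.  */
  add_ssaaaa (*sh, *sl, ah, al, bh, bl);
  sub_ddmmss (*dh, *dl, ah, al, bh, bl);
}
#endif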
2074
2075/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
2076 smul_ppmm. */
2077#if !defined (umul_ppmm) && defined (smul_ppmm)
2078#define umul_ppmm(w1, w0, u, v) \
2079 do { \
2080 UWtype __w1; \
2081 UWtype __xm0 = (u), __xm1 = (v); \
2082 smul_ppmm (__w1, w0, __xm0, __xm1); \
2083 (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2084 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2085 } while (0)
2086#endif
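/* For reference, the correction above comes from the identity relating the
   unsigned and signed readings of the same W-bit words.  With W = W_TYPE_SIZE
   and mu, mv the top bits of u and v, the unsigned values are su + mu*2^W and
   sv + mv*2^W, where su, sv are the signed values, so modulo 2^(2W)

     u * v = su*sv + (mu*v + mv*u) * 2^W

   i.e. the unsigned high word is the signed high word plus mu*v + mv*u
   (mod 2^W).  The expression -(x >> (W-1)) & y is just "top bit of x times y"
   in branch-free form.  A 4-bit check: u = 0xF (-1 signed), v = 0x3 gives
   unsigned product 0x2D and signed product -3 = 0xFD; the high words 0x2 and
   0xF indeed differ by v = 0x3 (mod 0x10).  */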
2087
2088/* If we still don't have umul_ppmm, define it using plain C.
2089
   For reference, when this code is used for squaring (i.e. u and v are
   identical expressions), gcc recognises that __x1 and __x2 are the same and
   generates 3 multiplies, not 4.  The subsequent additions could be optimized
   a bit, but the only place GMP currently uses such a square is
   mpn_sqr_basecase, and chips obliged to use this generic C umul will have
   plenty of worse performance problems than a couple of extra instructions on
   the diagonal of sqr_basecase.  */
2097
2098#if !defined (umul_ppmm)
2099#define umul_ppmm(w1, w0, u, v) \
2100 do { \
2101 UWtype __x0, __x1, __x2, __x3; \
2102 UHWtype __ul, __vl, __uh, __vh; \
2103 UWtype __u = (u), __v = (v); \
2104 \
2105 __ul = __ll_lowpart (__u); \
2106 __uh = __ll_highpart (__u); \
2107 __vl = __ll_lowpart (__v); \
2108 __vh = __ll_highpart (__v); \
2109 \
2110 __x0 = (UWtype) __ul * __vl; \
2111 __x1 = (UWtype) __ul * __vh; \
2112 __x2 = (UWtype) __uh * __vl; \
2113 __x3 = (UWtype) __uh * __vh; \
2114 \
2115 __x1 += __ll_highpart (__x0);/* this can't give carry */ \
2116 __x1 += __x2; /* but this indeed can */ \
2117 if (__x1 < __x2) /* did we get it? */ \
2118 __x3 += __ll_B; /* yes, add it in the proper pos. */ \
2119 \
2120 (w1) = __x3 + __ll_highpart (__x1); \
2121 (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
2122 } while (0)
2123#endif
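/* For reference, a worked 32-bit example of the macro above, with
   B = __ll_B = 2^16 and u = v = 0xFFFFFFFF:

     __x0 = __x1 = __x2 = __x3 = 0xFFFF * 0xFFFF = 0xFFFE0001
     __x1 += __ll_highpart (__x0) = 0xFFFE   ->  __x1 = 0xFFFEFFFF
     __x1 += __x2                            ->  wraps to 0xFFFD0000 < __x2,
                                                 so __x3 += __ll_B = 0xFFFF0001
     w1 = __x3 + __ll_highpart (__x1) = 0xFFFF0001 + 0xFFFD = 0xFFFFFFFE
     w0 = (__x1 << 16) + __ll_lowpart (__x0) = 0x00000000 + 1 = 0x00000001

   which is the correct product 0xFFFFFFFE00000001.  The wraparound in __x1
   loses 2^32 in __x1's units (units of B), which is __ll_B in __x3's units
   (units of B^2), which is why the carry branch adds __ll_B to __x3.  */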
2124
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
2127#if !defined (smul_ppmm)
2128#define smul_ppmm(w1, w0, u, v) \
2129 do { \
2130 UWtype __w1; \
2131 UWtype __xm0 = (u), __xm1 = (v); \
2132 umul_ppmm (__w1, w0, __xm0, __xm1); \
2133 (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2134 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2135 } while (0)
2136#endif
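/* For reference, this is the identity used for the umul_ppmm-from-smul_ppmm
   case above, run in the other direction: the signed high word is the
   unsigned high word minus (mu*v + mv*u) (mod 2^W), with mu, mv the top bits
   of u and v.  */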
2137
2138/* Define this unconditionally, so it can be used for debugging. */
2139#define __udiv_qrnnd_c(q, r, n1, n0, d) \
2140 do { \
2141 UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
2142 \
2143 ASSERT ((d) != 0); \
2144 ASSERT ((n1) < (d)); \
2145 \
2146 __d1 = __ll_highpart (d); \
2147 __d0 = __ll_lowpart (d); \
2148 \
2149 __q1 = (n1) / __d1; \
2150 __r1 = (n1) - __q1 * __d1; \
2151 __m = __q1 * __d0; \
2152 __r1 = __r1 * __ll_B | __ll_highpart (n0); \
2153 if (__r1 < __m) \
2154 { \
2155 __q1--, __r1 += (d); \
2156 if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2157 if (__r1 < __m) \
2158 __q1--, __r1 += (d); \
2159 } \
2160 __r1 -= __m; \
2161 \
2162 __q0 = __r1 / __d1; \
2163 __r0 = __r1 - __q0 * __d1; \
2164 __m = __q0 * __d0; \
2165 __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
2166 if (__r0 < __m) \
2167 { \
2168 __q0--, __r0 += (d); \
2169 if (__r0 >= (d)) \
2170 if (__r0 < __m) \
2171 __q0--, __r0 += (d); \
2172 } \
2173 __r0 -= __m; \
2174 \
2175 (q) = __q1 * __ll_B | __q0; \
2176 (r) = __r0; \
2177 } while (0)
2178
/* If the processor has no udiv_qrnnd but does have sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
2181#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
2182 && ! defined (LONGLONG_STANDALONE)
2183#define udiv_qrnnd(q, r, nh, nl, d) \
2184 do { \
2185 UWtype __r; \
2186 (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
2187 (r) = __r; \
2188 } while (0)
2189__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2190#endif
2191
2192/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
2193#if !defined (udiv_qrnnd)
2194#define UDIV_NEEDS_NORMALIZATION 1
2195#define udiv_qrnnd __udiv_qrnnd_c
2196#endif
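/* For illustration only (hypothetical helper, not part of the GMP interface):
   when UDIV_NEEDS_NORMALIZATION is 1, as it is for the __udiv_qrnnd_c
   fallback above, a caller shifts the divisor until its most significant bit
   is set, shifts the two-word numerator by the same amount, and shifts the
   remainder back afterwards; the quotient needs no adjustment.  It assumes
   d != 0 and n1 < d, as udiv_qrnnd itself does, and is kept in "#if 0" so
   that it cannot affect users of this header.  */
#if 0
static void
__longlong_h_example_udiv (UWtype *qp, UWtype *rp,
                           UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r;
  int cnt = 0;

  if (UDIV_NEEDS_NORMALIZATION)
    {
      count_leading_zeros (cnt, d);
      if (cnt != 0)
        {
          n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
          n0 <<= cnt;
          d <<= cnt;
        }
    }
  udiv_qrnnd (q, r, n1, n0, d);
  *qp = q;
  *rp = r >> cnt;               /* undo the normalization on the remainder */
}
#endif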
2197
2198#if !defined (count_leading_zeros)
2199#define count_leading_zeros(count, x) \
2200 do { \
2201 UWtype __xr = (x); \
2202 UWtype __a; \
2203 \
2204 if (W_TYPE_SIZE == 32) \
2205 { \
2206 __a = __xr < ((UWtype) 1 << 2*__BITS4) \
2207 ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
2208 : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
2209 : 3*__BITS4 + 1); \
2210 } \
2211 else \
2212 { \
2213 for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
2214 if (((__xr >> __a) & 0xff) != 0) \
2215 break; \
2216 ++__a; \
2217 } \
2218 \
2219 (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
2220 } while (0)
2221/* This version gives a well-defined value for zero. */
2222#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2223#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2224#define COUNT_LEADING_ZEROS_SLOW
2225#endif
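/* For reference, a worked example of the generic count_leading_zeros above,
   with W_TYPE_SIZE == 64 and x = 0x12345 (highest set bit 16, so the answer
   should be 63 - 16 = 47): the byte loop stops at __a = 16, where
   (x >> 16) & 0xff = 0x01, then ++__a makes __a = 17, x >> 17 = 0, and with
   __clz_tab[0] = 1 the formula gives 64 + 1 - 17 - 1 = 47.  Each entry
   __clz_tab[i] is one more than the bit length of i, which is what makes the
   formula come out right for every position of the leading byte.  */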
2226
2227/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2228#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2229#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2230#endif
2231
2232#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2233extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
2234#endif
2235
2236#if !defined (count_trailing_zeros)
2237#if !defined (COUNT_LEADING_ZEROS_SLOW)
2238/* Define count_trailing_zeros using an asm count_leading_zeros. */
2239#define count_trailing_zeros(count, x) \
2240 do { \
2241 UWtype __ctz_x = (x); \
2242 UWtype __ctz_c; \
2243 ASSERT (__ctz_x != 0); \
2244 count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
2245 (count) = W_TYPE_SIZE - 1 - __ctz_c; \
2246 } while (0)
2247#else
/* Define count_trailing_zeros in plain C, assuming small counts are common.
   We use __clz_tab directly, since the C count_leading_zeros above will have
   pulled it in.  */
2251#define count_trailing_zeros(count, x) \
2252 do { \
2253 UWtype __ctz_x = (x); \
2254 int __ctz_c; \
2255 \
2256 if (LIKELY ((__ctz_x & 0xff) != 0)) \
2257 (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \
2258 else \
2259 { \
2260 for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \
2261 { \
2262 __ctz_x >>= 8; \
2263 if (LIKELY ((__ctz_x & 0xff) != 0)) \
2264 break; \
2265 } \
2266 \
2267 (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \
2268 } \
2269 } while (0)
2270#endif
2271#endif
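/* For reference, both variants above rely on x & -x isolating the lowest set
   bit: if that bit is bit k, then x & -x = 2^k, so

     count_leading_zeros gives W_TYPE_SIZE - 1 - k, hence the first form, and
     __clz_tab[2^k] = k + 2 for k <= 7, hence the "- 2" in the second form.

   The second form then walks up 8 bits at a time until a nonzero byte is
   found, adding 8 to the eventual count for each skipped byte.  */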
2272
2273#ifndef UDIV_NEEDS_NORMALIZATION
2274#define UDIV_NEEDS_NORMALIZATION 0
2275#endif
2276
/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   hence whether the latter should always be used.  */
2279#ifndef UDIV_PREINV_ALWAYS
2280#define UDIV_PREINV_ALWAYS 0
2281#endif