Add libgmp 6.2.0 to third_party

Don't build it yet.  That will come in the next review.

Change-Id: Idf3266558165e5ab45f4a41c98cc8c838c8244d5
diff --git a/third_party/gmp/tune/speed.h b/third_party/gmp/tune/speed.h
new file mode 100644
index 0000000..f23bacb
--- /dev/null
+++ b/third_party/gmp/tune/speed.h
@@ -0,0 +1,3800 @@
+/* Header for speed and threshold things.
+
+Copyright 1999-2003, 2005, 2006, 2008-2017, 2019 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#ifndef __SPEED_H__
+#define __SPEED_H__
+
+
+/* Pad ptr,oldsize with zero limbs (at the most significant end) to make it
+   newsize long. */
+#define MPN_ZERO_EXTEND(ptr, oldsize, newsize)		\
+  do {							\
+    ASSERT ((newsize) >= (oldsize));			\
+    MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize));	\
+  } while (0)
+
+/* A mask of the least significant n bits.  Note 1<<32 doesn't give zero on
+   x86 family CPUs, hence the separate case for GMP_LIMB_BITS. */
+#define MP_LIMB_T_LOWBITMASK(n)	\
+  ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)
+
+
+/* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */
+
+#define TMP_ALLOC_ALIGNED(bytes, align)	\
+  align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
+#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align)	\
+  ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
+
+/* CACHE_LINE_SIZE is our default alignment for speed operands, and the
+   limit on what s->align_xp etc and then request for off-alignment.  Maybe
+   this should be an option of some sort, but in any case here are some line
+   sizes,
+
+       bytes
+	 32   pentium
+	 64   athlon
+	 64   itanium-2 L1
+	128   itanium-2 L2
+*/
+#define CACHE_LINE_SIZE   64 /* bytes */
+
+#define SPEED_TMP_ALLOC_ADJUST_MASK  (CACHE_LINE_SIZE/GMP_LIMB_BYTES - 1)
+
+/* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb
+   alignment.  */
+#define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align)			\
+  do {									\
+    mp_ptr     __ptr;							\
+    mp_size_t  __ptr_align, __ptr_add;					\
+									\
+    ASSERT ((CACHE_LINE_SIZE % GMP_LIMB_BYTES) == 0);		\
+    __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK);	\
+    __ptr_align = (__ptr - (mp_ptr) NULL);				\
+    __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK;	\
+    (ptr) = __ptr + __ptr_add;						\
+  } while (0)
+
+
+/* This is the size for s->xp_block and s->yp_block, used in certain
+   routines that want to run across many different data values and use
+   s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
+
+   512 means 2kbytes of data for each of xp_block and yp_block, making 4k
+   total, which should fit easily in any L1 data cache. */
+
+#define SPEED_BLOCK_SIZE   512 /* limbs */
+
+
+extern double  speed_unittime;
+extern double  speed_cycletime;
+extern int     speed_precision;
+extern char    speed_time_string[];
+void speed_time_init (void);
+void speed_cycletime_fail (const char *str);
+void speed_cycletime_init (void);
+void speed_cycletime_need_cycles (void);
+void speed_cycletime_need_seconds (void);
+void speed_starttime (void);
+double speed_endtime (void);
+
+
+struct speed_params {
+  unsigned   reps;	/* how many times to run the routine */
+  mp_ptr     xp;	/* first argument */
+  mp_ptr     yp;	/* second argument */
+  mp_size_t  size;	/* size of both arguments */
+  mp_limb_t  r;		/* user supplied parameter */
+  mp_size_t  align_xp;	/* alignment of xp */
+  mp_size_t  align_yp;	/* alignment of yp */
+  mp_size_t  align_wp;	/* intended alignment of wp */
+  mp_size_t  align_wp2; /* intended alignment of wp2 */
+  mp_ptr     xp_block;	/* first special SPEED_BLOCK_SIZE block */
+  mp_ptr     yp_block;	/* second special SPEED_BLOCK_SIZE block */
+
+  double     time_divisor; /* optionally set by the speed routine */
+
+  /* used by the cache priming things */
+  int	     cache;
+  unsigned   src_num, dst_num;
+  struct {
+    mp_ptr    ptr;
+    mp_size_t size;
+  } src[5], dst[4];
+};
+
+typedef double (*speed_function_t) (struct speed_params *);
+
+double speed_measure (speed_function_t fun, struct speed_params *);
+
+/* Prototypes for speed measuring routines */
+
+double speed_back_to_back (struct speed_params *);
+double speed_count_leading_zeros (struct speed_params *);
+double speed_count_trailing_zeros (struct speed_params *);
+double speed_find_a (struct speed_params *);
+double speed_gmp_allocate_free (struct speed_params *);
+double speed_gmp_allocate_reallocate_free (struct speed_params *);
+double speed_invert_limb (struct speed_params *);
+double speed_malloc_free (struct speed_params *);
+double speed_malloc_realloc_free (struct speed_params *);
+double speed_memcpy (struct speed_params *);
+double speed_binvert_limb (struct speed_params *);
+double speed_binvert_limb_mul1 (struct speed_params *);
+double speed_binvert_limb_loop (struct speed_params *);
+double speed_binvert_limb_cond (struct speed_params *);
+double speed_binvert_limb_arith (struct speed_params *);
+
+double speed_mpf_init_clear (struct speed_params *);
+
+double speed_mpn_add_n (struct speed_params *);
+double speed_mpn_add_1 (struct speed_params *);
+double speed_mpn_add_1_inplace (struct speed_params *);
+double speed_mpn_add_err1_n (struct speed_params *);
+double speed_mpn_add_err2_n (struct speed_params *);
+double speed_mpn_add_err3_n (struct speed_params *);
+double speed_mpn_addlsh_n (struct speed_params *);
+double speed_mpn_addlsh1_n (struct speed_params *);
+double speed_mpn_addlsh2_n (struct speed_params *);
+double speed_mpn_addlsh_n_ip1 (struct speed_params *);
+double speed_mpn_addlsh1_n_ip1 (struct speed_params *);
+double speed_mpn_addlsh2_n_ip1 (struct speed_params *);
+double speed_mpn_addlsh_n_ip2 (struct speed_params *);
+double speed_mpn_addlsh1_n_ip2 (struct speed_params *);
+double speed_mpn_addlsh2_n_ip2 (struct speed_params *);
+double speed_mpn_add_n_sub_n (struct speed_params *);
+double speed_mpn_and_n (struct speed_params *);
+double speed_mpn_andn_n (struct speed_params *);
+double speed_mpn_addmul_1 (struct speed_params *);
+double speed_mpn_addmul_2 (struct speed_params *);
+double speed_mpn_addmul_3 (struct speed_params *);
+double speed_mpn_addmul_4 (struct speed_params *);
+double speed_mpn_addmul_5 (struct speed_params *);
+double speed_mpn_addmul_6 (struct speed_params *);
+double speed_mpn_addmul_7 (struct speed_params *);
+double speed_mpn_addmul_8 (struct speed_params *);
+double speed_mpn_cnd_add_n (struct speed_params *);
+double speed_mpn_cnd_sub_n (struct speed_params *);
+double speed_mpn_com (struct speed_params *);
+double speed_mpn_neg (struct speed_params *);
+double speed_mpn_copyd (struct speed_params *);
+double speed_mpn_copyi (struct speed_params *);
+double speed_MPN_COPY (struct speed_params *);
+double speed_MPN_COPY_DECR (struct speed_params *);
+double speed_MPN_COPY_INCR (struct speed_params *);
+double speed_mpn_sec_tabselect (struct speed_params *);
+double speed_mpn_divexact_1 (struct speed_params *);
+double speed_mpn_divexact_by3 (struct speed_params *);
+double speed_mpn_bdiv_q_1 (struct speed_params *);
+double speed_mpn_pi1_bdiv_q_1 (struct speed_params *);
+double speed_mpn_bdiv_dbm1c (struct speed_params *);
+double speed_mpn_divrem_1 (struct speed_params *);
+double speed_mpn_divrem_1f (struct speed_params *);
+double speed_mpn_divrem_1c (struct speed_params *);
+double speed_mpn_divrem_1cf (struct speed_params *);
+double speed_mpn_divrem_1_div (struct speed_params *);
+double speed_mpn_divrem_1f_div (struct speed_params *);
+double speed_mpn_divrem_1_inv (struct speed_params *);
+double speed_mpn_divrem_1f_inv (struct speed_params *);
+double speed_mpn_divrem_2 (struct speed_params *);
+double speed_mpn_divrem_2_div (struct speed_params *);
+double speed_mpn_divrem_2_inv (struct speed_params *);
+double speed_mpn_div_qr_1n_pi1 (struct speed_params *);
+double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *);
+double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *);
+double speed_mpn_div_qr_1 (struct speed_params *);
+double speed_mpn_div_qr_2n (struct speed_params *);
+double speed_mpn_div_qr_2u (struct speed_params *);
+double speed_mpn_fib2_ui (struct speed_params *);
+double speed_mpn_matrix22_mul (struct speed_params *);
+double speed_mpn_hgcd2 (struct speed_params *);
+double speed_mpn_hgcd2_1 (struct speed_params *);
+double speed_mpn_hgcd2_2 (struct speed_params *);
+double speed_mpn_hgcd2_3 (struct speed_params *);
+double speed_mpn_hgcd2_4 (struct speed_params *);
+double speed_mpn_hgcd2_5 (struct speed_params *);
+double speed_mpn_hgcd (struct speed_params *);
+double speed_mpn_hgcd_lehmer (struct speed_params *);
+double speed_mpn_hgcd_appr (struct speed_params *);
+double speed_mpn_hgcd_appr_lehmer (struct speed_params *);
+double speed_mpn_hgcd_reduce (struct speed_params *);
+double speed_mpn_hgcd_reduce_1 (struct speed_params *);
+double speed_mpn_hgcd_reduce_2 (struct speed_params *);
+double speed_mpn_gcd (struct speed_params *);
+double speed_mpn_gcd_1 (struct speed_params *);
+double speed_mpn_gcd_11 (struct speed_params *);
+double speed_mpn_gcd_1N (struct speed_params *);
+double speed_mpn_gcd_22 (struct speed_params *);
+double speed_mpn_gcdext (struct speed_params *);
+double speed_mpn_gcdext_double (struct speed_params *);
+double speed_mpn_gcdext_one_double (struct speed_params *);
+double speed_mpn_gcdext_one_single (struct speed_params *);
+double speed_mpn_gcdext_single (struct speed_params *);
+double speed_mpn_get_str (struct speed_params *);
+double speed_mpn_hamdist (struct speed_params *);
+double speed_mpn_ior_n (struct speed_params *);
+double speed_mpn_iorn_n (struct speed_params *);
+double speed_mpn_jacobi_base (struct speed_params *);
+double speed_mpn_jacobi_base_1 (struct speed_params *);
+double speed_mpn_jacobi_base_2 (struct speed_params *);
+double speed_mpn_jacobi_base_3 (struct speed_params *);
+double speed_mpn_jacobi_base_4 (struct speed_params *);
+double speed_mpn_lshift (struct speed_params *);
+double speed_mpn_lshiftc (struct speed_params *);
+double speed_mpn_mod_1 (struct speed_params *);
+double speed_mpn_mod_1c (struct speed_params *);
+double speed_mpn_mod_1_div (struct speed_params *);
+double speed_mpn_mod_1_inv (struct speed_params *);
+double speed_mpn_mod_1_1 (struct speed_params *);
+double speed_mpn_mod_1_1_1 (struct speed_params *);
+double speed_mpn_mod_1_1_2 (struct speed_params *);
+double speed_mpn_mod_1_2 (struct speed_params *);
+double speed_mpn_mod_1_3 (struct speed_params *);
+double speed_mpn_mod_1_4 (struct speed_params *);
+double speed_mpn_mod_34lsub1 (struct speed_params *);
+double speed_mpn_modexact_1_odd (struct speed_params *);
+double speed_mpn_modexact_1c_odd (struct speed_params *);
+double speed_mpn_mul_1 (struct speed_params *);
+double speed_mpn_mul_1_inplace (struct speed_params *);
+double speed_mpn_mul_2 (struct speed_params *);
+double speed_mpn_mul_3 (struct speed_params *);
+double speed_mpn_mul_4 (struct speed_params *);
+double speed_mpn_mul_5 (struct speed_params *);
+double speed_mpn_mul_6 (struct speed_params *);
+double speed_mpn_mul (struct speed_params *);
+double speed_mpn_mul_basecase (struct speed_params *);
+double speed_mpn_mulmid (struct speed_params *);
+double speed_mpn_mulmid_basecase (struct speed_params *);
+double speed_mpn_mul_fft (struct speed_params *);
+double speed_mpn_mul_fft_sqr (struct speed_params *);
+double speed_mpn_fft_mul (struct speed_params *);
+double speed_mpn_fft_sqr (struct speed_params *);
+#if WANT_OLD_FFT_FULL
+double speed_mpn_mul_fft_full (struct speed_params *);
+double speed_mpn_mul_fft_full_sqr (struct speed_params *);
+#endif
+double speed_mpn_nussbaumer_mul (struct speed_params *);
+double speed_mpn_nussbaumer_mul_sqr (struct speed_params *);
+double speed_mpn_mul_n (struct speed_params *);
+double speed_mpn_mul_n_sqr (struct speed_params *);
+double speed_mpn_mulmid_n (struct speed_params *);
+double speed_mpn_sqrlo (struct speed_params *);
+double speed_mpn_sqrlo_basecase (struct speed_params *);
+double speed_mpn_mullo_n (struct speed_params *);
+double speed_mpn_mullo_basecase (struct speed_params *);
+double speed_mpn_nand_n (struct speed_params *);
+double speed_mpn_nior_n (struct speed_params *);
+double speed_mpn_popcount (struct speed_params *);
+double speed_mpn_preinv_divrem_1 (struct speed_params *);
+double speed_mpn_preinv_divrem_1f (struct speed_params *);
+double speed_mpn_preinv_mod_1 (struct speed_params *);
+double speed_mpn_sbpi1_div_qr (struct speed_params *);
+double speed_mpn_dcpi1_div_qr (struct speed_params *);
+double speed_mpn_sbpi1_divappr_q (struct speed_params *);
+double speed_mpn_dcpi1_divappr_q (struct speed_params *);
+double speed_mpn_mu_div_qr (struct speed_params *);
+double speed_mpn_mu_divappr_q (struct speed_params *);
+double speed_mpn_mupi_div_qr (struct speed_params *);
+double speed_mpn_mu_div_q (struct speed_params *);
+double speed_mpn_sbpi1_bdiv_qr (struct speed_params *);
+double speed_mpn_dcpi1_bdiv_qr (struct speed_params *);
+double speed_mpn_sbpi1_bdiv_q (struct speed_params *);
+double speed_mpn_dcpi1_bdiv_q (struct speed_params *);
+double speed_mpn_sbpi1_bdiv_r (struct speed_params *);
+double speed_mpn_mu_bdiv_q (struct speed_params *);
+double speed_mpn_mu_bdiv_qr (struct speed_params *);
+double speed_mpn_broot (struct speed_params *);
+double speed_mpn_broot_invm1 (struct speed_params *);
+double speed_mpn_brootinv (struct speed_params *);
+double speed_mpn_invert (struct speed_params *);
+double speed_mpn_invertappr (struct speed_params *);
+double speed_mpn_ni_invertappr (struct speed_params *);
+double speed_mpn_sec_invert (struct speed_params *s);
+double speed_mpn_binvert (struct speed_params *);
+double speed_mpn_redc_1 (struct speed_params *);
+double speed_mpn_redc_2 (struct speed_params *);
+double speed_mpn_redc_n (struct speed_params *);
+double speed_mpn_rsblsh_n (struct speed_params *);
+double speed_mpn_rsblsh1_n (struct speed_params *);
+double speed_mpn_rsblsh2_n (struct speed_params *);
+double speed_mpn_rsh1add_n (struct speed_params *);
+double speed_mpn_rsh1sub_n (struct speed_params *);
+double speed_mpn_rshift (struct speed_params *);
+double speed_mpn_sb_divrem_m3 (struct speed_params *);
+double speed_mpn_sb_divrem_m3_div (struct speed_params *);
+double speed_mpn_sb_divrem_m3_inv (struct speed_params *);
+double speed_mpn_set_str (struct speed_params *);
+double speed_mpn_bc_set_str (struct speed_params *);
+double speed_mpn_dc_set_str (struct speed_params *);
+double speed_mpn_set_str_pre (struct speed_params *);
+double speed_mpn_sqr_basecase (struct speed_params *);
+double speed_mpn_sqr_diag_addlsh1 (struct speed_params *);
+double speed_mpn_sqr_diagonal (struct speed_params *);
+double speed_mpn_sqr (struct speed_params *);
+double speed_mpn_sqrtrem (struct speed_params *);
+double speed_mpn_rootrem (struct speed_params *);
+double speed_mpn_sqrt (struct speed_params *);
+double speed_mpn_root (struct speed_params *);
+double speed_mpn_perfect_power_p (struct speed_params *);
+double speed_mpn_perfect_square_p (struct speed_params *);
+double speed_mpn_sub_n (struct speed_params *);
+double speed_mpn_sub_1 (struct speed_params *);
+double speed_mpn_sub_1_inplace (struct speed_params *);
+double speed_mpn_sub_err1_n (struct speed_params *);
+double speed_mpn_sub_err2_n (struct speed_params *);
+double speed_mpn_sub_err3_n (struct speed_params *);
+double speed_mpn_sublsh_n (struct speed_params *);
+double speed_mpn_sublsh1_n (struct speed_params *);
+double speed_mpn_sublsh2_n (struct speed_params *);
+double speed_mpn_sublsh_n_ip1 (struct speed_params *);
+double speed_mpn_sublsh1_n_ip1 (struct speed_params *);
+double speed_mpn_sublsh2_n_ip1 (struct speed_params *);
+double speed_mpn_submul_1 (struct speed_params *);
+double speed_mpn_toom2_sqr (struct speed_params *);
+double speed_mpn_toom3_sqr (struct speed_params *);
+double speed_mpn_toom4_sqr (struct speed_params *);
+double speed_mpn_toom6_sqr (struct speed_params *);
+double speed_mpn_toom8_sqr (struct speed_params *);
+double speed_mpn_toom22_mul (struct speed_params *);
+double speed_mpn_toom33_mul (struct speed_params *);
+double speed_mpn_toom44_mul (struct speed_params *);
+double speed_mpn_toom6h_mul (struct speed_params *);
+double speed_mpn_toom8h_mul (struct speed_params *);
+double speed_mpn_toom32_mul (struct speed_params *);
+double speed_mpn_toom42_mul (struct speed_params *);
+double speed_mpn_toom43_mul (struct speed_params *);
+double speed_mpn_toom63_mul (struct speed_params *);
+double speed_mpn_toom32_for_toom43_mul (struct speed_params *);
+double speed_mpn_toom43_for_toom32_mul (struct speed_params *);
+double speed_mpn_toom32_for_toom53_mul (struct speed_params *);
+double speed_mpn_toom53_for_toom32_mul (struct speed_params *);
+double speed_mpn_toom42_for_toom53_mul (struct speed_params *);
+double speed_mpn_toom53_for_toom42_mul (struct speed_params *);
+double speed_mpn_toom43_for_toom54_mul (struct speed_params *);
+double speed_mpn_toom54_for_toom43_mul (struct speed_params *);
+double speed_mpn_toom42_mulmid (struct speed_params *);
+double speed_mpn_mulmod_bnm1 (struct speed_params *);
+double speed_mpn_bc_mulmod_bnm1 (struct speed_params *);
+double speed_mpn_mulmod_bnm1_rounded (struct speed_params *);
+double speed_mpn_sqrmod_bnm1 (struct speed_params *);
+double speed_mpn_udiv_qrnnd (struct speed_params *);
+double speed_mpn_udiv_qrnnd_r (struct speed_params *);
+double speed_mpn_umul_ppmm (struct speed_params *);
+double speed_mpn_umul_ppmm_r (struct speed_params *);
+double speed_mpn_xnor_n (struct speed_params *);
+double speed_mpn_xor_n (struct speed_params *);
+double speed_MPN_ZERO (struct speed_params *);
+
+double speed_mpq_init_clear (struct speed_params *);
+
+double speed_mpz_add (struct speed_params *);
+double speed_mpz_invert (struct speed_params *);
+double speed_mpz_bin_uiui (struct speed_params *);
+double speed_mpz_bin_ui (struct speed_params *);
+double speed_mpz_fac_ui (struct speed_params *);
+double speed_mpz_2fac_ui (struct speed_params *);
+double speed_mpz_mfac_uiui (struct speed_params *);
+double speed_mpz_primorial_ui (struct speed_params *);
+double speed_mpz_fib_ui (struct speed_params *);
+double speed_mpz_fib2_ui (struct speed_params *);
+double speed_mpz_init_clear (struct speed_params *);
+double speed_mpz_init_realloc_clear (struct speed_params *);
+double speed_mpz_nextprime (struct speed_params *);
+double speed_mpz_jacobi (struct speed_params *);
+double speed_mpz_lucnum_ui (struct speed_params *);
+double speed_mpz_lucnum2_ui (struct speed_params *);
+double speed_mpz_mod (struct speed_params *);
+double speed_mpz_powm (struct speed_params *);
+double speed_mpz_powm_mod (struct speed_params *);
+double speed_mpz_powm_redc (struct speed_params *);
+double speed_mpz_powm_sec (struct speed_params *);
+double speed_mpz_powm_ui (struct speed_params *);
+double speed_mpz_urandomb (struct speed_params *);
+
+double speed_gmp_randseed (struct speed_params *);
+double speed_gmp_randseed_ui (struct speed_params *);
+
+double speed_noop (struct speed_params *);
+double speed_noop_wxs (struct speed_params *);
+double speed_noop_wxys (struct speed_params *);
+
+double speed_operator_div (struct speed_params *);
+double speed_operator_mod (struct speed_params *);
+
+double speed_udiv_qrnnd (struct speed_params *);
+double speed_udiv_qrnnd_preinv1 (struct speed_params *);
+double speed_udiv_qrnnd_preinv2 (struct speed_params *);
+double speed_udiv_qrnnd_preinv3 (struct speed_params *);
+double speed_udiv_qrnnd_c (struct speed_params *);
+double speed_umul_ppmm (struct speed_params *);
+
+/* Prototypes for other routines */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* low 32-bits in p[0], high 32-bits in p[1] */
+void speed_cyclecounter (unsigned p[2]);
+
+#if defined (__cplusplus)
+}
+#endif
+
+void mftb_function (unsigned p[2]);
+
+double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]);
+int gettimeofday_microseconds_p (void);
+int getrusage_microseconds_p (void);
+int cycles_works_p (void);
+long clk_tck (void);
+double freq_measure (const char *, double (*)(void));
+
+int double_cmp_ptr (const double *, const double *);
+void pentium_wbinvd (void);
+typedef int (*qsort_function_t) (const void *, const void *);
+
+void noop (void);
+void noop_1 (mp_limb_t);
+void noop_wxs (mp_ptr, mp_srcptr, mp_size_t);
+void noop_wxys (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
+void mpn_cache_fill (mp_srcptr, mp_size_t);
+void mpn_cache_fill_dummy (mp_limb_t);
+void speed_cache_fill (struct speed_params *);
+void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t);
+void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t);
+
+extern int  speed_option_addrs;
+extern int  speed_option_verbose;
+extern int  speed_option_cycles_broken;
+void speed_option_set (const char *);
+
+mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+
+mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
+mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
+mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
+mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
+
+int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int);
+int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int);
+int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int);
+int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int);
+
+int mpn_hgcd2_1 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
+int mpn_hgcd2_2 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
+int mpn_hgcd2_3 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
+int mpn_hgcd2_4 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
+int mpn_hgcd2_5 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
+
+mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t);
+mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t);
+
+mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
+mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
+
+void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t);
+void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t);
+
+mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
+mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
+mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
+mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
+mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
+mp_size_t mpn_hgcd_lehmer_itch (mp_size_t);
+
+mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
+mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t);
+
+mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
+mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t);
+
+mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
+mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t);
+
+mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
+mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
+
+mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int);
+void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr);
+
+void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
+void mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
+
+int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int);
+
+
+/* "get" is called repeatedly until it ticks over, just in case on a fast
+   processor it takes less than a microsecond, though this is probably
+   unlikely if it's a system call.
+
+   speed_cyclecounter is called on the same side of the "get" for the start
+   and end measurements.  It doesn't matter how long it takes from the "get"
+   sample to the cycles sample, since that period will cancel out in the
+   difference calculation (assuming it's the same each time).
+
+   Letting the test run for more than a process time slice is probably only
+   going to reduce accuracy, especially for getrusage when the cycle counter
+   is real time, or for gettimeofday if the cycle counter is in fact process
+   time.  Use CLK_TCK/2 as a reasonable stop.
+
+   It'd be desirable to be quite accurate here.  The default speed_precision
+   for a cycle counter is 10000 cycles, so to mix that with getrusage or
+   gettimeofday the frequency should be at least that accurate.  But running
+   measurements for 10000 microseconds (or more) is too long.  Be satisfied
+   with just a half clock tick (5000 microseconds usually).  */
+
+#define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec)		\
+  do {									\
+    type      st1, st, et1, et;						\
+    unsigned  sc[2], ec[2];						\
+    long      dt, half_tick;						\
+    double    dc, cyc;							\
+									\
+    half_tick = (1000000L / clk_tck()) / 2;				\
+									\
+    get (st1);								\
+    do {								\
+      get (st);								\
+    } while (usec(st) == usec(st1) && sec(st) == sec(st1));		\
+									\
+    getc (sc);								\
+									\
+    for (;;)								\
+      {									\
+	get (et1);							\
+	do {								\
+	  get (et);							\
+	} while (usec(et) == usec(et1) && sec(et) == sec(et1));		\
+									\
+	getc (ec);							\
+									\
+	dc = speed_cyclecounter_diff (ec, sc);				\
+									\
+	/* allow secs to cancel before multiplying */			\
+	dt = sec(et) - sec(st);						\
+	dt = dt * 1000000L + (usec(et) - usec(st));			\
+									\
+	if (dt >= half_tick)						\
+	  break;							\
+      }									\
+									\
+    cyc = dt * 1e-6 / dc;						\
+									\
+    if (speed_option_verbose >= 2)					\
+      printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n",	\
+	      name, dc, dt, cyc);					\
+									\
+    return dt * 1e-6 / dc;						\
+									\
+  } while (0)
+
+
+
+
+/* The measuring routines use these big macros to save duplication for
+   similar forms.  They also get used for some automatically generated
+   measuring of new implementations of functions.
+
+   Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
+   function pointer is considered undesirable since it's not the way a
+   normal application will be calling, and some processors might do
+   different things with an indirect call, like not branch predicting, or
+   doing a full pipe flush.  At least some of the "functions" measured are
+   actually macros too.
+
+   The net effect is to bloat the object code, possibly in a big way, but
+   only what's being measured is being run, so that doesn't matter.
+
+   The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
+   ATTRIBUTE_CONST on the called functions.  Adding a cast to a non-pure
+   function pointer doesn't work in gcc 3.2.  Using an actual non-pure
+   function pointer variable works, but stands a real risk of a
+   non-optimizing compiler generating unnecessary overheads in the call.
+   Currently the best idea is not to use those attributes for a timing
+   program build.  __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and
+   gmp-impl.h to omit them from routines there.  */
+
+#define SPEED_RESTRICT_COND(cond)   if (!(cond)) return -1.0;
+
+/* For mpn_copy or similar. */
+#define SPEED_ROUTINE_MPN_COPY_CALL(call)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_COPY(function)				\
+  SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size))
+
+#define SPEED_ROUTINE_MPN_TABSELECT(function)				\
+  {									\
+    mp_ptr    xp, wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    if (s->r == 0)							\
+      s->r = s->size;	/* default to a quadratic shape */		\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (xp, s->size * s->r, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, xp, s->size * s->r);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, xp, s->size, s->r, (s->r) / 2);			\
+    while (--i != 0);							\
+    t = speed_endtime () / s->r;					\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+#define SPEED_ROUTINE_MPN_COPYC(function)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, s->xp, s->size, 0);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+/* s->size is still in limbs, and it's limbs which are copied, but
+   "function" takes a size in bytes not limbs.  */
+#define SPEED_ROUTINE_MPN_COPY_BYTES(function)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, s->xp, s->size * GMP_LIMB_BYTES);		\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+/* For mpn_add_n, mpn_sub_n, or similar. */
+#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call)				\
+  {									\
+    mp_ptr     wp;							\
+    mp_ptr     xp, yp;							\
+    unsigned   i;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    xp = s->xp;								\
+    yp = s->yp;								\
+									\
+    if (s->r == 0)	;						\
+    else if (s->r == 1) { xp = wp;	    }				\
+    else if (s->r == 2) {	   yp = wp; }				\
+    else if (s->r == 3) { xp = wp; yp = wp; }				\
+    else if (s->r == 4) {     yp = xp;	    }				\
+    else		{						\
+      TMP_FREE;								\
+      return -1.0;							\
+    }									\
+									\
+    /* initialize wp if operand overlap */				\
+    if (xp == wp || yp == wp)						\
+      MPN_COPY (wp, s->xp, s->size);					\
+									\
+    speed_operand_src (s, xp, s->size);					\
+    speed_operand_src (s, yp, s->size);					\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+/* For mpn_aors_errK_n, where 1 <= K <= 3. */
+#define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K)			\
+  {									\
+    mp_ptr     wp;							\
+    mp_ptr     xp, yp;							\
+    mp_ptr     zp[K];							\
+    mp_limb_t  ep[2*K];							\
+    unsigned   i;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    /* (don't have a mechanism to specify zp alignments) */		\
+    for (i = 0; i < K; i++)						\
+      SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0);			\
+									\
+    xp = s->xp;								\
+    yp = s->yp;								\
+									\
+    if (s->r == 0)	;						\
+    else if (s->r == 1) { xp = wp;	    }				\
+    else if (s->r == 2) {	   yp = wp; }				\
+    else if (s->r == 3) { xp = wp; yp = wp; }				\
+    else if (s->r == 4) {     yp = xp;	    }				\
+    else		{						\
+      TMP_FREE;								\
+      return -1.0;							\
+    }									\
+									\
+    /* initialize wp if operand overlap */				\
+    if (xp == wp || yp == wp)						\
+      MPN_COPY (wp, s->xp, s->size);					\
+									\
+    speed_operand_src (s, xp, s->size);					\
+    speed_operand_src (s, yp, s->size);					\
+    for (i = 0; i < K; i++)						\
+      speed_operand_src (s, zp[i], s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function)			\
+  SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1)
+
+#define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function)			\
+  SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2)
+
+#define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function)			\
+  SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3)
+
+
+/* For mpn_add_n, mpn_sub_n, or similar. */
+#define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call)				\
+  {									\
+    mp_ptr     ap, sp;							\
+    mp_ptr     xp, yp;							\
+    unsigned   i;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp);			\
+									\
+    xp = s->xp;								\
+    yp = s->yp;								\
+									\
+    if ((s->r & 1) != 0) { xp = ap; }					\
+    if ((s->r & 2) != 0) { yp = ap; }					\
+    if ((s->r & 4) != 0) { xp = sp; }					\
+    if ((s->r & 8) != 0) { yp = sp; }					\
+    if ((s->r & 3) == 3  ||  (s->r & 12) == 12)				\
+      {									\
+	TMP_FREE;							\
+	return -1.0;							\
+      }									\
+									\
+    /* initialize ap if operand overlap */				\
+    if (xp == ap || yp == ap)						\
+      MPN_COPY (ap, s->xp, s->size);					\
+    /* initialize sp if operand overlap */				\
+    if (xp == sp || yp == sp)						\
+      MPN_COPY (sp, s->xp, s->size);					\
+									\
+    speed_operand_src (s, xp, s->size);					\
+    speed_operand_src (s, yp, s->size);					\
+    speed_operand_dst (s, ap, s->size);					\
+    speed_operand_dst (s, sp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_BINARY_N(function)				\
+   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size))
+
+#define SPEED_ROUTINE_MPN_BINARY_NC(function)				\
+   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0))
+
+
+/* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. */
+#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_UNARY_1(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
+
+#define SPEED_ROUTINE_MPN_UNARY_1C(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
+
+/* FIXME: wp is uninitialized here, should start it off from xp */
+#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function)			\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, wp, s->size, s->r))
+
+#define SPEED_ROUTINE_MPN_DIVEXACT_1(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
+
+#define SPEED_ROUTINE_MPN_BDIV_Q_1(function)				\
+    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
+
+#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call)			\
+  {									\
+    unsigned   shift;							\
+    mp_limb_t  dinv;							\
+									\
+    SPEED_RESTRICT_COND (s->size > 0);					\
+    SPEED_RESTRICT_COND (s->r != 0);					\
+									\
+    count_trailing_zeros (shift, s->r);					\
+    binvert_limb (dinv, s->r >> shift);					\
+									\
+    SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
+  }
+#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function)			\
+  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL					\
+  ((*function) (wp, s->xp, s->size, s->r, dinv, shift))
+
+#define SPEED_ROUTINE_MPN_BDIV_DBM1C(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
+
+#define SPEED_ROUTINE_MPN_DIVREM_1(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
+
+#define SPEED_ROUTINE_MPN_DIVREM_1C(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
+
+#define SPEED_ROUTINE_MPN_DIVREM_1F(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
+
+#define SPEED_ROUTINE_MPN_DIVREM_1CF(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
+
+
+#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call)			\
+  {									\
+    unsigned   shift;							\
+    mp_limb_t  dinv;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+    SPEED_RESTRICT_COND (s->r != 0);					\
+									\
+    count_leading_zeros (shift, s->r);					\
+    invert_limb (dinv, s->r << shift);					\
+									\
+    SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
+  }									\
+
+#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function)			\
+  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
+  ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))
+
+/* s->size limbs worth of fraction part */
+#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function)			\
+  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
+  ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
+
+
+/* s->r is duplicated to form the multiplier, defaulting to
+   MP_BASES_BIG_BASE_10.  Not sure if that's particularly useful, but at
+   least it provides some control.  */
+#define SPEED_ROUTINE_MPN_UNARY_N(function,N)				\
+  {									\
+    mp_ptr     wp;							\
+    mp_size_t  wn;							\
+    unsigned   i;							\
+    double     t;							\
+    mp_limb_t  yp[N];							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= N);					\
+									\
+    TMP_MARK;								\
+    wn = s->size + N-1;							\
+    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);			\
+    for (i = 0; i < N; i++)						\
+      yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10);		\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, yp, (mp_size_t) N);				\
+    speed_operand_dst (s, wp, wn);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, s->xp, s->size, yp);				\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_UNARY_2(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 2)
+#define SPEED_ROUTINE_MPN_UNARY_3(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 3)
+#define SPEED_ROUTINE_MPN_UNARY_4(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 4)
+#define SPEED_ROUTINE_MPN_UNARY_5(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 5)
+#define SPEED_ROUTINE_MPN_UNARY_6(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 6)
+#define SPEED_ROUTINE_MPN_UNARY_7(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 7)
+#define SPEED_ROUTINE_MPN_UNARY_8(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 8)
+
+
+/* For mpn_mul, mpn_mul_basecase, xsize=r, ysize=s->size. */
+#define SPEED_ROUTINE_MPN_MUL(function)					\
+  {									\
+    mp_ptr    wp;							\
+    mp_size_t size1;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    size1 = (s->r == 0 ? s->size : s->r);				\
+    if (size1 < 0) size1 = -size1 - s->size;				\
+									\
+    SPEED_RESTRICT_COND (size1 >= 1);					\
+    SPEED_RESTRICT_COND (s->size >= size1);				\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp);		\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, size1);				\
+    speed_operand_dst (s, wp, size1 + s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, s->xp, s->size, s->yp, size1);			\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+#define SPEED_ROUTINE_MPN_MUL_N_CALL(call)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, 2*s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_MUL_N(function)				\
+  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size));
+
+#define SPEED_ROUTINE_MPN_MULLO_N_CALL(call)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_MULLO_N(function)				\
+  SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size));
+
+#define SPEED_ROUTINE_MPN_MULLO_BASECASE(function)			\
+  SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size));
+
+#define SPEED_ROUTINE_MPN_SQRLO(function)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, s->xp, s->size);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+/* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size. */
+#define SPEED_ROUTINE_MPN_MULMID(function)				\
+  {									\
+    mp_ptr    wp, xp;							\
+    mp_size_t size1;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    size1 = (s->r == 0 ? (2 * s->size - 1) : s->r);			\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+    SPEED_RESTRICT_COND (size1 >= s->size);				\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
+    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
+									\
+    speed_operand_src (s, xp, size1);					\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, size1 - s->size + 3);			\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, xp, size1, s->yp, s->size);				\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_MULMID_N(function)				\
+  {									\
+    mp_ptr    wp, xp;							\
+    mp_size_t size1;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    size1 = 2 * s->size - 1;						\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
+    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
+									\
+    speed_operand_src (s, xp, size1);					\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, size1 - s->size + 3);			\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, xp, s->yp, s->size);				\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_TOOM42_MULMID(function)			\
+  {									\
+    mp_ptr    wp, xp, scratch;						\
+    mp_size_t size1, scratch_size;					\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    size1 = 2 * s->size - 1;						\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
+    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
+    scratch_size = mpn_toom42_mulmid_itch (s->size);			\
+    SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0);			\
+									\
+    speed_operand_src (s, xp, size1);					\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, size1 - s->size + 3);			\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, xp, s->yp, s->size, scratch);			\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call)			\
+  {									\
+    mp_ptr    wp, tp;							\
+    unsigned  i;							\
+    double    t;							\
+    mp_size_t itch;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size);		\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, 2 * s->size);				\
+    speed_operand_dst (s, tp, itch);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function)			\
+  {									\
+    mp_ptr    wp, tp;							\
+    unsigned  i;							\
+    double    t;							\
+    mp_size_t size, itch;						\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    size = mpn_mulmod_bnm1_next_size (s->size);				\
+    itch = mpn_mulmod_bnm1_itch (size, size, size);			\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, size);					\
+    speed_operand_dst (s, tp, itch);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, size, s->xp, s->size, s->yp, s->size, tp);		\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize)		\
+  {									\
+    mp_ptr    wp, tspace;						\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= minsize);				\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, 2*s->size);				\
+    speed_operand_dst (s, tspace, tsize);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function)			\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
+     mpn_toom22_mul_itch (s->size, s->size),				\
+     MPN_TOOM22_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function)			\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
+     mpn_toom33_mul_itch (s->size, s->size),				\
+     MPN_TOOM33_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function)			\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
+     mpn_toom44_mul_itch (s->size, s->size),				\
+     MPN_TOOM44_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function)			\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
+     mpn_toom6h_mul_itch (s->size, s->size),				\
+     MPN_TOOM6H_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function)			\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
+     mpn_toom8h_mul_itch (s->size, s->size),				\
+     MPN_TOOM8H_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM32_MUL(function)				\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace),		\
+     mpn_toom32_mul_itch (s->size, 2*s->size/3),			\
+     MPN_TOOM32_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM42_MUL(function)				\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
+     mpn_toom42_mul_itch (s->size, s->size/2),				\
+     MPN_TOOM42_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM43_MUL(function)				\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace),		\
+     mpn_toom43_mul_itch (s->size, s->size*3/4),			\
+     MPN_TOOM43_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM63_MUL(function)				\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
+     mpn_toom63_mul_itch (s->size, s->size/2),				\
+     MPN_TOOM63_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
+     mpn_toom32_mul_itch (s->size, 17*s->size/24),			\
+     MPN_TOOM32_MUL_MINSIZE)
+#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
+     mpn_toom43_mul_itch (s->size, 17*s->size/24),			\
+     MPN_TOOM43_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
+     mpn_toom32_mul_itch (s->size, 19*s->size/30),			\
+     MPN_TOOM32_MUL_MINSIZE)
+#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
+     mpn_toom53_mul_itch (s->size, 19*s->size/30),			\
+     MPN_TOOM53_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
+     mpn_toom42_mul_itch (s->size, 11*s->size/20),			\
+     MPN_TOOM42_MUL_MINSIZE)
+#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
+     mpn_toom53_mul_itch (s->size, 11*s->size/20),			\
+     MPN_TOOM53_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
+     mpn_toom42_mul_itch (s->size, 5*s->size/6),			\
+     MPN_TOOM54_MUL_MINSIZE)
+#define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
+     mpn_toom54_mul_itch (s->size, 5*s->size/6),			\
+     MPN_TOOM54_MUL_MINSIZE)
+
+
+
+#define SPEED_ROUTINE_MPN_SQR_CALL(call)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, 2*s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_SQR(function)					\
+  SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
+
+#define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call)			\
+  {									\
+    mp_ptr    wp, tp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, tp, 2 * s->size);				\
+    speed_operand_dst (s, wp, 2 * s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime () / 2;						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize)		\
+  {									\
+    mp_ptr    wp, tspace;						\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= minsize);				\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, 2*s->size);				\
+    speed_operand_dst (s, tspace, tsize);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_TOOM2_SQR(function)				\
+  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
+				mpn_toom2_sqr_itch (s->size),		\
+				MPN_TOOM2_SQR_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM3_SQR(function)				\
+  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
+				mpn_toom3_sqr_itch (s->size),		\
+				MPN_TOOM3_SQR_MINSIZE)
+
+
+#define SPEED_ROUTINE_MPN_TOOM4_SQR(function)				\
+  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
+				mpn_toom4_sqr_itch (s->size),		\
+				MPN_TOOM4_SQR_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM6_SQR(function)				\
+  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
+				mpn_toom6_sqr_itch (s->size),		\
+				MPN_TOOM6_SQR_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM8_SQR(function)				\
+  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
+				mpn_toom8_sqr_itch (s->size),		\
+				MPN_TOOM8_SQR_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_MOD_CALL(call)				\
+  {									\
+    unsigned   i;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+									\
+    return speed_endtime ();						\
+  }
+
+#define SPEED_ROUTINE_MPN_MOD_1(function)				\
+   SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))
+
+#define SPEED_ROUTINE_MPN_MOD_1C(function)				\
+   SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))
+
+#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function)			\
+  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r));
+
+#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function)			\
+  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)));
+
+#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function)				\
+   SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))
+
+#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function)			\
+  {									\
+    unsigned   i;							\
+    mp_limb_t  inv;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+    SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT);			\
+									\
+    invert_limb (inv, s->r);						\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      (*function) (s->xp, s->size, s->r, inv);				\
+    while (--i != 0);							\
+									\
+    return speed_endtime ();						\
+  }
+
+#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc)			\
+  {									\
+    unsigned   i;							\
+    mp_limb_t  inv[4];							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    mpn_mod_1_1p_cps (inv, s->r);					\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      pfunc (inv, s->r);						\
+      function (s->xp, s->size, s->r << inv[1], inv);				\
+    } while (--i != 0);							\
+									\
+    return speed_endtime ();						\
+  }
+#define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N)			\
+  {									\
+    unsigned   i;							\
+    mp_limb_t  inv[N+3];						\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+    SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      pfunc (inv, s->r);						\
+      function (s->xp, s->size, s->r, inv);				\
+    } while (--i != 0);							\
+									\
+    return speed_endtime ();						\
+  }
+
+
+/* A division of 2*s->size by s->size limbs */
+
+#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call)				\
+  {									\
+    unsigned  i;							\
+    mp_ptr    a, d, q, r;						\
+    double    t;							\
+    gmp_pi1_t dinv;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (r, s->size,   s->align_wp2);			\
+									\
+    MPN_COPY (a, s->xp, s->size);					\
+    MPN_COPY (a+s->size, s->xp, s->size);				\
+									\
+    MPN_COPY (d, s->yp, s->size);					\
+									\
+    /* normalize the data */						\
+    d[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    a[2*s->size-1] = d[s->size-1] - 1;					\
+									\
+    invert_pi1 (dinv, d[s->size-1], d[s->size-2]);			\
+									\
+    speed_operand_src (s, a, 2*s->size);				\
+    speed_operand_src (s, d, s->size);					\
+    speed_operand_dst (s, q, s->size+1);				\
+    speed_operand_dst (s, r, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+/* A remainder 2*s->size by s->size limbs */
+
+#define SPEED_ROUTINE_MPZ_MOD(function)					\
+  {									\
+    unsigned   i;							\
+    mpz_t      a, d, r;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    mpz_init_set_n (d, s->yp, s->size);					\
+									\
+    /* high part less than d, low part a duplicate copied in */		\
+    mpz_init_set_n (a, s->xp, s->size);					\
+    mpz_mod (a, a, d);							\
+    mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size);			\
+    MPN_COPY (PTR(a), s->xp, s->size);					\
+									\
+    mpz_init (r);							\
+									\
+    speed_operand_src (s, PTR(a), SIZ(a));				\
+    speed_operand_src (s, PTR(d), SIZ(d));				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (r, a, d);						\
+    while (--i != 0);							\
+    return speed_endtime ();						\
+  }
+
+#define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN)		\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, ap, qp;						\
+    gmp_pi1_t  inv;							\
+    double     t;							\
+    mp_size_t size1;							\
+    TMP_DECL;								\
+									\
+    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
+									\
+    SPEED_RESTRICT_COND (s->size >= DMIN);				\
+    SPEED_RESTRICT_COND (size1 - s->size >= QMIN);			\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2);			\
+									\
+    /* we don't fill in dividend completely when size1 > s->size */	\
+    MPN_COPY (ap,         s->xp, s->size);				\
+    MPN_COPY (ap + size1 - s->size, s->xp, s->size);			\
+									\
+    MPN_COPY (dp,         s->yp, s->size);				\
+									\
+    /* normalize the data */						\
+    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    ap[size1 - 1] = dp[s->size - 1] - 1;				\
+									\
+    invert_pi1 (inv, dp[s->size-1], dp[s->size-2]);			\
+									\
+    speed_operand_src (s, ap, size1);					\
+    speed_operand_dst (s, tp, size1);					\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, qp, size1 - s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, ap, size1);						\
+      function (qp, tp, size1, dp, s->size, INV);			\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn)			\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, qp, scratch;					\
+    double     t;							\
+    mp_size_t itch;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    itch = itchfn (2 * s->size, s->size, 0);				\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
+									\
+    MPN_COPY (tp,         s->xp, s->size);				\
+    MPN_COPY (tp+s->size, s->xp, s->size);				\
+									\
+    /* normalize the data */						\
+    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    tp[2*s->size-1] = dp[s->size-1] - 1;				\
+									\
+    speed_operand_dst (s, qp, s->size);					\
+    speed_operand_src (s, tp, 2 * s->size);				\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, scratch, itch);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      function (qp, tp, 2 * s->size, dp, s->size, scratch);		\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn)			\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, qp, rp, scratch;					\
+    double     t;							\
+    mp_size_t size1, itch;						\
+    TMP_DECL;								\
+									\
+    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+    SPEED_RESTRICT_COND (size1 >= s->size);				\
+									\
+    itch = itchfn (size1, s->size, 0);					\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
+    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
+									\
+    /* we don't fill in dividend completely when size1 > s->size */	\
+    MPN_COPY (tp,         s->xp, s->size);				\
+    MPN_COPY (tp + size1 - s->size, s->xp, s->size);			\
+									\
+    MPN_COPY (dp,         s->yp, s->size);				\
+									\
+    /* normalize the data */						\
+    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    tp[size1 - 1] = dp[s->size - 1] - 1;				\
+									\
+    speed_operand_dst (s, qp, size1 - s->size);				\
+    speed_operand_dst (s, rp, s->size);					\
+    speed_operand_src (s, tp, size1);					\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, scratch, itch);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      function (qp, rp, tp, size1, dp, s->size, scratch);		\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn)			\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, qp, rp, ip, scratch, tmp;			\
+    double     t;							\
+    mp_size_t  size1, itch;						\
+    TMP_DECL;								\
+									\
+    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+    SPEED_RESTRICT_COND (size1 >= s->size);				\
+									\
+    itch = itchfn (size1, s->size, s->size);				\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
+    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
+    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? */	\
+									\
+    /* we don't fill in dividend completely when size1 > s->size */	\
+    MPN_COPY (tp,         s->xp, s->size);				\
+    MPN_COPY (tp + size1 - s->size, s->xp, s->size);			\
+									\
+    MPN_COPY (dp,         s->yp, s->size);				\
+									\
+    /* normalize the data */						\
+    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    tp[size1 - 1] = dp[s->size-1] - 1;					\
+									\
+    tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size));			\
+    mpn_invert (ip, dp, s->size, tmp);					\
+									\
+    speed_operand_dst (s, qp, size1 - s->size);				\
+    speed_operand_dst (s, rp, s->size);					\
+    speed_operand_src (s, tp, size1);					\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_src (s, ip, s->size);					\
+    speed_operand_dst (s, scratch, itch);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch);	\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function)				\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, ap, qp;						\
+    mp_limb_t  inv;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2);		\
+									\
+    MPN_COPY (ap,         s->xp, s->size);				\
+    MPN_COPY (ap+s->size, s->xp, s->size);				\
+									\
+    /* divisor must be odd */						\
+    MPN_COPY (dp, s->yp, s->size);					\
+    dp[0] |= 1;								\
+    binvert_limb (inv, dp[0]);						\
+    inv = -inv;								\
+									\
+    speed_operand_src (s, ap, 2*s->size);				\
+    speed_operand_dst (s, tp, 2*s->size);				\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, qp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, ap, 2*s->size);					\
+      function (qp, tp, 2*s->size, dp, s->size, inv);			\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function)				\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, qp;						\
+    mp_limb_t  inv;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2);			\
+									\
+    /* divisor must be odd */						\
+    MPN_COPY (dp, s->yp, s->size);					\
+    dp[0] |= 1;								\
+    binvert_limb (inv, dp[0]);						\
+    inv = -inv;								\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, tp, s->size);					\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, qp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, s->xp, s->size);					\
+      function (qp, tp, s->size, dp, s->size, inv);			\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_PI1_BDIV_R(function)				\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, ap;						\
+    mp_limb_t  inv;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2);		\
+									\
+    MPN_COPY (ap,         s->xp, s->size);				\
+    MPN_COPY (ap+s->size, s->xp, s->size);				\
+									\
+    /* divisor must be odd */						\
+    MPN_COPY (dp, s->yp, s->size);					\
+    dp[0] |= 1;								\
+    binvert_limb (inv, dp[0]);						\
+    inv = -inv;								\
+									\
+    speed_operand_src (s, ap, 2*s->size);				\
+    speed_operand_dst (s, tp, 2*s->size);				\
+    speed_operand_src (s, dp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, ap, 2*s->size);					\
+      function (tp, 2*s->size, dp, s->size, inv);			\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn)			\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, qp, scratch;						\
+    double     t;							\
+    mp_size_t itch;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    itch = itchfn (s->size, s->size);					\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
+									\
+    /* divisor must be odd */						\
+    MPN_COPY (dp, s->yp, s->size);					\
+    dp[0] |= 1;								\
+									\
+    speed_operand_dst (s, qp, s->size);					\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, scratch, itch);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      function (qp, s->xp, s->size, dp, s->size, scratch);		\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn)			\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, qp, rp, scratch;					\
+    double     t;							\
+    mp_size_t itch;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    itch = itchfn (2 * s->size, s->size);				\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
+    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
+									\
+    MPN_COPY (tp,         s->xp, s->size);				\
+    MPN_COPY (tp+s->size, s->xp, s->size);				\
+									\
+    /* divisor must be odd */						\
+    MPN_COPY (dp, s->yp, s->size);					\
+    dp[0] |= 1;								\
+									\
+    speed_operand_dst (s, qp, s->size);					\
+    speed_operand_dst (s, rp, s->size);					\
+    speed_operand_src (s, tp, 2 * s->size);				\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, scratch, itch);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      function (qp, rp, tp, 2 * s->size, dp, s->size, scratch);		\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_BROOT(function)	\
+  {						\
+    SPEED_RESTRICT_COND (s->r & 1);		\
+    s->xp[0] |= 1;				\
+    SPEED_ROUTINE_MPN_UNARY_1_CALL		\
+      ((*function) (wp, s->xp, s->size, s->r));	\
+  }
+
+#define SPEED_ROUTINE_MPN_BROOTINV(function, itch)	\
+  {							\
+    mp_ptr    wp, tp;					\
+    unsigned  i;					\
+    double    t;					\
+    TMP_DECL;						\
+    TMP_MARK;						\
+    SPEED_RESTRICT_COND (s->size >= 1);			\
+    SPEED_RESTRICT_COND (s->r & 1);			\
+    wp = TMP_ALLOC_LIMBS (s->size);			\
+    tp = TMP_ALLOC_LIMBS ( (itch));			\
+    s->xp[0] |= 1;					\
+							\
+    speed_operand_src (s, s->xp, s->size);		\
+    speed_operand_dst (s, wp, s->size);			\
+    speed_cache_fill (s);				\
+							\
+    speed_starttime ();					\
+    i = s->reps;					\
+    do							\
+      (*function) (wp, s->xp, s->size, s->r, tp);	\
+    while (--i != 0);					\
+    t = speed_endtime ();				\
+							\
+    TMP_FREE;						\
+    return t;						\
+  }
+
+#define SPEED_ROUTINE_MPN_INVERT(function,itchfn)			\
+  {									\
+    long  i;								\
+    mp_ptr    up, tp, ip;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
+									\
+    MPN_COPY (up, s->xp, s->size);					\
+									\
+    /* normalize the data */						\
+    up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+									\
+    speed_operand_src (s, up, s->size);					\
+    speed_operand_dst (s, tp, s->size);					\
+    speed_operand_dst (s, ip, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (ip, up, s->size, tp);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn)			\
+  {									\
+    long  i;								\
+    mp_ptr    up, tp, ip;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
+									\
+    MPN_COPY (up, s->xp, s->size);					\
+									\
+    /* normalize the data */						\
+    up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+									\
+    speed_operand_src (s, up, s->size);					\
+    speed_operand_dst (s, tp, s->size);					\
+    speed_operand_dst (s, ip, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (ip, up, s->size, tp);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn)		\
+  {									\
+    long  i;								\
+    mp_ptr    up, tp, ip;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 3);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
+									\
+    MPN_COPY (up, s->xp, s->size);					\
+									\
+    /* normalize the data */						\
+    up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+									\
+    speed_operand_src (s, up, s->size);					\
+    speed_operand_dst (s, tp, s->size);					\
+    speed_operand_dst (s, ip, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (ip, up, s->size, tp);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_BINVERT(function,itchfn)			\
+  {									\
+    long  i;								\
+    mp_ptr    up, tp, ip;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
+									\
+    MPN_COPY (up, s->xp, s->size);					\
+									\
+    /* normalize the data */						\
+    up[0] |= 1;								\
+									\
+    speed_operand_src (s, up, s->size);					\
+    speed_operand_dst (s, tp, s->size);					\
+    speed_operand_dst (s, ip, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (ip, up, s->size, tp);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_SEC_INVERT(function,itchfn)			\
+  {									\
+    long  i;								\
+    mp_ptr    up, mp, tp, ip;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
+									\
+    speed_operand_src (s, up, s->size);					\
+    speed_operand_dst (s, tp, s->size);					\
+    speed_operand_dst (s, ip, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    MPN_COPY (mp, s->yp, s->size);					\
+    /* Must be odd */							\
+    mp[0] |= 1;								\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	MPN_COPY (up, s->xp, s->size);					\
+	function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp);	\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_REDC_1(function)					\
+  {									\
+    unsigned   i;							\
+    mp_ptr     cp, mp, tp, ap;						\
+    mp_limb_t  inv;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
+    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
+									\
+    MPN_COPY (ap,         s->xp, s->size);				\
+    MPN_COPY (ap+s->size, s->xp, s->size);				\
+									\
+    /* modulus must be odd */						\
+    MPN_COPY (mp, s->yp, s->size);					\
+    mp[0] |= 1;								\
+    binvert_limb (inv, mp[0]);						\
+    inv = -inv;								\
+									\
+    speed_operand_src (s, ap, 2*s->size+1);				\
+    speed_operand_dst (s, tp, 2*s->size+1);				\
+    speed_operand_src (s, mp, s->size);					\
+    speed_operand_dst (s, cp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, ap, 2*s->size);					\
+      function (cp, tp, mp, s->size, inv);				\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_REDC_2(function)					\
+  {									\
+    unsigned   i;							\
+    mp_ptr     cp, mp, tp, ap;						\
+    mp_limb_t  invp[2];							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
+    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
+									\
+    MPN_COPY (ap,         s->xp, s->size);				\
+    MPN_COPY (ap+s->size, s->xp, s->size);				\
+									\
+    /* modulus must be odd */						\
+    MPN_COPY (mp, s->yp, s->size);					\
+    mp[0] |= 1;								\
+    mpn_binvert (invp, mp, 2, tp);					\
+    invp[0] = -invp[0]; invp[1] = ~invp[1];				\
+									\
+    speed_operand_src (s, ap, 2*s->size+1);				\
+    speed_operand_dst (s, tp, 2*s->size+1);				\
+    speed_operand_src (s, mp, s->size);					\
+    speed_operand_dst (s, cp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, ap, 2*s->size);					\
+      function (cp, tp, mp, s->size, invp);				\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_REDC_N(function)					\
+  {									\
+    unsigned   i;							\
+    mp_ptr     cp, mp, tp, ap, invp;					\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size > 8);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
+    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
+    SPEED_TMP_ALLOC_LIMBS (invp, s->size,   s->align_wp2); /* align? */	\
+									\
+    MPN_COPY (ap,         s->xp, s->size);				\
+    MPN_COPY (ap+s->size, s->xp, s->size);				\
+									\
+    /* modulus must be odd */						\
+    MPN_COPY (mp, s->yp, s->size);					\
+    mp[0] |= 1;								\
+    mpn_binvert (invp, mp, s->size, tp);				\
+									\
+    speed_operand_src (s, ap, 2*s->size+1);				\
+    speed_operand_dst (s, tp, 2*s->size+1);				\
+    speed_operand_src (s, mp, s->size);					\
+    speed_operand_dst (s, cp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, ap, 2*s->size);					\
+      function (cp, tp, mp, s->size, invp);				\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+#define SPEED_ROUTINE_MPN_POPCOUNT(function)				\
+  {									\
+    unsigned i;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (s->xp, s->size);					\
+    while (--i != 0);							\
+									\
+    return speed_endtime ();						\
+  }
+
+#define SPEED_ROUTINE_MPN_HAMDIST(function)				\
+  {									\
+    unsigned i;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (s->xp, s->yp, s->size);					\
+    while (--i != 0);							\
+									\
+    return speed_endtime ();						\
+  }
+
+
+#define SPEED_ROUTINE_MPZ_UI(function)					\
+  {									\
+    mpz_t     z;							\
+    unsigned  i;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    mpz_init (z);							\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (z, s->size);						\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (z);							\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPZ_FAC_UI(function)    SPEED_ROUTINE_MPZ_UI(function)
+#define SPEED_ROUTINE_MPZ_FIB_UI(function)    SPEED_ROUTINE_MPZ_UI(function)
+#define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function)
+
+
+#define SPEED_ROUTINE_MPZ_2_UI(function)				\
+  {									\
+    mpz_t     z, z2;							\
+    unsigned  i;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    mpz_init (z);							\
+    mpz_init (z2);							\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (z, z2, s->size);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (z);							\
+    mpz_clear (z2);							\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPZ_FIB2_UI(function)    SPEED_ROUTINE_MPZ_2_UI(function)
+#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function)
+
+
+#define SPEED_ROUTINE_MPN_FIB2_UI(function)				\
+  {									\
+    mp_ptr     fp, f1p;							\
+    mp_size_t  alloc;							\
+    unsigned   i;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    TMP_MARK;								\
+    alloc = MPN_FIB2_SIZE (s->size);					\
+    SPEED_TMP_ALLOC_LIMBS (fp,	alloc, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp);			\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (fp, f1p, s->size);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+
+/* Calculate b^e mod m for random b and m of s->size limbs and random e of 6
+   limbs.  m is forced to odd so that redc can be used.  e is limited in
+   size so the calculation doesn't take too long. */
+#define SPEED_ROUTINE_MPZ_POWM(function)				\
+  {									\
+    mpz_t     r, b, e, m;						\
+    unsigned  i;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    mpz_init (r);							\
+    if (s->r < 2)							\
+      mpz_init_set_n (b, s->xp, s->size);				\
+    else								\
+      mpz_init_set_ui (b, s->r);					\
+    mpz_init_set_n (m, s->yp, s->size);					\
+    mpz_setbit (m, 0);	/* force m to odd */				\
+    mpz_init_set_n (e, s->xp_block, 6);					\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (r, b, e, m);						\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (r);							\
+    mpz_clear (b);							\
+    mpz_clear (e);							\
+    mpz_clear (m);							\
+    return t;								\
+  }
+
+/* (m-2)^0xAAAAAAAA mod m */
+#define SPEED_ROUTINE_MPZ_POWM_UI(function)				\
+  {									\
+    mpz_t     r, b, m;							\
+    unsigned  long  e;							\
+    unsigned  i;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    mpz_init (r);							\
+									\
+    /* force m to odd */						\
+    mpz_init (m);							\
+    mpz_set_n (m, s->xp, s->size);					\
+    PTR(m)[0] |= 1;							\
+									\
+    e = (~ (unsigned long) 0) / 3;					\
+    if (s->r != 0)							\
+      e = s->r;								\
+									\
+    mpz_init_set (b, m);						\
+    mpz_sub_ui (b, b, 2);						\
+/* printf ("%X\n", mpz_get_ui(m)); */					\
+    i = s->reps;							\
+    speed_starttime ();							\
+    do									\
+      function (r, b, e, m);						\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (r);							\
+    mpz_clear (b);							\
+    mpz_clear (m);							\
+    return t;								\
+  }
+
+
+#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call)				\
+  {									\
+    mp_ptr    wp, wp2, xp, yp;						\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp,	s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);			\
+    xp = s->xp;								\
+    yp = s->yp;								\
+									\
+    if (s->r == 0)	;						\
+    else if (s->r == 1) { xp = wp;	      }				\
+    else if (s->r == 2) {	    yp = wp2; }				\
+    else if (s->r == 3) { xp = wp;  yp = wp2; }				\
+    else if (s->r == 4) { xp = wp2; yp = wp;  }				\
+    else {								\
+      TMP_FREE;								\
+      return -1.0;							\
+    }									\
+    if (xp != s->xp) MPN_COPY (xp, s->xp, s->size);			\
+    if (yp != s->yp) MPN_COPY (yp, s->yp, s->size);			\
+									\
+    speed_operand_src (s, xp, s->size);					\
+    speed_operand_src (s, yp, s->size);					\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_operand_dst (s, wp2, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_ADDSUB_N(function)				\
+  SPEED_ROUTINE_MPN_ADDSUB_CALL						\
+    (function (wp, wp2, xp, yp, s->size));
+
+#define SPEED_ROUTINE_MPN_ADDSUB_NC(function)				\
+  SPEED_ROUTINE_MPN_ADDSUB_CALL						\
+    (function (wp, wp2, xp, yp, s->size, 0));
+
+
+/* Doing an Nx1 gcd with the given r. */
+#define SPEED_ROUTINE_MPN_GCD_1N(function)				\
+  {									\
+    mp_ptr    xp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+    SPEED_RESTRICT_COND (s->r != 0);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);			\
+    MPN_COPY (xp, s->xp, s->size);					\
+    xp[0] |= refmpn_zero_p (xp, s->size);				\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (xp, s->size, s->r);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+/* SPEED_BLOCK_SIZE many one GCDs of s->size bits each. */
+
+#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call)			\
+  {									\
+    unsigned  i, j;							\
+    mp_ptr    px, py;							\
+    mp_limb_t x_mask, y_mask;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+    SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb);			\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp);		\
+    MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE);			\
+    MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE);			\
+									\
+    x_mask = MP_LIMB_T_LOWBITMASK (s->size);				\
+    y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size);		\
+    for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
+      {									\
+	px[i] &= x_mask; px[i] += (px[i] == 0);				\
+	py[i] &= y_mask; py[i] += (py[i] == 0);				\
+	setup;								\
+      }									\
+									\
+    speed_operand_src (s, px, SPEED_BLOCK_SIZE);			\
+    speed_operand_src (s, py, SPEED_BLOCK_SIZE);			\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	j = SPEED_BLOCK_SIZE;						\
+	do								\
+	  {								\
+	    call;							\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE;					\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_GCD_1(function)				\
+  SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1]))
+
+#define SPEED_ROUTINE_MPN_GCD_11(function)				\
+  SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1),		\
+			       function (px[j-1], py[j-1]))
+
+/* Multiply limbs by (B+1). Then we get a gcd exceeding one limb, so
+   we can measure gcd_22 loop only, without gcd_11. */
+#define SPEED_ROUTINE_MPN_GCD_22(function)				\
+  SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1),		\
+			       function (px[j-1], px[j-1], py[j-1], py[j-1]))
+
+#define SPEED_ROUTINE_MPN_JACBASE(function)				\
+  SPEED_ROUTINE_MPN_GCD_1_CALL						\
+    ({									\
+       /* require x<y, y odd, y!=1 */					\
+       px[i] %= py[i];							\
+       px[i] |= 1;							\
+       py[i] |= 1;							\
+       if (py[i]==1) py[i]=3;						\
+     },									\
+     function (px[j-1], py[j-1], 0))
+
+#define SPEED_ROUTINE_MPN_HGCD2(function)				\
+  {									\
+    unsigned   i, j;							\
+    struct hgcd_matrix1 m = {{{0,0},{0,0}}};				\
+    double     t;							\
+									\
+    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);		\
+    speed_operand_src (s, s->yp_block, SPEED_BLOCK_SIZE);		\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    mp_limb_t chain = 0;						\
+    do									\
+      {									\
+	for (j = 0; j < SPEED_BLOCK_SIZE; j+= 2)			\
+	  {								\
+	    /* randomized but successively dependent */			\
+	    function (s->xp_block[j] | GMP_NUMB_HIGHBIT,		\
+		      s->xp_block[j+1] + chain,				\
+		      s->yp_block[j] | GMP_NUMB_HIGHBIT,		\
+		      s->yp_block[j+1], &m);				\
+	    chain += m.u[0][0];						\
+	  }								\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    /* make sure the compiler won't optimize away chain */		\
+    noop_1 (chain);							\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE / 2;				\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc)			\
+  {									\
+    mp_size_t hgcd_init_itch, hgcd_itch;				\
+    mp_ptr ap, bp, wp, tmp1;						\
+    struct hgcd_matrix hgcd;						\
+    int res;								\
+    unsigned i;								\
+    double t;								\
+    TMP_DECL;								\
+									\
+    if (s->size < 2)							\
+      return -1;							\
+									\
+    TMP_MARK;								\
+									\
+    SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
+									\
+    s->xp[s->size - 1] |= 1;						\
+    s->yp[s->size - 1] |= 1;						\
+									\
+    hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
+    hgcd_itch = itchfunc (s->size);					\
+									\
+    SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp, hgcd_itch, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, ap, s->size + 1);				\
+    speed_operand_dst (s, bp, s->size + 1);				\
+    speed_operand_dst (s, wp, hgcd_itch);				\
+    speed_operand_dst (s, tmp1, hgcd_init_itch);			\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	MPN_COPY (ap, s->xp, s->size);					\
+	MPN_COPY (bp, s->yp, s->size);					\
+	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
+	res = func (ap, bp, s->size, &hgcd, wp);			\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc)		\
+  {									\
+    mp_size_t hgcd_init_itch, hgcd_step_itch;				\
+    mp_ptr ap, bp, wp, tmp1;						\
+    struct hgcd_matrix hgcd;						\
+    mp_size_t p = s->size/2;						\
+    int res;								\
+    unsigned i;								\
+    double t;								\
+    TMP_DECL;								\
+									\
+    if (s->size < 2)							\
+      return -1;							\
+									\
+    TMP_MARK;								\
+									\
+    SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
+									\
+    s->xp[s->size - 1] |= 1;						\
+    s->yp[s->size - 1] |= 1;						\
+									\
+    hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
+    hgcd_step_itch = itchfunc (s->size, p);				\
+									\
+    SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, ap, s->size + 1);				\
+    speed_operand_dst (s, bp, s->size + 1);				\
+    speed_operand_dst (s, wp, hgcd_step_itch);				\
+    speed_operand_dst (s, tmp1, hgcd_init_itch);			\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	MPN_COPY (ap, s->xp, s->size);					\
+	MPN_COPY (bp, s->yp, s->size);					\
+	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
+	res = func (&hgcd, ap, bp, s->size, p, wp);			\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+/* Run some GCDs of s->size limbs each.  The number of different data values
+   is decreased as s->size**2, since GCD is a quadratic algorithm.
+   SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
+   though, because the plain gcd is about twice as fast as gcdext.  */
+
+#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call)			\
+  {									\
+    unsigned  i;							\
+    mp_size_t j, pieces, psize;						\
+    mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp,   s->size+1, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp2,  s->size+1, s->align_wp2);		\
+									\
+    pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size;		\
+    pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size);			\
+    pieces = MAX (pieces, 1);						\
+									\
+    psize = pieces * s->size;						\
+    px = TMP_ALLOC_LIMBS (psize);					\
+    py = TMP_ALLOC_LIMBS (psize);					\
+    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
+    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
+									\
+    /* Requirements: x >= y, y must be odd, high limbs != 0.		\
+       No need to ensure random numbers are really great.  */		\
+    for (j = 0; j < pieces; j++)					\
+      {									\
+	mp_ptr	x = px + j * s->size;					\
+	mp_ptr	y = py + j * s->size;					\
+	if (x[s->size - 1] == 0) x[s->size - 1] = 1;			\
+	if (y[s->size - 1] == 0) y[s->size - 1] = 1;			\
+									\
+	if (x[s->size - 1] < y[s->size - 1])				\
+	  MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]);		\
+	else if (x[s->size - 1] == y[s->size - 1])			\
+	  {								\
+	    x[s->size - 1] = 2;						\
+	    y[s->size - 1] = 1;						\
+	  }								\
+	y[0] |= 1;							\
+      }									\
+									\
+    speed_operand_src (s, px, psize);					\
+    speed_operand_src (s, py, psize);					\
+    speed_operand_dst (s, xtmp, s->size);				\
+    speed_operand_dst (s, ytmp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	j = pieces;							\
+	do								\
+	  {								\
+	    MPN_COPY (xtmp, px+(j - 1)*s->size, s->size);		\
+	    MPN_COPY (ytmp, py+(j - 1)*s->size, s->size);		\
+	    call;							\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+									\
+    s->time_divisor = pieces;						\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_GCD(function)	\
+  SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))
+
+#define SPEED_ROUTINE_MPN_GCDEXT(function)				\
+  SPEED_ROUTINE_MPN_GCD_CALL						\
+    (4, { mp_size_t  wp2size;						\
+	  function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
+
+
+#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function)				\
+  {									\
+    unsigned  i;							\
+    mp_size_t j, pieces, psize, wp2size;				\
+    mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+									\
+    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
+    MPN_COPY (xtmp, s->xp, s->size);					\
+    MPN_COPY (ytmp, s->yp, s->size);					\
+									\
+    SPEED_TMP_ALLOC_LIMBS (wp,	s->size+1, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2);		\
+									\
+    pieces = SPEED_BLOCK_SIZE / 3;					\
+    psize = 3 * pieces;							\
+    px = TMP_ALLOC_LIMBS (psize);					\
+    py = TMP_ALLOC_LIMBS (psize);					\
+    MPN_COPY (px, s->xp_block, psize);					\
+    MPN_COPY (py, s->yp_block, psize);					\
+									\
+    /* x must have at least as many bits as y,				\
+       high limbs must be non-zero */					\
+    for (j = 0; j < pieces; j++)					\
+      {									\
+	mp_ptr	x = px+3*j;						\
+	mp_ptr	y = py+3*j;						\
+	x[2] += (x[2] == 0);						\
+	y[2] += (y[2] == 0);						\
+	if (x[2] < y[2])						\
+	  MP_LIMB_T_SWAP (x[2], y[2]);					\
+      }									\
+									\
+    speed_operand_src (s, px, psize);					\
+    speed_operand_src (s, py, psize);					\
+    speed_operand_dst (s, xtmp, s->size);				\
+    speed_operand_dst (s, ytmp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	mp_ptr	x = px;							\
+	mp_ptr	y = py;							\
+	mp_ptr	xth = &xtmp[s->size-3];					\
+	mp_ptr	yth = &ytmp[s->size-3];					\
+	j = pieces;							\
+	do								\
+	  {								\
+	    xth[0] = x[0], xth[1] = x[1], xth[2] = x[2];		\
+	    yth[0] = y[0], yth[1] = y[1], yth[2] = y[2];		\
+									\
+	    ytmp[0] |= 1; /* y must be odd, */				\
+									\
+	    function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size);	\
+									\
+	    x += 3;							\
+	    y += 3;							\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+									\
+    s->time_divisor = pieces;						\
+    return t;								\
+  }
+
+/* Calculate nextprime(n) for random n of s->size bits (not limbs). */
+#define SPEED_ROUTINE_MPZ_NEXTPRIME(function)				\
+  {									\
+    unsigned  i, j;							\
+    mpz_t     wp, n;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 10);				\
+									\
+    mpz_init (wp);							\
+    mpz_init_set_n (n, s->xp, s->size);					\
+    /* limit to s->size bits, as this function is very slow */		\
+    mpz_tdiv_r_2exp (n, n, s->size);					\
+    /* set high bits so operand and result are genaral s->size bits */	\
+    mpz_setbit (n, s->size - 1);					\
+    mpz_clrbit (n, s->size - 2);					\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+        /* nextprime timing is variable, so average over many calls */	\
+        j = SPEED_BLOCK_SIZE - 1;					\
+        /* starts on random, after measures prime to next prime */	\
+        function (wp, n);						\
+        do								\
+          {								\
+            function (wp, wp);						\
+          }								\
+        while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (wp);							\
+    mpz_clear (n);							\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE;					\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPZ_JACOBI(function)				\
+  {									\
+    mpz_t     a, b;							\
+    unsigned  i;							\
+    mp_size_t j, pieces, psize;						\
+    mp_ptr    px, py;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    TMP_MARK;								\
+    pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1);			\
+    pieces = MAX (pieces, 1);						\
+    s->time_divisor = pieces;						\
+									\
+    psize = pieces * s->size;						\
+    px = TMP_ALLOC_LIMBS (psize);					\
+    py = TMP_ALLOC_LIMBS (psize);					\
+    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
+    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
+									\
+    for (j = 0; j < pieces; j++)					\
+      {									\
+	mp_ptr	x = px+j*s->size;					\
+	mp_ptr	y = py+j*s->size;					\
+									\
+	/* y odd */							\
+	y[0] |= 1;							\
+									\
+	/* high limbs non-zero */					\
+	if (x[s->size-1] == 0) x[s->size-1] = 1;			\
+	if (y[s->size-1] == 0) y[s->size-1] = 1;			\
+      }									\
+									\
+    SIZ(a) = s->size;							\
+    SIZ(b) = s->size;							\
+									\
+    speed_operand_src (s, px, psize);					\
+    speed_operand_src (s, py, psize);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	j = pieces;							\
+	do								\
+	  {								\
+	    PTR(a) = px+(j-1)*s->size;					\
+	    PTR(b) = py+(j-1)*s->size;					\
+	    function (a, b);						\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_DIVREM_2(function)				\
+  {									\
+    mp_ptr    wp, xp;							\
+    mp_limb_t yp[2];							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    /* source is destroyed */						\
+    MPN_COPY (xp, s->xp, s->size);					\
+									\
+    /* divisor must be normalized */					\
+    MPN_COPY (yp, s->yp_block, 2);					\
+    yp[1] |= GMP_NUMB_HIGHBIT;						\
+									\
+    speed_operand_src (s, xp, s->size);					\
+    speed_operand_src (s, yp, 2);					\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, 0, xp, s->size, yp);				\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_DIV_QR_1(function)				\
+  {									\
+    mp_ptr    wp, xp;							\
+    mp_limb_t d;							\
+    mp_limb_t r;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    d = s->r;								\
+    if (d == 0)								\
+      d = 1;								\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      r = function (wp, wp+s->size-1, s->xp, s->size, d);		\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_DIV_QR_1N_PI1(function)			\
+  {									\
+    mp_ptr    wp, xp;							\
+    mp_limb_t d, dinv;							\
+    mp_limb_t r;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    d = s->r;								\
+    /* divisor must be normalized */					\
+    SPEED_RESTRICT_COND (d & GMP_NUMB_HIGHBIT);				\
+    invert_limb (dinv, d);						\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      r = function (wp, s->xp, s->size, 0, d, dinv);			\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm)			\
+  {									\
+    mp_ptr    wp, xp;							\
+    mp_limb_t yp[2];							\
+    mp_limb_t rp[2];							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    /* divisor must be normalized */					\
+    MPN_COPY (yp, s->yp_block, 2);					\
+    if (norm)								\
+      yp[1] |= GMP_NUMB_HIGHBIT;					\
+    else								\
+      {									\
+	yp[1] &= ~GMP_NUMB_HIGHBIT;					\
+	if (yp[1] == 0)							\
+	  yp[1] = 1;							\
+      }									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, yp, 2);					\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_operand_dst (s, rp, 2);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, rp, s->xp, s->size, yp);				\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MODLIMB_INVERT(function)				\
+  {									\
+    unsigned   i, j;							\
+    mp_ptr     xp;							\
+    mp_limb_t  n = 1;							\
+    double     t;							\
+									\
+    xp = s->xp_block-1;							\
+									\
+    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);		\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	j = SPEED_BLOCK_SIZE;						\
+	do								\
+	  {								\
+	    /* randomized but successively dependent */			\
+	    n += (xp[j] << 1);						\
+									\
+	    function (n, n);						\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    /* make sure the compiler won't optimize away n */			\
+    noop_1 (n);								\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE;					\
+    return t;								\
+  }
+
+
+#define SPEED_ROUTINE_MPN_SQRTROOT_CALL(call)				\
+  {									\
+    mp_ptr    wp, wp2;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp,	s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_operand_dst (s, wp2, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+/* Calculate worst case for perfect_power
+   Worst case is multiple prime factors larger than trial div limit. */
+#define SPEED_ROUTINE_MPN_PERFECT_POWER(function)		 	\
+  {									\
+    mpz_t     r;							\
+    unsigned  i, power;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 10);				\
+									\
+    mpz_init (r);							\
+    power = s->size * GMP_NUMB_BITS / 17;				\
+    mpz_ui_pow_ui(r, (1 << 17) - 1, power - 1);				\
+    mpz_mul_ui(r, r, (1 << 16) + 1);	/* larger than 1000th prime */	\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (PTR(r), SIZ(r));					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (r);							\
+    return t;								\
+  }
+
+/* Calculate worst case (larger prime) for perfect_square */
+#define SPEED_ROUTINE_MPN_PERFECT_SQUARE(function)			\
+  {									\
+    mpz_t     r;							\
+    unsigned  i;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+    mpz_init_set_n (r, s->xp, s->size / 2);				\
+    mpz_setbit (r, s->size * GMP_NUMB_BITS / 2 - 1);			\
+    mpz_mul (r, r, r);							\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (PTR(r), SIZ(r));					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (r);							\
+    return t;								\
+  }
+
+
+/* s->size controls the number of limbs in the input, s->r is the base, or
+   decimal by default. */
+#define SPEED_ROUTINE_MPN_GET_STR(function)				\
+  {									\
+    unsigned char *wp;							\
+    mp_size_t wn;							\
+    mp_ptr xp;								\
+    int base;								\
+    unsigned i;								\
+    double t;								\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    base = s->r == 0 ? 10 : s->r;					\
+    SPEED_RESTRICT_COND (base >= 2 && base <= 256);			\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (xp, s->size + 1, s->align_xp);		\
+									\
+    MPN_SIZEINBASE (wn, s->xp, s->size, base);				\
+    wp = (unsigned char *) TMP_ALLOC (wn);				\
+									\
+    /* use this during development to guard against overflowing wp */	\
+    /*									\
+    MPN_COPY (xp, s->xp, s->size);					\
+    ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wn);		\
+    */									\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, xp, s->size);					\
+    speed_operand_dst (s, (mp_ptr) wp, wn/GMP_LIMB_BYTES);		\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	MPN_COPY (xp, s->xp, s->size);					\
+	function (wp, base, xp, s->size);				\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+/* s->size controls the number of digits in the input, s->r is the base, or
+   decimal by default. */
+#define SPEED_ROUTINE_MPN_SET_STR_CALL(call)				\
+  {									\
+    unsigned char *xp;							\
+    mp_ptr     wp;							\
+    mp_size_t  wn;							\
+    unsigned   i;							\
+    int        base;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    base = s->r == 0 ? 10 : s->r;					\
+    SPEED_RESTRICT_COND (base >= 2 && base <= 256);			\
+									\
+    TMP_MARK;								\
+									\
+    xp = (unsigned char *) TMP_ALLOC (s->size);				\
+    for (i = 0; i < s->size; i++)					\
+      xp[i] = s->xp[i] % base;						\
+									\
+    LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);			\
+    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);			\
+									\
+    /* use this during development to check wn is big enough */		\
+    /*									\
+    ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn);		\
+    */									\
+									\
+    speed_operand_src (s, (mp_ptr) xp, s->size/GMP_LIMB_BYTES);	\
+    speed_operand_dst (s, wp, wn);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+/* Run an accel gcd find_a() function over various data values.  A set of
+   values is used in case some run particularly fast or slow.  The size
+   parameter is ignored, the amount of data tested is fixed.  */
+
+#define SPEED_ROUTINE_MPN_GCD_FINDA(function)				\
+  {									\
+    unsigned  i, j;							\
+    mp_limb_t cp[SPEED_BLOCK_SIZE][2];					\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    TMP_MARK;								\
+									\
+    /* low must be odd, high must be non-zero */			\
+    for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
+      {									\
+	cp[i][0] = s->xp_block[i] | 1;					\
+	cp[i][1] = s->yp_block[i] + (s->yp_block[i] == 0);		\
+      }									\
+									\
+    speed_operand_src (s, &cp[0][0], 2*SPEED_BLOCK_SIZE);		\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	j = SPEED_BLOCK_SIZE;						\
+	do								\
+	  {								\
+	    function (cp[j-1]);						\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE;					\
+    return t;								\
+  }
+
+
+/* "call" should do "count_foo_zeros(c,n)".
+   Give leading=1 if foo is leading zeros, leading=0 for trailing.
+   Give zero=1 if n=0 is allowed in the call, zero=0 if not.  */
+
+#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero)			\
+  {									\
+    mp_ptr     xp;							\
+    int        i, c;							\
+    unsigned   j;							\
+    mp_limb_t  n;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp);		\
+									\
+    if (! speed_routine_count_zeros_setup (s, xp, leading, zero))	\
+      return -1.0;							\
+    speed_operand_src (s, xp, SPEED_BLOCK_SIZE);			\
+    speed_cache_fill (s);						\
+									\
+    c = 0;								\
+    speed_starttime ();							\
+    j = s->reps;							\
+    do {								\
+      for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
+	{								\
+	  n = xp[i];							\
+	  n ^= c;							\
+
+#define SPEED_ROUTINE_COUNT_ZEROS_B()					\
+	}								\
+    } while (--j != 0);							\
+    t = speed_endtime ();						\
+									\
+    /* don't let c go dead */						\
+    noop_1 (c);								\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE;					\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }									\
+
+#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero)		\
+  do {									\
+    SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero);			\
+    call;								\
+    SPEED_ROUTINE_COUNT_ZEROS_B ();					\
+  } while (0)								\
+
+#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero)			\
+  SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero)
+#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun)				\
+  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0)
+
+#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero)			\
+  SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)
+#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(call)			\
+  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
+
+
+#define SPEED_ROUTINE_INVERT_LIMB_CALL(call)				\
+  {									\
+    unsigned   i, j;							\
+    mp_limb_t  d, dinv=0;						\
+    mp_ptr     xp = s->xp_block - 1;					\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE;					\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	j = SPEED_BLOCK_SIZE;						\
+	do								\
+	  {								\
+	    d = dinv ^ xp[j];						\
+	    d |= GMP_LIMB_HIGHBIT;					\
+	    do { call; } while (0);					\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+									\
+    /* don't let the compiler optimize everything away */		\
+    noop_1 (dinv);							\
+									\
+    return speed_endtime();						\
+  }
+
+
+#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function)			\
+  {									\
+    unsigned  i;							\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function ();							\
+    while (--i != 0);							\
+    return speed_endtime ();						\
+  }
+
+
+#define SPEED_ROUTINE_MPN_ZERO_CALL(call)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_ZERO(function)				\
+  SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size))
+
+
+#endif