Revert "Remove gmp from AOS" This reverts commit f37c97684f0910a3f241394549392f00145ab0f7. We need gmp for SymEngine for symbolicmanipultion in C++ Change-Id: Ia13216d1715cf96944f7b4f422b7a799f921d4a4 Signed-off-by: Austin Schuh <austin.linux@gmail.com>

commit: bb1338cd84d865f1eb7a653969204a06bba8261c [log] [tgz]
author: Austin Schuh <austin.linux@gmail.com> Sat Jun 15 19:31:16 2024 -0700
committer: Austin Schuh <austin.linux@gmail.com> Wed Jun 19 19:49:35 2024 -0700
tree: e291dbf975ebfaebab464c64131d63191b8b0e39
parent: e4a8c6c24f636a763b512a3f87dee2225762d817 [diff]
diff --git a/third_party/gmp/tune/Makefile.am b/third_party/gmp/tune/Makefile.am
new file mode 100644
index 0000000..9e38256
--- /dev/null
+++ b/third_party/gmp/tune/Makefile.am

@@ -0,0 +1,186 @@
+## Process this file with automake to generate Makefile.in
+
+# Copyright 2000-2003, 2005-2011 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/tests
+AM_LDFLAGS = -no-install
+
+EXTRA_DIST = alpha.asm pentium.asm sparcv9.asm hppa.asm hppa2.asm hppa2w.asm \
+  ia64.asm powerpc.asm powerpc64.asm x86_64.asm many.pl
+noinst_HEADERS = speed.h
+
+# Prefer -static on the speed and tune programs, since that can avoid
+# overheads of shared library linkages on some systems.  Libtool tends to
+# botch -static if configured with --disable-static, perhaps reasonably
+# enough.  In any event under --disable-static the only choice is a dynamic
+# link so there's no point in -static.
+#
+if ENABLE_STATIC
+STATIC = -static
+else
+STATIC =
+endif
+
+
+EXTRA_LTLIBRARIES = libspeed.la
+
+libspeed_la_SOURCES =							\
+  common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c		\
+  div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1_tune.c			\
+  freq.c								\
+  gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c			\
+  hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c	\
+  jacbase1.c jacbase2.c jacbase3.c jacbase4.c				\
+  hgcd2-1.c hgcd2-2.c hgcd2-3.c hgcd2-4.c hgcd2-5.c			\
+  mod_1_div.c mod_1_inv.c mod_1_1-1.c mod_1_1-2.c modlinv.c		\
+  noop.c powm_mod.c powm_redc.c pre_divrem_1.c				\
+  set_strb.c set_strs.c set_strp.c time.c
+
+libspeed_la_DEPENDENCIES = $(SPEED_CYCLECOUNTER_OBJ) \
+  $(top_builddir)/tests/libtests.la $(top_builddir)/libgmp.la
+libspeed_la_LIBADD = $(libspeed_la_DEPENDENCIES) $(LIBM)
+libspeed_la_LDFLAGS = $(STATIC)
+
+$(top_builddir)/tests/libtests.la:
+	cd $(top_builddir)/tests; $(MAKE) $(AM_MAKEFLAGS) libtests.la
+
+
+# The library code is faster static than shared on some systems, so do
+# tuning and measuring with static, since users who care about maximizing
+# speed will be using that.  speed-dynamic exists to show the difference.
+#
+# On Solaris 8, gcc 2.95.2 -static is somehow broken (it creates executables
+# that immediately seg fault), so -all-static is not used.  The only thing
+# -all-static does is make libc static linked as well as libgmp, and that
+# makes a difference only when measuring malloc and friends in the speed
+# program.  This can always be forced with "make speed_LDFLAGS=-all-static
+# ..." if desired, see tune/README.
+
+EXTRA_PROGRAMS = speed speed-dynamic speed-ext tuneup tune-gcd-p
+
+DEPENDENCIES = libspeed.la
+LDADD = $(DEPENDENCIES) $(TUNE_LIBS)
+
+speed_SOURCES = speed.c
+speed_LDFLAGS = $(STATIC)
+
+speed_dynamic_SOURCES = speed.c
+
+speed_ext_SOURCES = speed-ext.c
+speed_ext_LDFLAGS = $(STATIC)
+
+tuneup_SOURCES = tuneup.c hgcd2.c
+nodist_tuneup_SOURCES = sqr_basecase.c fac_ui.c $(TUNE_MPN_SRCS)
+tuneup_DEPENDENCIES = $(TUNE_SQR_OBJ) libspeed.la
+tuneup_LDADD = $(tuneup_DEPENDENCIES) $(TUNE_LIBS)
+tuneup_LDFLAGS = $(STATIC)
+
+tune_gcd_p_SOURCES = tune-gcd-p.c
+tune_gcd_p_DEPENDENCIES = ../mpn/gcd.c
+tune_gcd_p_LDFLAGS = $(STATIC)
+
+
+tune:
+	$(MAKE) $(AM_MAKEFLAGS) tuneup$(EXEEXT)
+	./tuneup
+
+allprogs: $(EXTRA_PROGRAMS)
+
+# $(MANY_CLEAN) and $(MANY_DISTCLEAN) are hooks for many.pl
+CLEANFILES = $(EXTRA_PROGRAMS) $(EXTRA_LTLIBRARIES) \
+	$(TUNE_MPN_SRCS) fac_ui.c sqr_asm.asm \
+	stg.gnuplot stg.data \
+	mtg.gnuplot mtg.data \
+	fibg.gnuplot fibg.data \
+	graph.gnuplot graph.data \
+	$(MANY_CLEAN)
+DISTCLEANFILES = sqr_basecase.c  $(MANY_DISTCLEAN)
+
+
+# Generating these little files at build time seems better than including
+# them in the distribution, since the list can be changed more easily.
+#
+# mpn/generic/tdiv_qr.c uses mpn_divrem_1 and mpn_divrem_2, but only for 1
+# and 2 limb divisors, which are never used during tuning, so it doesn't
+# matter whether it picks up a tuned or untuned version of those.
+#
+# divrem_1 and mod_1 are recompiled renamed to "_tune" to avoid a linking
+# problem.  If a native divrem_1 provides an mpn_divrem_1c entrypoint then
+# common.c will want that, but the generic divrem_1 doesn't provide it,
+# likewise for mod_1.  The simplest way around this is to have the tune
+# build versions renamed suitably.
+#
+# FIXME: Would like say mul_n.c to depend on $(top_builddir)/mul_n.c so the
+# recompiled object will be rebuilt if that file changes.
+
+TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c
+TUNE_MPN_SRCS_BASIC = div_qr_2.c bdiv_q.c bdiv_qr.c			\
+  dcpi1_div_qr.c dcpi1_divappr_q.c dcpi1_bdiv_qr.c dcpi1_bdiv_q.c	\
+  invertappr.c invert.c binvert.c divrem_2.c gcd.c gcdext.c		\
+  get_str.c set_str.c matrix22_mul.c					\
+  hgcd.c hgcd_appr.c hgcd_reduce.c					\
+  mul_n.c sqr.c sec_powm.c						\
+  mullo_n.c mul_fft.c mul.c tdiv_qr.c mulmod_bnm1.c sqrmod_bnm1.c	\
+  mulmid.c mulmid_n.c toom42_mulmid.c sqrlo.c sqrlo_basecase.c		\
+  nussbaumer_mul.c toom6h_mul.c toom8h_mul.c toom6_sqr.c toom8_sqr.c	\
+  toom22_mul.c toom2_sqr.c toom33_mul.c toom3_sqr.c toom44_mul.c toom4_sqr.c
+
+$(TUNE_MPN_SRCS_BASIC):
+	for i in $(TUNE_MPN_SRCS_BASIC); do \
+	  echo "#define TUNE_PROGRAM_BUILD 1" >$$i; \
+	  echo "#include \"mpn/generic/$$i\"" >>$$i; \
+	done
+
+divrem_1.c:
+	echo "#define TUNE_PROGRAM_BUILD 1"                >divrem_1.c
+	echo "#define __gmpn_divrem_1  mpn_divrem_1_tune" >>divrem_1.c
+	echo "#include \"mpn/generic/divrem_1.c\""        >>divrem_1.c
+
+mod_1.c:
+	echo "#define TUNE_PROGRAM_BUILD 1"          >mod_1.c
+	echo "#define __gmpn_mod_1  mpn_mod_1_tune" >>mod_1.c
+	echo "#include \"mpn/generic/mod_1.c\""     >>mod_1.c
+
+sqr_asm.asm: $(top_builddir)/mpn/sqr_basecase.asm
+	echo 'define(SQR_TOOM2_THRESHOLD_OVERRIDE,SQR_TOOM2_THRESHOLD_MAX)' >sqr_asm.asm
+	echo 'include(../mpn/sqr_basecase.asm)' >>sqr_asm.asm
+
+# FIXME: Should it depend on $(top_builddir)/fac_ui.h too?
+fac_ui.c: $(top_builddir)/mpz/fac_ui.c
+	echo "#define TUNE_PROGRAM_BUILD 1"          >fac_ui.c
+	echo "#define __gmpz_fac_ui mpz_fac_ui_tune" >>fac_ui.c
+	echo "#define __gmpz_oddfac_1 mpz_oddfac_1_tune" >>fac_ui.c
+	echo "#include \"mpz/oddfac_1.c\""           >>fac_ui.c
+	echo "#include \"mpz/fac_ui.c\""             >>fac_ui.c
+
+include ../mpn/Makeasm.am
+
+.NOTPARALLEL:
+

diff --git a/third_party/gmp/tune/Makefile.in b/third_party/gmp/tune/Makefile.in
new file mode 100644
index 0000000..e8eb56c
--- /dev/null
+++ b/third_party/gmp/tune/Makefile.in

@@ -0,0 +1,956 @@
+# Makefile.in generated by automake 1.15 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# Copyright 2000-2003, 2005-2011 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+# Copyright 1996, 1998-2002 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+VPATH = @srcdir@
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+EXTRA_PROGRAMS = speed$(EXEEXT) speed-dynamic$(EXEEXT) \
+	speed-ext$(EXEEXT) tuneup$(EXEEXT) tune-gcd-p$(EXEEXT)
+subdir = tune
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(noinst_HEADERS) \
+	$(am__DIST_COMMON)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+am__DEPENDENCIES_1 =
+am__DEPENDENCIES_2 = $(am__DEPENDENCIES_1) \
+	$(top_builddir)/tests/libtests.la $(top_builddir)/libgmp.la
+am_libspeed_la_OBJECTS = common.lo divrem1div.lo divrem1inv.lo \
+	divrem2div.lo divrem2inv.lo div_qr_1n_pi1_1.lo \
+	div_qr_1n_pi1_2.lo div_qr_1_tune.lo freq.lo gcdext_single.lo \
+	gcdext_double.lo gcdextod.lo gcdextos.lo hgcd_lehmer.lo \
+	hgcd_appr_lehmer.lo hgcd_reduce_1.lo hgcd_reduce_2.lo \
+	jacbase1.lo jacbase2.lo jacbase3.lo jacbase4.lo hgcd2-1.lo \
+	hgcd2-2.lo hgcd2-3.lo hgcd2-4.lo hgcd2-5.lo mod_1_div.lo \
+	mod_1_inv.lo mod_1_1-1.lo mod_1_1-2.lo modlinv.lo noop.lo \
+	powm_mod.lo powm_redc.lo pre_divrem_1.lo set_strb.lo \
+	set_strs.lo set_strp.lo time.lo
+libspeed_la_OBJECTS = $(am_libspeed_la_OBJECTS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 = 
+libspeed_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(libspeed_la_LDFLAGS) $(LDFLAGS) -o $@
+am_speed_OBJECTS = speed.$(OBJEXT)
+speed_OBJECTS = $(am_speed_OBJECTS)
+speed_LDADD = $(LDADD)
+speed_DEPENDENCIES = $(DEPENDENCIES) $(am__DEPENDENCIES_1)
+speed_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(speed_LDFLAGS) $(LDFLAGS) -o $@
+am_speed_dynamic_OBJECTS = speed.$(OBJEXT)
+speed_dynamic_OBJECTS = $(am_speed_dynamic_OBJECTS)
+speed_dynamic_LDADD = $(LDADD)
+speed_dynamic_DEPENDENCIES = $(DEPENDENCIES) $(am__DEPENDENCIES_1)
+am_speed_ext_OBJECTS = speed-ext.$(OBJEXT)
+speed_ext_OBJECTS = $(am_speed_ext_OBJECTS)
+speed_ext_LDADD = $(LDADD)
+speed_ext_DEPENDENCIES = $(DEPENDENCIES) $(am__DEPENDENCIES_1)
+speed_ext_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(speed_ext_LDFLAGS) $(LDFLAGS) -o $@
+am_tune_gcd_p_OBJECTS = tune-gcd-p.$(OBJEXT)
+tune_gcd_p_OBJECTS = $(am_tune_gcd_p_OBJECTS)
+tune_gcd_p_LDADD = $(LDADD)
+tune_gcd_p_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(tune_gcd_p_LDFLAGS) $(LDFLAGS) -o $@
+am_tuneup_OBJECTS = tuneup.$(OBJEXT) hgcd2.$(OBJEXT)
+am__objects_1 = div_qr_2.$(OBJEXT) bdiv_q.$(OBJEXT) bdiv_qr.$(OBJEXT) \
+	dcpi1_div_qr.$(OBJEXT) dcpi1_divappr_q.$(OBJEXT) \
+	dcpi1_bdiv_qr.$(OBJEXT) dcpi1_bdiv_q.$(OBJEXT) \
+	invertappr.$(OBJEXT) invert.$(OBJEXT) binvert.$(OBJEXT) \
+	divrem_2.$(OBJEXT) gcd.$(OBJEXT) gcdext.$(OBJEXT) \
+	get_str.$(OBJEXT) set_str.$(OBJEXT) matrix22_mul.$(OBJEXT) \
+	hgcd.$(OBJEXT) hgcd_appr.$(OBJEXT) hgcd_reduce.$(OBJEXT) \
+	mul_n.$(OBJEXT) sqr.$(OBJEXT) sec_powm.$(OBJEXT) \
+	mullo_n.$(OBJEXT) mul_fft.$(OBJEXT) mul.$(OBJEXT) \
+	tdiv_qr.$(OBJEXT) mulmod_bnm1.$(OBJEXT) sqrmod_bnm1.$(OBJEXT) \
+	mulmid.$(OBJEXT) mulmid_n.$(OBJEXT) toom42_mulmid.$(OBJEXT) \
+	sqrlo.$(OBJEXT) sqrlo_basecase.$(OBJEXT) \
+	nussbaumer_mul.$(OBJEXT) toom6h_mul.$(OBJEXT) \
+	toom8h_mul.$(OBJEXT) toom6_sqr.$(OBJEXT) toom8_sqr.$(OBJEXT) \
+	toom22_mul.$(OBJEXT) toom2_sqr.$(OBJEXT) toom33_mul.$(OBJEXT) \
+	toom3_sqr.$(OBJEXT) toom44_mul.$(OBJEXT) toom4_sqr.$(OBJEXT)
+am__objects_2 = $(am__objects_1) divrem_1.$(OBJEXT) mod_1.$(OBJEXT)
+nodist_tuneup_OBJECTS = sqr_basecase.$(OBJEXT) fac_ui.$(OBJEXT) \
+	$(am__objects_2)
+tuneup_OBJECTS = $(am_tuneup_OBJECTS) $(nodist_tuneup_OBJECTS)
+am__DEPENDENCIES_3 = $(am__DEPENDENCIES_1) libspeed.la
+tuneup_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(tuneup_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp =
+am__depfiles_maybe =
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_CFLAGS) $(CFLAGS)
+AM_V_CC = $(am__v_CC_@AM_V@)
+am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
+am__v_CC_0 = @echo "  CC      " $@;
+am__v_CC_1 = 
+CCLD = $(CC)
+LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CCLD = $(am__v_CCLD_@AM_V@)
+am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
+am__v_CCLD_0 = @echo "  CCLD    " $@;
+am__v_CCLD_1 = 
+SOURCES = $(libspeed_la_SOURCES) $(speed_SOURCES) \
+	$(speed_dynamic_SOURCES) $(speed_ext_SOURCES) \
+	$(tune_gcd_p_SOURCES) $(tuneup_SOURCES) \
+	$(nodist_tuneup_SOURCES)
+DIST_SOURCES = $(libspeed_la_SOURCES) $(speed_SOURCES) \
+	$(speed_dynamic_SOURCES) $(speed_ext_SOURCES) \
+	$(tune_gcd_p_SOURCES) $(tuneup_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+HEADERS = $(noinst_HEADERS)
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
+ETAGS = etags
+CTAGS = ctags
+am__DIST_COMMON = $(srcdir)/../mpn/Makeasm.am $(srcdir)/Makefile.in \
+	README
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ABI = @ABI@
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AS = @AS@
+ASMFLAGS = @ASMFLAGS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@
+CC = @CC@
+CCAS = @CCAS@
+CC_FOR_BUILD = @CC_FOR_BUILD@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CPP_FOR_BUILD = @CPP_FOR_BUILD@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFN_LONG_LONG_LIMB = @DEFN_LONG_LONG_LIMB@
+DEFS = @DEFS@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+EXEEXT_FOR_BUILD = @EXEEXT_FOR_BUILD@
+FGREP = @FGREP@
+GMP_LDFLAGS = @GMP_LDFLAGS@
+GMP_LIMB_BITS = @GMP_LIMB_BITS@
+GMP_NAIL_BITS = @GMP_NAIL_BITS@
+GREP = @GREP@
+HAVE_CLOCK_01 = @HAVE_CLOCK_01@
+HAVE_CPUTIME_01 = @HAVE_CPUTIME_01@
+HAVE_GETRUSAGE_01 = @HAVE_GETRUSAGE_01@
+HAVE_GETTIMEOFDAY_01 = @HAVE_GETTIMEOFDAY_01@
+HAVE_HOST_CPU_FAMILY_power = @HAVE_HOST_CPU_FAMILY_power@
+HAVE_HOST_CPU_FAMILY_powerpc = @HAVE_HOST_CPU_FAMILY_powerpc@
+HAVE_SIGACTION_01 = @HAVE_SIGACTION_01@
+HAVE_SIGALTSTACK_01 = @HAVE_SIGALTSTACK_01@
+HAVE_SIGSTACK_01 = @HAVE_SIGSTACK_01@
+HAVE_STACK_T_01 = @HAVE_STACK_T_01@
+HAVE_SYS_RESOURCE_H_01 = @HAVE_SYS_RESOURCE_H_01@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LEX = @LEX@
+LEXLIB = @LEXLIB@
+LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@
+LIBCURSES = @LIBCURSES@
+LIBGMPXX_LDFLAGS = @LIBGMPXX_LDFLAGS@
+LIBGMP_DLL = @LIBGMP_DLL@
+LIBGMP_LDFLAGS = @LIBGMP_LDFLAGS@
+LIBM = @LIBM@
+LIBM_FOR_BUILD = @LIBM_FOR_BUILD@
+LIBOBJS = @LIBOBJS@
+LIBREADLINE = @LIBREADLINE@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
+M4 = @M4@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+SPEED_CYCLECOUNTER_OBJ = @SPEED_CYCLECOUNTER_OBJ@
+STRIP = @STRIP@
+TAL_OBJECT = @TAL_OBJECT@
+TUNE_LIBS = @TUNE_LIBS@
+TUNE_SQR_OBJ = @TUNE_SQR_OBJ@
+U_FOR_BUILD = @U_FOR_BUILD@
+VERSION = @VERSION@
+WITH_READLINE_01 = @WITH_READLINE_01@
+YACC = @YACC@
+YFLAGS = @YFLAGS@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+am__leading_dot = @am__leading_dot@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+gmp_srclinks = @gmp_srclinks@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+mpn_objects = @mpn_objects@
+mpn_objs_in_libgmp = @mpn_objs_in_libgmp@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/tests
+AM_LDFLAGS = -no-install
+EXTRA_DIST = alpha.asm pentium.asm sparcv9.asm hppa.asm hppa2.asm hppa2w.asm \
+  ia64.asm powerpc.asm powerpc64.asm x86_64.asm many.pl
+
+noinst_HEADERS = speed.h
+@ENABLE_STATIC_FALSE@STATIC = 
+
+# Prefer -static on the speed and tune programs, since that can avoid
+# overheads of shared library linkages on some systems.  Libtool tends to
+# botch -static if configured with --disable-static, perhaps reasonably
+# enough.  In any event under --disable-static the only choice is a dynamic
+# link so there's no point in -static.
+#
+@ENABLE_STATIC_TRUE@STATIC = -static
+EXTRA_LTLIBRARIES = libspeed.la
+libspeed_la_SOURCES = \
+  common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c		\
+  div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1_tune.c			\
+  freq.c								\
+  gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c			\
+  hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c	\
+  jacbase1.c jacbase2.c jacbase3.c jacbase4.c				\
+  hgcd2-1.c hgcd2-2.c hgcd2-3.c hgcd2-4.c hgcd2-5.c			\
+  mod_1_div.c mod_1_inv.c mod_1_1-1.c mod_1_1-2.c modlinv.c		\
+  noop.c powm_mod.c powm_redc.c pre_divrem_1.c				\
+  set_strb.c set_strs.c set_strp.c time.c
+
+libspeed_la_DEPENDENCIES = $(SPEED_CYCLECOUNTER_OBJ) \
+  $(top_builddir)/tests/libtests.la $(top_builddir)/libgmp.la
+
+libspeed_la_LIBADD = $(libspeed_la_DEPENDENCIES) $(LIBM)
+libspeed_la_LDFLAGS = $(STATIC)
+DEPENDENCIES = libspeed.la
+LDADD = $(DEPENDENCIES) $(TUNE_LIBS)
+speed_SOURCES = speed.c
+speed_LDFLAGS = $(STATIC)
+speed_dynamic_SOURCES = speed.c
+speed_ext_SOURCES = speed-ext.c
+speed_ext_LDFLAGS = $(STATIC)
+tuneup_SOURCES = tuneup.c hgcd2.c
+nodist_tuneup_SOURCES = sqr_basecase.c fac_ui.c $(TUNE_MPN_SRCS)
+tuneup_DEPENDENCIES = $(TUNE_SQR_OBJ) libspeed.la
+tuneup_LDADD = $(tuneup_DEPENDENCIES) $(TUNE_LIBS)
+tuneup_LDFLAGS = $(STATIC)
+tune_gcd_p_SOURCES = tune-gcd-p.c
+tune_gcd_p_DEPENDENCIES = ../mpn/gcd.c
+tune_gcd_p_LDFLAGS = $(STATIC)
+
+# $(MANY_CLEAN) and $(MANY_DISTCLEAN) are hooks for many.pl
+CLEANFILES = $(EXTRA_PROGRAMS) $(EXTRA_LTLIBRARIES) \
+	$(TUNE_MPN_SRCS) fac_ui.c sqr_asm.asm \
+	stg.gnuplot stg.data \
+	mtg.gnuplot mtg.data \
+	fibg.gnuplot fibg.data \
+	graph.gnuplot graph.data \
+	$(MANY_CLEAN)
+
+DISTCLEANFILES = sqr_basecase.c  $(MANY_DISTCLEAN)
+
+# Generating these little files at build time seems better than including
+# them in the distribution, since the list can be changed more easily.
+#
+# mpn/generic/tdiv_qr.c uses mpn_divrem_1 and mpn_divrem_2, but only for 1
+# and 2 limb divisors, which are never used during tuning, so it doesn't
+# matter whether it picks up a tuned or untuned version of those.
+#
+# divrem_1 and mod_1 are recompiled renamed to "_tune" to avoid a linking
+# problem.  If a native divrem_1 provides an mpn_divrem_1c entrypoint then
+# common.c will want that, but the generic divrem_1 doesn't provide it,
+# likewise for mod_1.  The simplest way around this is to have the tune
+# build versions renamed suitably.
+#
+# FIXME: Would like say mul_n.c to depend on $(top_builddir)/mul_n.c so the
+# recompiled object will be rebuilt if that file changes.
+TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c
+TUNE_MPN_SRCS_BASIC = div_qr_2.c bdiv_q.c bdiv_qr.c			\
+  dcpi1_div_qr.c dcpi1_divappr_q.c dcpi1_bdiv_qr.c dcpi1_bdiv_q.c	\
+  invertappr.c invert.c binvert.c divrem_2.c gcd.c gcdext.c		\
+  get_str.c set_str.c matrix22_mul.c					\
+  hgcd.c hgcd_appr.c hgcd_reduce.c					\
+  mul_n.c sqr.c sec_powm.c						\
+  mullo_n.c mul_fft.c mul.c tdiv_qr.c mulmod_bnm1.c sqrmod_bnm1.c	\
+  mulmid.c mulmid_n.c toom42_mulmid.c sqrlo.c sqrlo_basecase.c		\
+  nussbaumer_mul.c toom6h_mul.c toom8h_mul.c toom6_sqr.c toom8_sqr.c	\
+  toom22_mul.c toom2_sqr.c toom33_mul.c toom3_sqr.c toom44_mul.c toom4_sqr.c
+
+
+# COMPILE minus CC.
+#
+COMPILE_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) $(ASMFLAGS)
+
+
+# Flags used for preprocessing (in ansi2knr rules).
+#
+PREPROCESS_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS)
+
+
+# Recent versions of automake (1.5 and up for instance) append automake
+# generated suffixes to this $(SUFFIXES) list.  This is essential for us,
+# since .c must come after .s, .S and .asm.  If .c is before .s, for
+# instance, then in the mpn directory "make" will see add_n.c mentioned in
+# an explicit rule (the ansi2knr stuff) and decide it must have add_n.c,
+# even if add_n.c doesn't exist but add_n.s does.  See GNU make
+# documentation "(make)Implicit Rule Search", part 5c.
+#
+# On IRIX 6 native make this doesn't work properly though.  Somehow .c
+# remains ahead of .s, perhaps because .c.s is a builtin rule.  .asm works
+# fine though, and mpn/mips3 uses this.
+#
+SUFFIXES = .s .S .asm
+
+# can be overridden during development, eg. "make RM_TMP=: mul_1.lo"
+RM_TMP = rm -f
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .s .S .asm .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(srcdir)/../mpn/Makeasm.am $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu --ignore-deps tune/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu --ignore-deps tune/Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(srcdir)/../mpn/Makeasm.am $(am__empty):
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+libspeed.la: $(libspeed_la_OBJECTS) $(libspeed_la_DEPENDENCIES) $(EXTRA_libspeed_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(libspeed_la_LINK)  $(libspeed_la_OBJECTS) $(libspeed_la_LIBADD) $(LIBS)
+
+speed$(EXEEXT): $(speed_OBJECTS) $(speed_DEPENDENCIES) $(EXTRA_speed_DEPENDENCIES) 
+	@rm -f speed$(EXEEXT)
+	$(AM_V_CCLD)$(speed_LINK) $(speed_OBJECTS) $(speed_LDADD) $(LIBS)
+
+speed-dynamic$(EXEEXT): $(speed_dynamic_OBJECTS) $(speed_dynamic_DEPENDENCIES) $(EXTRA_speed_dynamic_DEPENDENCIES) 
+	@rm -f speed-dynamic$(EXEEXT)
+	$(AM_V_CCLD)$(LINK) $(speed_dynamic_OBJECTS) $(speed_dynamic_LDADD) $(LIBS)
+
+speed-ext$(EXEEXT): $(speed_ext_OBJECTS) $(speed_ext_DEPENDENCIES) $(EXTRA_speed_ext_DEPENDENCIES) 
+	@rm -f speed-ext$(EXEEXT)
+	$(AM_V_CCLD)$(speed_ext_LINK) $(speed_ext_OBJECTS) $(speed_ext_LDADD) $(LIBS)
+
+tune-gcd-p$(EXEEXT): $(tune_gcd_p_OBJECTS) $(tune_gcd_p_DEPENDENCIES) $(EXTRA_tune_gcd_p_DEPENDENCIES) 
+	@rm -f tune-gcd-p$(EXEEXT)
+	$(AM_V_CCLD)$(tune_gcd_p_LINK) $(tune_gcd_p_OBJECTS) $(tune_gcd_p_LDADD) $(LIBS)
+
+tuneup$(EXEEXT): $(tuneup_OBJECTS) $(tuneup_DEPENDENCIES) $(EXTRA_tuneup_DEPENDENCIES) 
+	@rm -f tuneup$(EXEEXT)
+	$(AM_V_CCLD)$(tuneup_LINK) $(tuneup_OBJECTS) $(tuneup_LDADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+.c.o:
+	$(AM_V_CC)$(COMPILE) -c -o $@ $<
+
+.c.obj:
+	$(AM_V_CC)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.c.lo:
+	$(AM_V_CC)$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(am__tagged_files)
+	$(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	set x; \
+	here=`pwd`; \
+	$(am__define_uniq_tagged_files); \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: ctags-am
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	$(am__define_uniq_tagged_files); \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am
+
+cscopelist-am: $(am__tagged_files)
+	list='$(am__tagged_files)'; \
+	case "$(srcdir)" in \
+	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+	  *) sdir=$(subdir)/$(srcdir) ;; \
+	esac; \
+	for i in $$list; do \
+	  if test -f "$$i"; then \
+	    echo "$(subdir)/$$i"; \
+	  else \
+	    echo "$$sdir/$$i"; \
+	  fi; \
+	done >> $(top_builddir)/cscope.files
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile $(HEADERS)
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+	-test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+	-test -z "$(DISTCLEANFILES)" || rm -f $(DISTCLEANFILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
+	clean-libtool cscopelist-am ctags ctags-am distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags tags-am uninstall uninstall-am
+
+.PRECIOUS: Makefile
+
+
+$(top_builddir)/tests/libtests.la:
+	cd $(top_builddir)/tests; $(MAKE) $(AM_MAKEFLAGS) libtests.la
+
+tune:
+	$(MAKE) $(AM_MAKEFLAGS) tuneup$(EXEEXT)
+	./tuneup
+
+allprogs: $(EXTRA_PROGRAMS)
+
+$(TUNE_MPN_SRCS_BASIC):
+	for i in $(TUNE_MPN_SRCS_BASIC); do \
+	  echo "#define TUNE_PROGRAM_BUILD 1" >$$i; \
+	  echo "#include \"mpn/generic/$$i\"" >>$$i; \
+	done
+
+divrem_1.c:
+	echo "#define TUNE_PROGRAM_BUILD 1"                >divrem_1.c
+	echo "#define __gmpn_divrem_1  mpn_divrem_1_tune" >>divrem_1.c
+	echo "#include \"mpn/generic/divrem_1.c\""        >>divrem_1.c
+
+mod_1.c:
+	echo "#define TUNE_PROGRAM_BUILD 1"          >mod_1.c
+	echo "#define __gmpn_mod_1  mpn_mod_1_tune" >>mod_1.c
+	echo "#include \"mpn/generic/mod_1.c\""     >>mod_1.c
+
+sqr_asm.asm: $(top_builddir)/mpn/sqr_basecase.asm
+	echo 'define(SQR_TOOM2_THRESHOLD_OVERRIDE,SQR_TOOM2_THRESHOLD_MAX)' >sqr_asm.asm
+	echo 'include(../mpn/sqr_basecase.asm)' >>sqr_asm.asm
+
+# FIXME: Should it depend on $(top_builddir)/fac_ui.h too?
+fac_ui.c: $(top_builddir)/mpz/fac_ui.c
+	echo "#define TUNE_PROGRAM_BUILD 1"          >fac_ui.c
+	echo "#define __gmpz_fac_ui mpz_fac_ui_tune" >>fac_ui.c
+	echo "#define __gmpz_oddfac_1 mpz_oddfac_1_tune" >>fac_ui.c
+	echo "#include \"mpz/oddfac_1.c\""           >>fac_ui.c
+	echo "#include \"mpz/fac_ui.c\""             >>fac_ui.c
+
+# .s assembler, no preprocessing.
+#
+.s.o:
+	$(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
+.s.obj:
+	$(CCAS) $(COMPILE_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi`
+.s.lo:
+	$(LIBTOOL) --mode=compile --tag=CC $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
+
+# .S assembler, preprocessed with cpp.
+#
+# It's necessary to run $(CPP) separately, since it seems not all compilers
+# recognise .S files, in particular "cc" on HP-UX 10 and 11 doesn't (and
+# will silently do nothing if given a .S).
+#
+# For .lo we need a helper script, as described below for .asm.lo.
+#
+.S.o:
+	$(CPP) $(PREPROCESS_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< | grep -v '^#' >tmp-$*.s
+	$(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+	$(RM_TMP) tmp-$*.s
+.S.obj:
+	$(CPP) $(PREPROCESS_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` | grep -v '^#' >tmp-$*.s
+	$(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+	$(RM_TMP) tmp-$*.s
+.S.lo:
+	$(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/cpp-ccas --cpp="$(CPP) $(PREPROCESS_FLAGS)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
+
+# .asm assembler, preprocessed with m4.
+#
+# .o and .obj are non-PIC and just need m4 followed by a compile.
+#
+# .lo is a bit tricky.  Libtool (as of version 1.5) has foo.lo as a little
+# text file, and .libs/foo.o and foo.o as the PIC and non-PIC objects,
+# respectively.  It'd be asking for lots of trouble to try to create foo.lo
+# ourselves, so instead arrange to invoke libtool like a --mode=compile, but
+# with a special m4-ccas script which first m4 preprocesses, then compiles.
+# --tag=CC is necessary since foo.asm is otherwise unknown to libtool.
+#
+# Libtool adds -DPIC when building a shared object and the .asm files look
+# for that.  But it should be noted that the other PIC flags are on occasion
+# important too, in particular FreeBSD 2.2.8 gas 1.92.3 requires -k before
+# it accepts PIC constructs like @GOT, and gcc adds that flag only under
+# -fPIC.  (Later versions of gas are happy to accept PIC stuff any time.)
+#
+.asm.o:
+	$(M4) -DOPERATION_$* `test -f '$<' || echo '$(srcdir)/'`$< >tmp-$*.s
+	$(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+	$(RM_TMP) tmp-$*.s
+.asm.obj:
+	$(M4) -DOPERATION_$* `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` >tmp-$*.s
+	$(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+	$(RM_TMP) tmp-$*.s
+.asm.lo:
+	$(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/m4-ccas --m4="$(M4)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
+
+.NOTPARALLEL:
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:

diff --git a/third_party/gmp/tune/README b/third_party/gmp/tune/README
new file mode 100644
index 0000000..f76407f
--- /dev/null
+++ b/third_party/gmp/tune/README

@@ -0,0 +1,501 @@
+Copyright 2000-2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+               GMP SPEED MEASURING AND PARAMETER TUNING
+
+
+The programs in this directory are for knowledgeable users who want to
+measure GMP routines on their machine, and perhaps tweak some settings or
+identify things that can be improved.
+
+The programs here are tools, not ready to run solutions.  Nothing is built
+in a normal "make all", but various Makefile targets described below exist.
+
+Relatively few systems and CPUs have been tested, so be sure to verify that
+results are sensible before relying on them.
+
+
+
+
+MISCELLANEOUS NOTES
+
+--enable-assert
+
+    Don't configure with --enable-assert, since the extra code added by
+    assertion checking may influence measurements.
+
+Direct mapped caches
+
+    Some effort has been made to accommodate CPUs with direct mapped caches,
+    by putting data blocks more or less contiguously on the stack.  But this
+    will depend on TMP_ALLOC using alloca, and even then it may or may not
+    be enough.
+
+FreeBSD 4.2 i486 getrusage
+
+    This getrusage seems to be a bit doubtful, it looks like it's
+    microsecond accurate, but sometimes ru_utime remains unchanged after a
+    time of many microseconds has elapsed.  It'd be good to detect this in
+    the time.c initializations, but for now the suggestion is to pretend it
+    doesn't exist.
+
+        ./configure ac_cv_func_getrusage=no
+
+NetBSD 1.4.1 m68k macintosh time base
+
+    On this system it's been found getrusage often goes backwards, making it
+    unusable (time.c getrusage_backwards_p detects this).  gettimeofday
+    sometimes doesn't update atomically when it crosses a 1 second boundary.
+    Not sure what to do about this.  Expect possible intermittent failures.
+
+SCO OpenUNIX 8 /etc/hw
+
+    /etc/hw takes about a second to return the cpu frequency, which suggests
+    perhaps it's measuring each time it runs.  If this is annoying when
+    running the speed program repeatedly then set a GMP_CPU_FREQUENCY
+    environment variable (see TIME BASE section below).
+
+Timing on GNU/Linux
+
+    On Linux, timing currently uses the cycle counter. This is unreliable,
+    since the counter is not saved and restored at context switches (unlike
+    FreeBSD and Solaris where the cycle counter is "virtualized").
+
+    Using the clock_gettime method with CLOCK_PROCESS_CPUTIME_ID (posix) or
+    CLOCK_VIRTUAL (BSD) should be more reliable. To get clock_gettime
+    with glibc, one has to link with -lrt (which also drags in the pthreads
+    threading library). configure.in must be hacked to detect this and
+    arrange proper linking. Something like
+
+      old_LIBS="$LIBS"
+      AC_SEARCH_LIBS(clock_gettime, rt, [AC_DEFINE(HAVE_CLOCK_GETTIME)])
+      TUNE_LIBS="$LIBS"
+      LIBS="$old_LIBS"
+
+      AC_SUBST(TUNE_LIBS)
+
+    might work.
+
+Low resolution timebase
+
+    Parameter tuning can be very time consuming if the only timebase
+    available is a 10 millisecond clock tick, to the point of being
+    unusable.  This is currently the case on VAX and ARM systems.
+
+
+
+
+PARAMETER TUNING
+
+The "tuneup" program runs some tests designed to find the best settings for
+various thresholds, like MUL_TOOM22_THRESHOLD.  Its output can be put
+into gmp-mparam.h.  The program is built and run with
+
+        make tune
+
+If the thresholds indicated are grossly different from the values in the
+selected gmp-mparam.h then there may be a performance boost in applicable
+size ranges by changing gmp-mparam.h accordingly.
+
+Be sure to do a full reconfigure and rebuild to get any newly set thresholds
+to take effect.  A partial rebuild is enough sometimes, but a fresh
+configure and make is certain to be correct.
+
+If a CPU has specific tuned parameters coming from a gmp-mparam.h in one of
+the mpn subdirectories then the values from "make tune" should be similar.
+But check that the configured CPU is right and there are no machine specific
+effects causing a difference.
+
+It's hoped the compiler and options used won't have too much effect on
+thresholds, since for most CPUs they ultimately come down to comparisons
+between assembler subroutines.  Missing out on the longlong.h macros by not
+using gcc will probably have an effect.
+
+Some thresholds produced by the tune program are merely single values chosen
+from what's a range of sizes where two algorithms are pretty much the same
+speed.  When this happens the program is likely to give somewhat different
+values on successive runs.  This is noticeable on the toom3 thresholds for
+instance.
+
+
+
+
+SPEED PROGRAM
+
+The "speed" program can be used for measuring and comparing various
+routines, and producing tables of data or gnuplot graphs.  Compile it with
+
+	make speed
+
+(Or on DOS systems "make speed.exe".)
+
+Here are some examples of how to use it.  Check the code for all the
+options.
+
+Draw a graph of mpn_mul_n, stepping through sizes by 10 or a factor of 1.05
+(whichever is greater).
+
+        ./speed -s 10-5000 -t 10 -f 1.05 -P foo mpn_mul_n
+	gnuplot foo.gnuplot
+
+Compare mpn_add_n and an mpn_lshift by 1, showing times in cycles and
+showing under mpn_lshift the difference between it and mpn_add_n.
+
+	./speed -s 1-40 -c -d mpn_add_n mpn_lshift.1
+
+Using option -c for times in cycles is interesting but normally only
+necessary when looking carefully at assembler subroutines.  You might think
+it would always give an integer value, but this doesn't happen in practice,
+probably due to overheads in the time measurements.
+
+In the free-form output the "#" symbol against a measurement means the
+corresponding routine is fastest at that size.  This is a convenient visual
+cue when comparing different routines.  The graph data files <name>.data
+don't get this since it would upset gnuplot or other data viewers.
+
+
+
+
+TIME BASE
+
+The time measuring method is determined in time.c, based on what the
+configured host has available.  A cycle counter is preferred, possibly
+supplemented by another method if the counter has a limited range.  A
+microsecond accurate getrusage() or gettimeofday() will work quite well too.
+
+The cycle counters (except possibly on alpha) and gettimeofday() will depend
+on the machine being otherwise idle, or rather on other jobs not stealing
+CPU time from the measuring program.  Short routines (those that complete
+within a timeslice) should work even on a busy machine.
+
+Some trouble is taken by speed_measure() in common.c to avoid ill effects
+from sporadic interrupts, or other intermittent things (like cron waking up
+every minute).  But generally an idle machine will be necessary to be
+certain of consistent results.
+
+The CPU frequency is needed to convert between cycles and seconds, or for
+when a cycle counter is supplemented by getrusage() etc.  The speed program
+will convert as necessary according to the output format requested.  The
+tune program will work with either cycles or seconds.
+
+freq.c knows how to get the frequency on some systems, or can measure a
+cycle counter against gettimeofday() or getrusage(), but when that fails, or
+needs to be overridden, an environment variable GMP_CPU_FREQUENCY can be
+used (in Hertz).  For example in "bash" on a 650 MHz machine,
+
+	export GMP_CPU_FREQUENCY=650e6
+
+A high precision time base makes it possible to get accurate measurements in
+a shorter time.
+
+
+
+
+EXAMPLE COMPARISONS - VARIOUS
+
+Here are some ideas for things that can be done with the speed program.
+
+There's always going to be a certain amount of overhead in the time
+measurements, due to reading the time base, and in the loop that runs a
+routine enough times to get a reading of the desired precision.  Noop
+functions taking various arguments are available to measure this.  The
+"overhead" printed by the speed program each time in its intro is the "noop"
+routine, but note that this is just for information, it isn't deducted from
+the times printed or anything.
+
+	./speed -s 1 noop noop_wxs noop_wxys
+
+To see how many cycles per limb a routine is taking, look at the time
+increase when the size increments, using option -D.  This avoids fixed
+overheads in the measuring.  Also, remember many of the assembler routines
+have unrolled loops, so it might be necessary to compare times at, say, 16,
+32, 48, 64 etc to see what the unrolled part is taking, as opposed to any
+finishing off.
+
+        ./speed -s 16-64 -t 16 -C -D mpn_add_n
+
+The -C option on its own gives cycles per limb, but is really only useful at
+big sizes where fixed overheads are small compared to the code doing the
+real work.  Remember of course memory caching and/or page swapping will
+affect results at large sizes.
+
+        ./speed -s 500000 -C mpn_add_n
+
+Once a calculation stops fitting in the CPU data cache, it's going to start
+taking longer.  Exactly where this happens depends on the cache priming in
+the measuring routines, and on what sort of "least recently used" the
+hardware does.  Here's an example for a CPU with a 16kbyte L1 data cache and
+32-bit limb, showing a suddenly steeper curve for mpn_add_n at about 2000
+limbs.
+
+        ./speed -s 1-4000 -t 5 -f 1.02 -P foo mpn_add_n
+	gnuplot foo.gnuplot
+
+When a routine has an unrolled loop for, say, multiples of 8 limbs and then
+an ordinary loop for the remainder, it can happen that it's actually faster
+to do an operation on, say, 8 limbs than it is on 7 limbs.  The following
+draws a graph of mpn_sub_n, to see whether times smoothly increase with
+size.
+
+        ./speed -s 1-100 -c -P foo mpn_sub_n
+	gnuplot foo.gnuplot
+
+If mpn_lshift and mpn_rshift have special case code for shifts by 1, it
+ought to be faster (or at least not slower) than shifting by, say, 2 bits.
+
+        ./speed -s 1-200 -c mpn_rshift.1 mpn_rshift.2
+
+An mpn_lshift by 1 can be done by mpn_add_n adding a number to itself, and
+if the lshift isn't faster there's an obvious improvement that's possible.
+
+        ./speed -s 1-200 -c mpn_lshift.1 mpn_add_n_self
+
+On some CPUs (AMD K6 for example) an "in-place" mpn_add_n where the
+destination is one of the sources is faster than a separate destination.
+Here's an example to see this.  ".1" selects dst==src1 for mpn_add_n (and
+mpn_sub_n), for other values see speed.h SPEED_ROUTINE_MPN_BINARY_N_CALL.
+
+        ./speed -s 1-200 -c mpn_add_n mpn_add_n.1
+
+The gmp manual points out that divisions by powers of two should be done
+using a right shift because it'll be significantly faster than an actual
+division.  The following shows by what factor mpn_rshift is faster than
+mpn_divrem_1, using division by 32 as an example.
+
+        ./speed -s 10-20 -r mpn_rshift.5 mpn_divrem_1.32
+
+
+
+
+EXAMPLE COMPARISONS - MULTIPLICATION
+
+mul_basecase takes a ".<r>" parameter. If positive, it gives the second
+(smaller) operand size.  For example to show speeds for 3x3 up to 20x3 in
+cycles,
+
+        ./speed -s 3-20 -c mpn_mul_basecase.3
+
+A negative ".<-r>" parameter fixes the size of the product to the absolute
+value r.  For example to show speeds for 10x10 up to 19x1 in cycles,
+
+        ./speed -s 10-19 -c mpn_mul_basecase.-20
+
+mul_basecase with no parameter does an NxN multiply, so for example to show
+speeds in cycles for 1x1, 2x2, 3x3, etc, up to 20x20, in cycles,
+
+        ./speed -s 1-20 -c mpn_mul_basecase
+
+sqr_basecase is implemented by a "triangular" method on most CPUs, making it
+up to twice as fast as mul_basecase.  In practice loop overheads and the
+products on the diagonal mean it falls short of this.  Here's an example
+running the two and showing by what factor an NxN mul_basecase is slower
+than an NxN sqr_basecase.  (Some versions of sqr_basecase only allow sizes
+below SQR_TOOM2_THRESHOLD, so if it crashes at that point don't worry.)
+
+        ./speed -s 1-20 -r mpn_sqr_basecase mpn_mul_basecase
+
+The technique described above with -CD for showing the time difference in
+cycles per limb between two size operations can be done on an NxN
+mul_basecase using -E to change the basis for the size increment to N*N.
+For instance a 20x20 operation is taken to be doing 400 limbs, and a 16x16
+doing 256 limbs.  The following therefore shows the per crossproduct speed
+of mul_basecase and sqr_basecase at around 20x20 limbs.
+
+        ./speed -s 16-20 -t 4 -CDE mpn_mul_basecase mpn_sqr_basecase
+
+Of course sqr_basecase isn't really doing NxN crossproducts, but it can be
+interesting to compare it to mul_basecase as if it was.  For sqr_basecase
+the -F option can be used to base the deltas on N*(N+1)/2 operations, which
+is the triangular products sqr_basecase does.  For example,
+
+        ./speed -s 16-20 -t 4 -CDF mpn_sqr_basecase
+
+Both -E and -F are preliminary and might change.  A consistent approach to
+using them when claiming certain per crossproduct or per triangularproduct
+speeds hasn't really been established, but the increment between speeds in
+the range karatsuba will call seems sensible, that being k to k/2.  For
+instance, if the karatsuba threshold was 20 for the multiply and 30 for the
+square,
+
+        ./speed -s 10-20 -t 10 -CDE mpn_mul_basecase
+        ./speed -s 15-30 -t 15 -CDF mpn_sqr_basecase
+
+
+
+EXAMPLE COMPARISONS - MALLOC
+
+The gmp manual recommends application programs avoid excessive initializing
+and clearing of mpz_t variables (and mpq_t and mpf_t too).  Every new
+variable will at a minimum go through an init, a realloc for its first
+store, and finally a clear.  Quite how long that takes depends on the C
+library.  The following compares an mpz_init/realloc/clear to a 10 limb
+mpz_add.  Don't be surprised if the mallocing is quite slow.
+
+        ./speed -s 10 -c mpz_init_realloc_clear mpz_add
+
+On some systems malloc and free are much slower when dynamic linked.  The
+speed-dynamic program can be used to see this.  For example the following
+measures malloc/free, first static then dynamic.
+
+        ./speed -s 10 -c malloc_free
+        ./speed-dynamic -s 10 -c malloc_free
+
+Of course a real world program has big problems if it's doing so many
+mallocs and frees that it gets slowed down by a dynamic linked malloc.
+
+
+
+
+
+EXAMPLE COMPARISONS - STRING CONVERSIONS
+
+mpn_get_str does a binary to string conversion.  The base is specified with
+a ".<r>" parameter, or decimal by default.  Power of 2 bases are much faster
+than general bases.  The following compares decimal and hex for instance.
+
+        ./speed -s 1-20 -c mpn_get_str mpn_get_str.16
+
+Smaller bases need more divisions to split a given size number, and so are
+slower.  The following compares base 3 and base 9.  On small operands 9 will
+be nearly twice as fast, though at bigger sizes this reduces since in the
+current implementation both divide repeatedly by 3^20 (or 3^40 for 64 bit
+limbs) and those divisions come to dominate.
+
+        ./speed -s 1-20 -cr mpn_get_str.3 mpn_get_str.9
+
+mpn_set_str does a string to binary conversion.  The base is specified with
+a ".<r>" parameter, or decimal by default.  Power of 2 bases are faster than
+general bases on large conversions.
+
+	./speed -s 1-512 -f 2 -c mpn_set_str.8 mpn_set_str.10
+
+mpn_set_str also has some special case code for decimal which is a bit
+faster than the general case, basically by giving the compiler a chance to
+optimize some multiplications by 10.
+
+	./speed -s 20-40 -c mpn_set_str.9 mpn_set_str.10 mpn_set_str.11
+
+
+
+
+EXAMPLE COMPARISONS - GCDs
+
+mpn_gcd_1 has a threshold for when to reduce using an initial x%y when both
+x and y are single limbs.  This isn't tuned currently, but a value can be
+established by a measurement like
+
+	./speed -s 10-32 mpn_gcd_1.10
+
+This runs src[0] from 10 to 32 bits, and y fixed at 10 bits.  If the div
+threshold is high, say 31 so it's effectively disabled then a 32x10 bit gcd
+is done by nibbling away at the 32-bit operands bit-by-bit.  When the
+threshold is small, say 1 bit, then an initial x%y is done to reduce it to a
+10x10 bit operation.
+
+The threshold in mpn/generic/gcd_1.c or the various assembler
+implementations can be tweaked up or down until there's no more speedups on
+interesting combinations of sizes.  Note that this affects only a 1x1 limb
+operation and so isn't very important.  (An Nx1 limb operation always does
+an initial modular reduction, using mpn_mod_1 or mpn_modexact_1_odd.)
+
+
+
+
+SPEED PROGRAM EXTENSIONS
+
+Potentially lots of things could be made available in the program, but it's
+been left at only the things that have actually been wanted and are likely
+to be reasonably useful in the future.
+
+Extensions should be fairly easy to make though.  speed-ext.c is an example,
+in a style that should suit one-off tests, or new code fragments under
+development.
+
+many.pl is a script for generating a new speed program supplemented with
+alternate versions of the standard routines.  It can be used for measuring
+experimental code, or for comparing different implementations that exist
+within a CPU family.
+
+
+
+
+THRESHOLD EXAMINING
+
+The speed program can be used to examine the speeds of different algorithms
+to check the tune program has done the right thing.  For example to examine
+the karatsuba multiply threshold,
+
+	./speed -s 5-40 mpn_mul_basecase mpn_kara_mul_n
+
+When examining the toom3 threshold, remember it depends on the karatsuba
+threshold, so the right karatsuba threshold needs to be compiled into the
+library first.  The tune program uses specially recompiled versions of
+mpn/mul_n.c etc for this reason, but the speed program simply uses the
+normal libgmp.la.
+
+Note further that the various routines may recurse into themselves on sizes
+far enough above applicable thresholds.  For example, mpn_kara_mul_n will
+recurse into itself on sizes greater than twice the compiled-in
+MUL_TOOM22_THRESHOLD.
+
+When doing the above comparison between mul_basecase and kara_mul_n what's
+probably of interest is mul_basecase versus a kara_mul_n that does one level
+of Karatsuba then calls to mul_basecase, but this only happens on sizes less
+than twice the compiled MUL_TOOM22_THRESHOLD.  A larger value for that
+setting can be compiled-in to avoid the problem if necessary.  The same
+applies to toom3 and DC, though in a trickier fashion.
+
+There are some upper limits on some of the thresholds, arising from arrays
+dimensioned according to a threshold (mpn_mul_n), or asm code with certain
+sized displacements (some x86 versions of sqr_basecase).  So putting huge
+values for the thresholds, even just for testing, may fail.
+
+
+
+
+FUTURE
+
+Make a program to check the time base is working properly, for small and
+large measurements.  Make it able to test each available method, including
+perhaps the apparent resolution of each.
+
+Make a general mechanism for specifying operand overlap, and a syntax like
+maybe "mpn_add_n.dst=src2" to select it.  Some measuring routines do this
+sort of thing with the "r" parameter currently.
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:

diff --git a/third_party/gmp/tune/alpha.asm b/third_party/gmp/tune/alpha.asm
new file mode 100644
index 0000000..888c77f
--- /dev/null
+++ b/third_party/gmp/tune/alpha.asm

@@ -0,0 +1,59 @@
+dnl  Alpha time stamp counter access routine.
+
+dnl  Copyright 2000, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C void speed_cyclecounter (unsigned int p[2]);
+C
+
+C The rpcc instruction returns a 64-bit value split into two 32-bit fields.
+C The lower 32 bits are set by the hardware, and the upper 32 bits are set
+C by the operating system.  The real per-process cycle count is the sum of
+C these halves.
+
+C Unfortunately, some operating systems don't get this right.  NetBSD 1.3 is
+C known to sometimes put garbage in the upper half.  Whether newer NetBSD
+C versions get it right, is unknown to us.
+
+C rpcc measures cycles elapsed in the user program and hence should be very
+C accurate even on a busy system.  Losing cache contents due to task
+C switching may have an effect though.
+
+ASM_START()
+PROLOGUE(speed_cyclecounter)
+	rpcc	r0
+	srl	r0,32,r1
+	addq	r1,r0,r0
+	stl	r0,0(r16)
+	stl	r31,4(r16)		C zero upper return word
+	ret	r31,(r26),1
+EPILOGUE(speed_cyclecounter)
+ASM_END()

diff --git a/third_party/gmp/tune/common.c b/third_party/gmp/tune/common.c
new file mode 100644
index 0000000..c8b17fc
--- /dev/null
+++ b/third_party/gmp/tune/common.c

@@ -0,0 +1,2852 @@
+/* Shared speed subroutines.
+
+Copyright 1999-2006, 2008-2017, 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define __GMP_NO_ATTRIBUTE_CONST_PURE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h> /* for qsort */
+#include <string.h>
+#include <unistd.h>
+#if 0
+#include <sys/ioctl.h>
+#endif
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "tests.h"
+#include "speed.h"
+
+
+int   speed_option_addrs = 0;
+int   speed_option_verbose = 0;
+int   speed_option_cycles_broken = 0;
+
+
+/* Provide __clz_tab even if it's not required, for the benefit of new code
+   being tested with many.pl. */
+#ifndef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+#include "mp_clz_tab.c"
+#undef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+#endif
+
+
+void
+pentium_wbinvd(void)
+{
+#if 0
+  {
+    static int  fd = -2;
+
+    if (fd == -2)
+      {
+	fd = open ("/dev/wbinvd", O_RDWR);
+	if (fd == -1)
+	  perror ("open /dev/wbinvd");
+      }
+
+    if (fd != -1)
+      ioctl (fd, 0, 0);
+  }
+#endif
+
+#if 0
+#define WBINVDSIZE  1024*1024*2
+  {
+    static char  *p = NULL;
+    int   i, sum;
+
+    if (p == NULL)
+      p = malloc (WBINVDSIZE);
+
+#if 0
+    for (i = 0; i < WBINVDSIZE; i++)
+      p[i] = i & 0xFF;
+#endif
+
+    sum = 0;
+    for (i = 0; i < WBINVDSIZE; i++)
+      sum += p[i];
+
+    mpn_cache_fill_dummy (sum);
+  }
+#endif
+}
+
+
+int
+double_cmp_ptr (const double *p, const double *q)
+{
+  if (*p > *q)  return 1;
+  if (*p < *q)  return -1;
+  return 0;
+}
+
+
+/* Measure the speed of a given routine.
+
+   The routine is run with enough repetitions to make it take at least
+   speed_precision * speed_unittime.  This aims to minimize the effects of a
+   limited accuracy time base and the overhead of the measuring itself.
+
+   Measurements are made looking for 4 results within TOLERANCE of each
+   other (or 3 for routines taking longer than 2 seconds).  This aims to get
+   an accurate reading even if some runs are bloated by interrupts or task
+   switches or whatever.
+
+   The given (*fun)() is expected to run its function "s->reps" many times
+   and return the total elapsed time measured using speed_starttime() and
+   speed_endtime().  If the function doesn't support the given s->size or
+   s->r, -1.0 should be returned.  See the various base routines below.  */
+
+double
+speed_measure (double (*fun) (struct speed_params *s), struct speed_params *s)
+{
+#define TOLERANCE    1.01  /* 1% */
+  const int max_zeros = 10;
+
+  struct speed_params  s_dummy;
+  int     i, j, e;
+  double  t[30];
+  double  t_unsorted[30];
+  double  reps_d;
+  int     zeros = 0;
+
+  /* Use dummy parameters if caller doesn't provide any.  Only a few special
+     "fun"s will cope with this, speed_noop() is one.  */
+  if (s == NULL)
+    {
+      memset (&s_dummy, '\0', sizeof (s_dummy));
+      s = &s_dummy;
+    }
+
+  s->reps = 1;
+  s->time_divisor = 1.0;
+  for (i = 0; i < numberof (t); i++)
+    {
+      for (;;)
+	{
+	  s->src_num = 0;
+	  s->dst_num = 0;
+
+	  t[i] = (*fun) (s);
+
+	  if (speed_option_verbose >= 3)
+	    gmp_printf("size=%ld reps=%u r=%Md attempt=%d  %.9f\n",
+		       (long) s->size, s->reps, s->r, i, t[i]);
+
+	  if (t[i] == 0.0)
+	    {
+	      zeros++;
+	      if (zeros > max_zeros)
+		{
+		  fprintf (stderr, "Fatal error: too many (%d) failed measurements (0.0)\n", zeros);
+		  abort ();
+		}
+	     if (s->reps < 10000)
+	       s->reps *= 2;
+
+	      continue;
+	    }
+
+	  if (t[i] == -1.0)
+	    return -1.0;
+
+	  if (t[i] >= speed_unittime * speed_precision)
+	    break;
+
+	  /* go to a value of reps to make t[i] >= precision */
+	  reps_d = ceil (1.1 * s->reps
+			 * speed_unittime * speed_precision
+			 / MAX (t[i], speed_unittime));
+	  if (reps_d > 2e9 || reps_d < 1.0)
+	    {
+	      fprintf (stderr, "Fatal error: new reps bad: %.2f\n", reps_d);
+	      fprintf (stderr, "  (old reps %u, unittime %.4g, precision %d, t[i] %.4g)\n",
+		       s->reps, speed_unittime, speed_precision, t[i]);
+	      abort ();
+	    }
+	  s->reps = (unsigned) reps_d;
+	}
+      t[i] /= s->reps;
+      t_unsorted[i] = t[i];
+
+      if (speed_precision == 0)
+	return t[i];
+
+      /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */
+      if (t[0] >= 2.0)
+	e = 3;
+      else
+	e = 4;
+
+      /* Look for e many t[]'s within TOLERANCE of each other to consider a
+	 valid measurement.  Return smallest among them.  */
+      if (i >= e)
+	{
+	  qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
+	  for (j = e-1; j < i; j++)
+	    if (t[j] <= t[j-e+1] * TOLERANCE)
+	      return t[j-e+1] / s->time_divisor;
+	}
+    }
+
+  fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n",
+	   e, (TOLERANCE-1.0)*100.0);
+  fprintf (stderr, "    unsorted         sorted\n");
+  fprintf (stderr, "  %.12f    %.12f    is about %.1f%%\n",
+	   t_unsorted[0]*(TOLERANCE-1.0), t[0]*(TOLERANCE-1.0),
+	   100*(TOLERANCE-1.0));
+  for (i = 0; i < numberof (t); i++)
+    fprintf (stderr, "  %.09f       %.09f\n", t_unsorted[i], t[i]);
+
+  return -1.0;
+}
+
+
+/* Read all of ptr,size to get it into the CPU memory cache.
+
+   A call to mpn_cache_fill_dummy() is used to make sure the compiler
+   doesn't optimize away the whole loop.  Using "volatile mp_limb_t sum"
+   would work too, but the function call means we don't rely on every
+   compiler actually implementing volatile properly.
+
+   mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking
+   it can inline it.  */
+
+void
+mpn_cache_fill (mp_srcptr ptr, mp_size_t size)
+{
+  mp_limb_t  sum = 0;
+  mp_size_t  i;
+
+  for (i = 0; i < size; i++)
+    sum += ptr[i];
+
+  mpn_cache_fill_dummy(sum);
+}
+
+
+void
+mpn_cache_fill_write (mp_ptr ptr, mp_size_t size)
+{
+  mpn_cache_fill (ptr, size);
+
+#if 0
+  mpn_random (ptr, size);
+#endif
+
+#if 0
+  mp_size_t  i;
+
+  for (i = 0; i < size; i++)
+    ptr[i] = i;
+#endif
+}
+
+
+void
+speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size)
+{
+  if (s->src_num >= numberof (s->src))
+    {
+      fprintf (stderr, "speed_operand_src: no room left in s->src[]\n");
+      abort ();
+    }
+  s->src[s->src_num].ptr = ptr;
+  s->src[s->src_num].size = size;
+  s->src_num++;
+}
+
+
+void
+speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size)
+{
+  if (s->dst_num >= numberof (s->dst))
+    {
+      fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n");
+      abort ();
+    }
+  s->dst[s->dst_num].ptr = ptr;
+  s->dst[s->dst_num].size = size;
+  s->dst_num++;
+}
+
+
+void
+speed_cache_fill (struct speed_params *s)
+{
+  static struct speed_params  prev;
+  int  i;
+
+  /* FIXME: need a better way to get the format string for a pointer */
+
+  if (speed_option_addrs)
+    {
+      int  different;
+
+      different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num);
+      for (i = 0; i < s->dst_num; i++)
+	different |= (s->dst[i].ptr != prev.dst[i].ptr);
+      for (i = 0; i < s->src_num; i++)
+	different |= (s->src[i].ptr != prev.src[i].ptr);
+
+      if (different)
+	{
+	  if (s->dst_num != 0)
+	    {
+	      printf ("dst");
+	      for (i = 0; i < s->dst_num; i++)
+		printf (" %08lX", (unsigned long) s->dst[i].ptr);
+	      printf (" ");
+	    }
+
+	  if (s->src_num != 0)
+	    {
+	      printf ("src");
+	      for (i = 0; i < s->src_num; i++)
+		printf (" %08lX", (unsigned long) s->src[i].ptr);
+	      printf (" ");
+	    }
+	  printf ("  (cf sp approx %08lX)\n", (unsigned long) &different);
+
+	}
+
+      memcpy (&prev, s, sizeof(prev));
+    }
+
+  switch (s->cache) {
+  case 0:
+    for (i = 0; i < s->dst_num; i++)
+      mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size);
+    for (i = 0; i < s->src_num; i++)
+      mpn_cache_fill (s->src[i].ptr, s->src[i].size);
+    break;
+  case 1:
+    pentium_wbinvd();
+    break;
+  }
+}
+
+
+/* Miscellaneous options accepted by tune and speed programs under -o. */
+
+void
+speed_option_set (const char *s)
+{
+  int  n;
+
+  if (strcmp (s, "addrs") == 0)
+    {
+      speed_option_addrs = 1;
+    }
+  else if (strcmp (s, "verbose") == 0)
+    {
+      speed_option_verbose++;
+    }
+  else if (sscanf (s, "verbose=%d", &n) == 1)
+    {
+      speed_option_verbose = n;
+    }
+  else if (strcmp (s, "cycles-broken") == 0)
+    {
+      speed_option_cycles_broken = 1;
+    }
+  else
+    {
+      printf ("Unrecognised -o option: %s\n", s);
+      exit (1);
+    }
+}
+
+
+/* The following are basic speed running routines for various gmp functions.
+   Many are very similar and use speed.h macros.
+
+   Each routine allocates it's own destination space for the result of the
+   function, because only it can know what the function needs.
+
+   speed_starttime() and speed_endtime() are put tight around the code to be
+   measured.  Any setups are done outside the timed portion.
+
+   Each routine is responsible for its own cache priming.
+   speed_cache_fill() is a good way to do this, see examples in speed.h.
+   One cache priming possibility, for CPUs with write-allocate cache, and
+   functions that don't take too long, is to do one dummy call before timing
+   so as to cache everything that gets used.  But speed_measure() runs a
+   routine at least twice and will take the smaller time, so this might not
+   be necessary.
+
+   Data alignment will be important, for source, destination and temporary
+   workspace.  A routine can align its destination and workspace.  Programs
+   using the routines will ensure s->xp and s->yp are aligned.  Aligning
+   onto a CACHE_LINE_SIZE boundary is suggested.  s->align_wp and
+   s->align_wp2 should be respected where it makes sense to do so.
+   SPEED_TMP_ALLOC_LIMBS is a good way to do this.
+
+   A loop of the following form can be expected to turn into good assembler
+   code on most CPUs, thereby minimizing overhead in the measurement.  It
+   can always be assumed s->reps >= 1.
+
+	  i = s->reps
+	  do
+	    foo();
+	  while (--i != 0);
+
+   Additional parameters might be added to "struct speed_params" in the
+   future.  Routines should ignore anything they don't use.
+
+   s->size can be used creatively, and s->xp and s->yp can be ignored.  For
+   example, speed_mpz_fac_ui() uses s->size as n for the factorial.  s->r is
+   just a user-supplied parameter.  speed_mpn_lshift() uses it as a shift,
+   speed_mpn_mul_1() uses it as a multiplier.  */
+
+
+/* MPN_COPY etc can be macros, so the _CALL forms are necessary */
+double
+speed_MPN_COPY (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (MPN_COPY);
+}
+double
+speed_MPN_COPY_INCR (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (MPN_COPY_INCR);
+}
+double
+speed_MPN_COPY_DECR (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (MPN_COPY_DECR);
+}
+#if HAVE_NATIVE_mpn_copyi
+double
+speed_mpn_copyi (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (mpn_copyi);
+}
+#endif
+#if HAVE_NATIVE_mpn_copyd
+double
+speed_mpn_copyd (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (mpn_copyd);
+}
+#endif
+double
+speed_memcpy (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY_BYTES (memcpy);
+}
+double
+speed_mpn_com (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (mpn_com);
+}
+double
+speed_mpn_neg (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (mpn_neg);
+}
+double
+speed_mpn_sec_tabselect (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TABSELECT (mpn_sec_tabselect);
+}
+
+
+double
+speed_mpn_addmul_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1);
+}
+double
+speed_mpn_submul_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1);
+}
+
+#if HAVE_NATIVE_mpn_addmul_2
+double
+speed_mpn_addmul_2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_2 (mpn_addmul_2);
+}
+#endif
+#if HAVE_NATIVE_mpn_addmul_3
+double
+speed_mpn_addmul_3 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_3 (mpn_addmul_3);
+}
+#endif
+#if HAVE_NATIVE_mpn_addmul_4
+double
+speed_mpn_addmul_4 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_4 (mpn_addmul_4);
+}
+#endif
+#if HAVE_NATIVE_mpn_addmul_5
+double
+speed_mpn_addmul_5 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_5 (mpn_addmul_5);
+}
+#endif
+#if HAVE_NATIVE_mpn_addmul_6
+double
+speed_mpn_addmul_6 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_6 (mpn_addmul_6);
+}
+#endif
+#if HAVE_NATIVE_mpn_addmul_7
+double
+speed_mpn_addmul_7 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_7 (mpn_addmul_7);
+}
+#endif
+#if HAVE_NATIVE_mpn_addmul_8
+double
+speed_mpn_addmul_8 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_8 (mpn_addmul_8);
+}
+#endif
+
+double
+speed_mpn_mul_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1);
+}
+double
+speed_mpn_mul_1_inplace (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_mul_1);
+}
+
+#if HAVE_NATIVE_mpn_mul_2
+double
+speed_mpn_mul_2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_2 (mpn_mul_2);
+}
+#endif
+#if HAVE_NATIVE_mpn_mul_3
+double
+speed_mpn_mul_3 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_3 (mpn_mul_3);
+}
+#endif
+#if HAVE_NATIVE_mpn_mul_4
+double
+speed_mpn_mul_4 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_4 (mpn_mul_4);
+}
+#endif
+#if HAVE_NATIVE_mpn_mul_5
+double
+speed_mpn_mul_5 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_5 (mpn_mul_5);
+}
+#endif
+#if HAVE_NATIVE_mpn_mul_6
+double
+speed_mpn_mul_6 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_6 (mpn_mul_6);
+}
+#endif
+
+
+double
+speed_mpn_lshift (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift);
+}
+double
+speed_mpn_lshiftc (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshiftc);
+}
+double
+speed_mpn_rshift (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift);
+}
+
+
+/* The carry-in variants (if available) are good for measuring because they
+   won't skip a division if high<divisor.  Alternately, use -1 as a divisor
+   with the plain _1 forms. */
+double
+speed_mpn_divrem_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1);
+}
+double
+speed_mpn_divrem_1f (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1);
+}
+#if HAVE_NATIVE_mpn_divrem_1c
+double
+speed_mpn_divrem_1c (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c);
+}
+double
+speed_mpn_divrem_1cf (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_1CF (mpn_divrem_1c);
+}
+#endif
+
+double
+speed_mpn_divrem_1_div (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_div);
+}
+double
+speed_mpn_divrem_1f_div (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_div);
+}
+double
+speed_mpn_divrem_1_inv (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_inv);
+}
+double
+speed_mpn_divrem_1f_inv (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_inv);
+}
+double
+speed_mpn_mod_1_div (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_div);
+}
+double
+speed_mpn_mod_1_inv (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_inv);
+}
+
+double
+speed_mpn_preinv_divrem_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PREINV_DIVREM_1 (mpn_preinv_divrem_1);
+}
+double
+speed_mpn_preinv_divrem_1f (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PREINV_DIVREM_1F (mpn_preinv_divrem_1);
+}
+
+#if GMP_NUMB_BITS % 4 == 0
+double
+speed_mpn_mod_34lsub1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MOD_34LSUB1 (mpn_mod_34lsub1);
+}
+#endif
+
+double
+speed_mpn_divrem_2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2);
+}
+double
+speed_mpn_divrem_2_div (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_div);
+}
+double
+speed_mpn_divrem_2_inv (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_inv);
+}
+
+double
+speed_mpn_div_qr_1n_pi1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1);
+}
+double
+speed_mpn_div_qr_1n_pi1_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_1);
+}
+double
+speed_mpn_div_qr_1n_pi1_2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_2);
+}
+
+double
+speed_mpn_div_qr_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1);
+}
+
+double
+speed_mpn_div_qr_2n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_2 (mpn_div_qr_2, 1);
+}
+double
+speed_mpn_div_qr_2u (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_2 (mpn_div_qr_2, 0);
+}
+
+double
+speed_mpn_mod_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1);
+}
+#if HAVE_NATIVE_mpn_mod_1c
+double
+speed_mpn_mod_1c (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c);
+}
+#endif
+double
+speed_mpn_preinv_mod_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PREINV_MOD_1 (mpn_preinv_mod_1);
+}
+double
+speed_mpn_mod_1_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p,mpn_mod_1_1p_cps);
+}
+double
+speed_mpn_mod_1_1_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p_1,mpn_mod_1_1p_cps_1);
+}
+double
+speed_mpn_mod_1_1_2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p_2,mpn_mod_1_1p_cps_2);
+}
+double
+speed_mpn_mod_1_2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_2p,mpn_mod_1s_2p_cps,2);
+}
+double
+speed_mpn_mod_1_3 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_3p,mpn_mod_1s_3p_cps,3);
+}
+double
+speed_mpn_mod_1_4 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_4p,mpn_mod_1s_4p_cps,4);
+}
+
+double
+speed_mpn_divexact_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVEXACT_1 (mpn_divexact_1);
+}
+
+double
+speed_mpn_divexact_by3 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (mpn_divexact_by3);
+}
+
+double
+speed_mpn_bdiv_dbm1c (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BDIV_DBM1C (mpn_bdiv_dbm1c);
+}
+
+double
+speed_mpn_bdiv_q_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BDIV_Q_1 (mpn_bdiv_q_1);
+}
+
+double
+speed_mpn_pi1_bdiv_q_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1 (mpn_pi1_bdiv_q_1);
+}
+
+#if HAVE_NATIVE_mpn_modexact_1_odd
+double
+speed_mpn_modexact_1_odd (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MODEXACT_1_ODD (mpn_modexact_1_odd);
+}
+#endif
+
+double
+speed_mpn_modexact_1c_odd (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MODEXACT_1C_ODD (mpn_modexact_1c_odd);
+}
+
+double
+speed_mpz_mod (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_MOD (mpz_mod);
+}
+
+double
+speed_mpn_sbpi1_div_qr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_div_qr, inv.inv32, 2,0);
+}
+double
+speed_mpn_dcpi1_div_qr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_div_qr, &inv, 6,3);
+}
+double
+speed_mpn_sbpi1_divappr_q (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_divappr_q, inv.inv32, 2,0);
+}
+double
+speed_mpn_dcpi1_divappr_q (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_divappr_q, &inv, 6,3);
+}
+double
+speed_mpn_mu_div_qr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MU_DIV_QR (mpn_mu_div_qr, mpn_mu_div_qr_itch);
+}
+double
+speed_mpn_mu_divappr_q (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_divappr_q, mpn_mu_divappr_q_itch);
+}
+double
+speed_mpn_mu_div_q (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_div_q, mpn_mu_div_q_itch);
+}
+double
+speed_mpn_mupi_div_qr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MUPI_DIV_QR (mpn_preinv_mu_div_qr, mpn_preinv_mu_div_qr_itch);
+}
+
+double
+speed_mpn_sbpi1_bdiv_qr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_sbpi1_bdiv_qr);
+}
+double
+speed_mpn_dcpi1_bdiv_qr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_dcpi1_bdiv_qr);
+}
+double
+speed_mpn_sbpi1_bdiv_q (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_sbpi1_bdiv_q);
+}
+double
+speed_mpn_dcpi1_bdiv_q (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_dcpi1_bdiv_q);
+}
+double
+speed_mpn_sbpi1_bdiv_r (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PI1_BDIV_R (mpn_sbpi1_bdiv_r);
+}
+double
+speed_mpn_mu_bdiv_q (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MU_BDIV_Q (mpn_mu_bdiv_q, mpn_mu_bdiv_q_itch);
+}
+double
+speed_mpn_mu_bdiv_qr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MU_BDIV_QR (mpn_mu_bdiv_qr, mpn_mu_bdiv_qr_itch);
+}
+
+double
+speed_mpn_broot (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BROOT (mpn_broot);
+}
+double
+speed_mpn_broot_invm1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BROOT (mpn_broot_invm1);
+}
+double
+speed_mpn_brootinv (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BROOTINV (mpn_brootinv, 5*s->size);
+}
+
+double
+speed_mpn_binvert (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINVERT (mpn_binvert, mpn_binvert_itch);
+}
+
+double
+speed_mpn_invert (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_INVERT (mpn_invert, mpn_invert_itch);
+}
+
+double
+speed_mpn_invertappr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_INVERTAPPR (mpn_invertappr, mpn_invertappr_itch);
+}
+
+double
+speed_mpn_ni_invertappr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_INVERTAPPR (mpn_ni_invertappr, mpn_invertappr_itch);
+}
+
+double
+speed_mpn_sec_invert (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SEC_INVERT (mpn_sec_invert, mpn_sec_invert_itch);
+}
+
+double
+speed_mpn_redc_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_REDC_1 (mpn_redc_1);
+}
+double
+speed_mpn_redc_2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_REDC_2 (mpn_redc_2);
+}
+double
+speed_mpn_redc_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_REDC_N (mpn_redc_n);
+}
+
+
+double
+speed_mpn_popcount (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount);
+}
+double
+speed_mpn_hamdist (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist);
+}
+
+
+double
+speed_mpn_add_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n);
+}
+double
+speed_mpn_sub_n (struct speed_params *s)
+{
+SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n);
+}
+double
+speed_mpn_add_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1 (mpn_add_1);
+}
+double
+speed_mpn_add_1_inplace (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_add_1);
+}
+double
+speed_mpn_sub_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1 (mpn_sub_1);
+}
+double
+speed_mpn_sub_1_inplace (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_sub_1);
+}
+
+double
+speed_mpn_add_err1_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_ERR1_N (mpn_add_err1_n);
+}
+double
+speed_mpn_sub_err1_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_ERR1_N (mpn_sub_err1_n);
+}
+double
+speed_mpn_add_err2_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_ERR2_N (mpn_add_err2_n);
+}
+double
+speed_mpn_sub_err2_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_ERR2_N (mpn_sub_err2_n);
+}
+double
+speed_mpn_add_err3_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_ERR3_N (mpn_add_err3_n);
+}
+double
+speed_mpn_sub_err3_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_ERR3_N (mpn_sub_err3_n);
+}
+
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+double
+speed_mpn_add_n_sub_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_ADDSUB_N_CALL (mpn_add_n_sub_n (ap, sp, s->xp, s->yp, s->size));
+}
+#endif
+
+#if HAVE_NATIVE_mpn_addlsh1_n == 1
+double
+speed_mpn_addlsh1_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh1_n);
+}
+#endif
+#if HAVE_NATIVE_mpn_sublsh1_n == 1
+double
+speed_mpn_sublsh1_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh1_n);
+}
+#endif
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1
+double
+speed_mpn_addlsh1_n_ip1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (mpn_addlsh1_n_ip1);
+}
+#endif
+#if HAVE_NATIVE_mpn_addlsh1_n_ip2
+double
+speed_mpn_addlsh1_n_ip2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (mpn_addlsh1_n_ip2);
+}
+#endif
+#if HAVE_NATIVE_mpn_sublsh1_n_ip1
+double
+speed_mpn_sublsh1_n_ip1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (mpn_sublsh1_n_ip1);
+}
+#endif
+#if HAVE_NATIVE_mpn_rsblsh1_n == 1
+double
+speed_mpn_rsblsh1_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh1_n);
+}
+#endif
+#if HAVE_NATIVE_mpn_addlsh2_n == 1
+double
+speed_mpn_addlsh2_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh2_n);
+}
+#endif
+#if HAVE_NATIVE_mpn_sublsh2_n == 1
+double
+speed_mpn_sublsh2_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh2_n);
+}
+#endif
+#if HAVE_NATIVE_mpn_addlsh2_n_ip1
+double
+speed_mpn_addlsh2_n_ip1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (mpn_addlsh2_n_ip1);
+}
+#endif
+#if HAVE_NATIVE_mpn_addlsh2_n_ip2
+double
+speed_mpn_addlsh2_n_ip2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (mpn_addlsh2_n_ip2);
+}
+#endif
+#if HAVE_NATIVE_mpn_sublsh2_n_ip1
+double
+speed_mpn_sublsh2_n_ip1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_COPY (mpn_sublsh2_n_ip1);
+}
+#endif
+#if HAVE_NATIVE_mpn_rsblsh2_n == 1
+double
+speed_mpn_rsblsh2_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh2_n);
+}
+#endif
+#if HAVE_NATIVE_mpn_addlsh_n
+double
+speed_mpn_addlsh_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_addlsh_n (wp, xp, yp, s->size, 7));
+}
+#endif
+#if HAVE_NATIVE_mpn_sublsh_n
+double
+speed_mpn_sublsh_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_sublsh_n (wp, xp, yp, s->size, 7));
+}
+#endif
+#if HAVE_NATIVE_mpn_addlsh_n_ip1
+double
+speed_mpn_addlsh_n_ip1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_addlsh_n_ip1 (wp, s->xp, s->size, 7));
+}
+#endif
+#if HAVE_NATIVE_mpn_addlsh_n_ip2
+double
+speed_mpn_addlsh_n_ip2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_addlsh_n_ip2 (wp, s->xp, s->size, 7));
+}
+#endif
+#if HAVE_NATIVE_mpn_sublsh_n_ip1
+double
+speed_mpn_sublsh_n_ip1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_sublsh_n_ip1 (wp, s->xp, s->size, 7));
+}
+#endif
+#if HAVE_NATIVE_mpn_rsblsh_n
+double
+speed_mpn_rsblsh_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_rsblsh_n (wp, xp, yp, s->size, 7));
+}
+#endif
+#if HAVE_NATIVE_mpn_rsh1add_n
+double
+speed_mpn_rsh1add_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1add_n);
+}
+#endif
+#if HAVE_NATIVE_mpn_rsh1sub_n
+double
+speed_mpn_rsh1sub_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1sub_n);
+}
+#endif
+
+double
+speed_mpn_cnd_add_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_cnd_add_n (1, wp, xp, yp, s->size));
+}
+double
+speed_mpn_cnd_sub_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_cnd_sub_n (1, wp, xp, yp, s->size));
+}
+
+/* mpn_and_n etc can be macros and so have to be handled with
+   SPEED_ROUTINE_MPN_BINARY_N_CALL forms */
+double
+speed_mpn_and_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, xp, yp, s->size));
+}
+double
+speed_mpn_andn_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, xp, yp, s->size));
+}
+double
+speed_mpn_nand_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, xp, yp, s->size));
+}
+double
+speed_mpn_ior_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, xp, yp, s->size));
+}
+double
+speed_mpn_iorn_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, xp, yp, s->size));
+}
+double
+speed_mpn_nior_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, xp, yp, s->size));
+}
+double
+speed_mpn_xor_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, xp, yp, s->size));
+}
+double
+speed_mpn_xnor_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, xp, yp, s->size));
+}
+
+
+double
+speed_mpn_mul_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n);
+}
+double
+speed_mpn_sqr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SQR (mpn_sqr);
+}
+double
+speed_mpn_mul_n_sqr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size));
+}
+
+double
+speed_mpn_mul_basecase (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MUL(mpn_mul_basecase);
+}
+double
+speed_mpn_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MUL(mpn_mul);
+}
+double
+speed_mpn_sqr_basecase (struct speed_params *s)
+{
+  /* FIXME: size restrictions on some versions of sqr_basecase */
+  SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase);
+}
+
+#if HAVE_NATIVE_mpn_sqr_diagonal
+double
+speed_mpn_sqr_diagonal (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SQR (mpn_sqr_diagonal);
+}
+#endif
+
+#if HAVE_NATIVE_mpn_sqr_diag_addlsh1
+double
+speed_mpn_sqr_diag_addlsh1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL (mpn_sqr_diag_addlsh1 (wp, tp, s->xp, s->size));
+}
+#endif
+
+double
+speed_mpn_toom2_sqr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM2_SQR (mpn_toom2_sqr);
+}
+double
+speed_mpn_toom3_sqr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM3_SQR (mpn_toom3_sqr);
+}
+double
+speed_mpn_toom4_sqr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM4_SQR (mpn_toom4_sqr);
+}
+double
+speed_mpn_toom6_sqr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM6_SQR (mpn_toom6_sqr);
+}
+double
+speed_mpn_toom8_sqr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM8_SQR (mpn_toom8_sqr);
+}
+double
+speed_mpn_toom22_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM22_MUL_N (mpn_toom22_mul);
+}
+double
+speed_mpn_toom33_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM33_MUL_N (mpn_toom33_mul);
+}
+double
+speed_mpn_toom44_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM44_MUL_N (mpn_toom44_mul);
+}
+double
+speed_mpn_toom6h_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM6H_MUL_N (mpn_toom6h_mul);
+}
+double
+speed_mpn_toom8h_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM8H_MUL_N (mpn_toom8h_mul);
+}
+
+double
+speed_mpn_toom32_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM32_MUL (mpn_toom32_mul);
+}
+double
+speed_mpn_toom42_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM42_MUL (mpn_toom42_mul);
+}
+double
+speed_mpn_toom43_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM43_MUL (mpn_toom43_mul);
+}
+double
+speed_mpn_toom63_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM63_MUL (mpn_toom63_mul);
+}
+double
+speed_mpn_toom32_for_toom43_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL (mpn_toom32_mul);
+}
+double
+speed_mpn_toom43_for_toom32_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL (mpn_toom43_mul);
+}
+double
+speed_mpn_toom32_for_toom53_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL (mpn_toom32_mul);
+}
+double
+speed_mpn_toom53_for_toom32_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL (mpn_toom53_mul);
+}
+double
+speed_mpn_toom42_for_toom53_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL (mpn_toom42_mul);
+}
+double
+speed_mpn_toom53_for_toom42_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL (mpn_toom53_mul);
+}
+double
+speed_mpn_toom43_for_toom54_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL (mpn_toom43_mul);
+}
+double
+speed_mpn_toom54_for_toom43_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL (mpn_toom54_mul);
+}
+
+double
+speed_mpn_nussbaumer_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MUL_N_CALL
+    (mpn_nussbaumer_mul (wp, s->xp, s->size, s->yp, s->size));
+}
+double
+speed_mpn_nussbaumer_mul_sqr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SQR_CALL
+    (mpn_nussbaumer_mul (wp, s->xp, s->size, s->xp, s->size));
+}
+
+#if WANT_OLD_FFT_FULL
+double
+speed_mpn_mul_fft_full (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MUL_N_CALL
+    (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size));
+}
+double
+speed_mpn_mul_fft_full_sqr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SQR_CALL
+    (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size));
+}
+#endif
+
+/* These are mod 2^N+1 multiplies and squares.  If s->r is supplied it's
+   used as k, otherwise the best k for the size is used.  If s->size isn't a
+   multiple of 2^k it's rounded up to make the effective operation size.  */
+
+#define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr)       \
+  {                                                     \
+    mp_ptr     wp;                                      \
+    mp_size_t  pl;                                      \
+    int        k;                                       \
+    unsigned   i;                                       \
+    double     t;                                       \
+    TMP_DECL;                                           \
+							\
+    SPEED_RESTRICT_COND (s->size >= 1);                 \
+							\
+    if (s->r != 0)                                      \
+      k = s->r;                                         \
+    else                                                \
+      k = mpn_fft_best_k (s->size, sqr);                \
+							\
+    TMP_MARK;                                           \
+    pl = mpn_fft_next_size (s->size, k);                \
+    SPEED_TMP_ALLOC_LIMBS (wp, pl+1, s->align_wp);      \
+							\
+    speed_operand_src (s, s->xp, s->size);              \
+    if (!sqr)                                           \
+      speed_operand_src (s, s->yp, s->size);            \
+    speed_operand_dst (s, wp, pl+1);                    \
+    speed_cache_fill (s);                               \
+							\
+    speed_starttime ();                                 \
+    i = s->reps;                                        \
+    do                                                  \
+      call;                                             \
+    while (--i != 0);                                   \
+    t = speed_endtime ();                               \
+							\
+    TMP_FREE;                                           \
+    return t;                                           \
+  }
+
+double
+speed_mpn_mul_fft (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MUL_FFT_CALL
+    (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0);
+}
+
+double
+speed_mpn_mul_fft_sqr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MUL_FFT_CALL
+    (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1);
+}
+
+double
+speed_mpn_fft_mul (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MUL_N_CALL (mpn_fft_mul (wp, s->xp, s->size, s->yp, s->size));
+}
+
+double
+speed_mpn_fft_sqr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SQR_CALL (mpn_fft_mul (wp, s->xp, s->size, s->xp, s->size));
+}
+
+double
+speed_mpn_sqrlo (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SQRLO (mpn_sqrlo);
+}
+double
+speed_mpn_sqrlo_basecase (struct speed_params *s)
+{
+  SPEED_RESTRICT_COND (ABOVE_THRESHOLD (s->size, MIN (3, SQRLO_BASECASE_THRESHOLD))
+		       && BELOW_THRESHOLD (s->size, SQRLO_DC_THRESHOLD));
+  SPEED_ROUTINE_MPN_SQRLO (mpn_sqrlo_basecase);
+}
+double
+speed_mpn_mullo_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MULLO_N (mpn_mullo_n);
+}
+double
+speed_mpn_mullo_basecase (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MULLO_BASECASE (mpn_mullo_basecase);
+}
+
+double
+speed_mpn_mulmid_basecase (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MULMID (mpn_mulmid_basecase);
+}
+
+double
+speed_mpn_mulmid (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MULMID (mpn_mulmid);
+}
+
+double
+speed_mpn_mulmid_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MULMID_N (mpn_mulmid_n);
+}
+
+double
+speed_mpn_toom42_mulmid (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_TOOM42_MULMID (mpn_toom42_mulmid);
+}
+
+double
+speed_mpn_mulmod_bnm1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_mulmod_bnm1 (wp, s->size, s->xp, s->size, s->yp, s->size, tp));
+}
+
+double
+speed_mpn_bc_mulmod_bnm1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_bc_mulmod_bnm1 (wp, s->xp, s->yp, s->size, tp));
+}
+
+double
+speed_mpn_mulmod_bnm1_rounded (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED (mpn_mulmod_bnm1);
+}
+
+double
+speed_mpn_sqrmod_bnm1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_sqrmod_bnm1 (wp, s->size, s->xp, s->size, tp));
+}
+
+double
+speed_mpn_matrix22_mul (struct speed_params *s)
+{
+  /* Speed params only includes 2 inputs, so we have to invent the
+     other 6. */
+
+  mp_ptr a;
+  mp_ptr r;
+  mp_ptr b;
+  mp_ptr tp;
+  mp_size_t itch;
+  unsigned i;
+  double t;
+  TMP_DECL;
+
+  TMP_MARK;
+  SPEED_TMP_ALLOC_LIMBS (a, 4 * s->size, s->align_xp);
+  SPEED_TMP_ALLOC_LIMBS (b, 4 * s->size, s->align_yp);
+  SPEED_TMP_ALLOC_LIMBS (r, 8 * s->size + 4, s->align_wp);
+
+  MPN_COPY (a, s->xp, s->size);
+  mpn_random (a + s->size, 3 * s->size);
+  MPN_COPY (b, s->yp, s->size);
+  mpn_random (b + s->size, 3 * s->size);
+
+  itch = mpn_matrix22_mul_itch (s->size, s->size);
+  SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);
+
+  speed_operand_src (s, a, 4 * s->size);
+  speed_operand_src (s, b, 4 * s->size);
+  speed_operand_dst (s, r, 8 * s->size + 4);
+  speed_operand_dst (s, tp, itch);
+  speed_cache_fill (s);
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    {
+      mp_size_t sz = s->size;
+      MPN_COPY (r + 0 * sz + 0, a + 0 * sz, sz);
+      MPN_COPY (r + 2 * sz + 1, a + 1 * sz, sz);
+      MPN_COPY (r + 4 * sz + 2, a + 2 * sz, sz);
+      MPN_COPY (r + 6 * sz + 3, a + 3 * sz, sz);
+      mpn_matrix22_mul (r, r + 2 * sz + 1, r + 4 * sz + 2, r + 6 * sz + 3, sz,
+			b, b + 1 * sz,     b + 2 * sz,     b + 3 * sz,     sz,
+			tp);
+    }
+  while (--i != 0);
+  t = speed_endtime();
+  TMP_FREE;
+  return t;
+}
+
+double
+speed_mpn_hgcd2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2);
+}
+double
+speed_mpn_hgcd2_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_1);
+}
+double
+speed_mpn_hgcd2_2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_2);
+}
+double
+speed_mpn_hgcd2_3 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_3);
+}
+double
+speed_mpn_hgcd2_4 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_4);
+}
+double
+speed_mpn_hgcd2_5 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_5);
+}
+
+double
+speed_mpn_hgcd (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd, mpn_hgcd_itch);
+}
+
+double
+speed_mpn_hgcd_lehmer (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_lehmer, mpn_hgcd_lehmer_itch);
+}
+
+double
+speed_mpn_hgcd_appr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr, mpn_hgcd_appr_itch);
+}
+
+double
+speed_mpn_hgcd_appr_lehmer (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr_lehmer, mpn_hgcd_appr_lehmer_itch);
+}
+
+double
+speed_mpn_hgcd_reduce (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce, mpn_hgcd_reduce_itch);
+}
+double
+speed_mpn_hgcd_reduce_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_1, mpn_hgcd_reduce_1_itch);
+}
+double
+speed_mpn_hgcd_reduce_2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_2, mpn_hgcd_reduce_2_itch);
+}
+
+double
+speed_mpn_gcd (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_GCD (mpn_gcd);
+}
+
+double
+speed_mpn_gcdext (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext);
+}
+#if 0
+double
+speed_mpn_gcdext_lehmer (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_GCDEXT (__gmpn_gcdext_lehmer);
+}
+#endif
+double
+speed_mpn_gcdext_single (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_single);
+}
+double
+speed_mpn_gcdext_double (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_double);
+}
+double
+speed_mpn_gcdext_one_single (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_single);
+}
+double
+speed_mpn_gcdext_one_double (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_double);
+}
+double
+speed_mpn_gcd_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1);
+}
+double
+speed_mpn_gcd_11 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_GCD_11 (mpn_gcd_11);
+}
+double
+speed_mpn_gcd_1N (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_GCD_1N (mpn_gcd_1);
+}
+double
+speed_mpn_gcd_22 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_GCD_22 (mpn_gcd_22);
+}
+
+double
+speed_mpz_nextprime (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_NEXTPRIME (mpz_nextprime);
+}
+
+double
+speed_mpz_jacobi (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_JACOBI (mpz_jacobi);
+}
+double
+speed_mpn_jacobi_base (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base);
+}
+double
+speed_mpn_jacobi_base_1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_1);
+}
+double
+speed_mpn_jacobi_base_2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_2);
+}
+double
+speed_mpn_jacobi_base_3 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_3);
+}
+double
+speed_mpn_jacobi_base_4 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_4);
+}
+
+
+double
+speed_mpn_sqrtrem (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_sqrtrem (wp, wp2, s->xp, s->size));
+}
+
+double
+speed_mpn_sqrt (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_sqrtrem (wp, NULL, s->xp, s->size));
+}
+
+double
+speed_mpn_rootrem (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_rootrem (wp, wp2, s->xp, s->size, s->r));
+}
+
+double
+speed_mpn_root (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_rootrem (wp, NULL, s->xp, s->size, s->r));
+}
+
+
+double
+speed_mpn_perfect_power_p (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PERFECT_POWER (mpn_perfect_power_p);
+}
+
+double
+speed_mpn_perfect_square_p (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_PERFECT_SQUARE (mpn_perfect_square_p);
+}
+
+
+double
+speed_mpz_fac_ui (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui);
+}
+
+double
+speed_mpz_2fac_ui (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_UI (mpz_2fac_ui);
+}
+
+double
+speed_mpz_primorial_ui (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_UI (mpz_primorial_ui);
+}
+
+
+double
+speed_mpn_fib2_ui (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_FIB2_UI (mpn_fib2_ui);
+}
+double
+speed_mpz_fib_ui (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_FIB_UI (mpz_fib_ui);
+}
+double
+speed_mpz_fib2_ui (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_FIB2_UI (mpz_fib2_ui);
+}
+double
+speed_mpz_lucnum_ui (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_LUCNUM_UI (mpz_lucnum_ui);
+}
+double
+speed_mpz_lucnum2_ui (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_LUCNUM2_UI (mpz_lucnum2_ui);
+}
+
+
+double
+speed_mpz_powm (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_POWM (mpz_powm);
+}
+double
+speed_mpz_powm_mod (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_POWM (mpz_powm_mod);
+}
+double
+speed_mpz_powm_redc (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_POWM (mpz_powm_redc);
+}
+double
+speed_mpz_powm_sec (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_POWM (mpz_powm_sec);
+}
+double
+speed_mpz_powm_ui (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_POWM_UI (mpz_powm_ui);
+}
+
+
+double
+speed_binvert_limb (struct speed_params *s)
+{
+  SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb);
+}
+
+
+double
+speed_noop (struct speed_params *s)
+{
+  unsigned  i;
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    noop ();
+  while (--i != 0);
+  return speed_endtime ();
+}
+
+double
+speed_noop_wxs (struct speed_params *s)
+{
+  mp_ptr   wp;
+  unsigned i;
+  double   t;
+  TMP_DECL;
+
+  TMP_MARK;
+  wp = TMP_ALLOC_LIMBS (1);
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    noop_wxs (wp, s->xp, s->size);
+  while (--i != 0);
+  t = speed_endtime ();
+
+  TMP_FREE;
+  return t;
+}
+
+double
+speed_noop_wxys (struct speed_params *s)
+{
+  mp_ptr   wp;
+  unsigned i;
+  double   t;
+  TMP_DECL;
+
+  TMP_MARK;
+  wp = TMP_ALLOC_LIMBS (1);
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    noop_wxys (wp, s->xp, s->yp, s->size);
+  while (--i != 0);
+  t = speed_endtime ();
+
+  TMP_FREE;
+  return t;
+}
+
+
+#define SPEED_ROUTINE_ALLOC_FREE(variables, calls)      \
+  {                                                     \
+    unsigned  i;                                        \
+    variables;                                          \
+							\
+    speed_starttime ();                                 \
+    i = s->reps;                                        \
+    do                                                  \
+      {                                                 \
+	calls;                                          \
+      }                                                 \
+    while (--i != 0);                                   \
+    return speed_endtime ();                            \
+  }
+
+
+/* Compare these to see how much malloc/free costs and then how much
+   __gmp_default_allocate/free and mpz_init/clear add.  mpz_init/clear or
+   mpq_init/clear will be doing a 1 limb allocate, so use that as the size
+   when including them in comparisons.  */
+
+double
+speed_malloc_free (struct speed_params *s)
+{
+  size_t  bytes = s->size * GMP_LIMB_BYTES;
+  SPEED_ROUTINE_ALLOC_FREE (void *p,
+			    p = malloc (bytes);
+			    free (p));
+}
+
+double
+speed_malloc_realloc_free (struct speed_params *s)
+{
+  size_t  bytes = s->size * GMP_LIMB_BYTES;
+  SPEED_ROUTINE_ALLOC_FREE (void *p,
+			    p = malloc (GMP_LIMB_BYTES);
+			    p = realloc (p, bytes);
+			    free (p));
+}
+
+double
+speed_gmp_allocate_free (struct speed_params *s)
+{
+  size_t  bytes = s->size * GMP_LIMB_BYTES;
+  SPEED_ROUTINE_ALLOC_FREE (void *p,
+			    p = (*__gmp_allocate_func) (bytes);
+			    (*__gmp_free_func) (p, bytes));
+}
+
+double
+speed_gmp_allocate_reallocate_free (struct speed_params *s)
+{
+  size_t  bytes = s->size * GMP_LIMB_BYTES;
+  SPEED_ROUTINE_ALLOC_FREE
+    (void *p,
+     p = (*__gmp_allocate_func) (GMP_LIMB_BYTES);
+     p = (*__gmp_reallocate_func) (p, bytes, GMP_LIMB_BYTES);
+     (*__gmp_free_func) (p, bytes));
+}
+
+double
+speed_mpz_init_clear (struct speed_params *s)
+{
+  SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
+			    mpz_init (z);
+			    mpz_clear (z));
+}
+
+double
+speed_mpz_init_realloc_clear (struct speed_params *s)
+{
+  SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
+			    mpz_init (z);
+			    _mpz_realloc (z, s->size);
+			    mpz_clear (z));
+}
+
+double
+speed_mpq_init_clear (struct speed_params *s)
+{
+  SPEED_ROUTINE_ALLOC_FREE (mpq_t q,
+			    mpq_init (q);
+			    mpq_clear (q));
+}
+
+double
+speed_mpf_init_clear (struct speed_params *s)
+{
+  SPEED_ROUTINE_ALLOC_FREE (mpf_t f,
+			    mpf_init (f);
+			    mpf_clear (f));
+}
+
+
+/* Compare this to mpn_add_n to see how much overhead mpz_add adds.  Note
+   that repeatedly calling mpz_add with the same data gives branch prediction
+   in it an advantage.  */
+
+double
+speed_mpz_add (struct speed_params *s)
+{
+  mpz_t     w, x, y;
+  unsigned  i;
+  double    t;
+
+  mpz_init (w);
+  mpz_init (x);
+  mpz_init (y);
+
+  mpz_set_n (x, s->xp, s->size);
+  mpz_set_n (y, s->yp, s->size);
+  mpz_add (w, x, y);
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    {
+      mpz_add (w, x, y);
+    }
+  while (--i != 0);
+  t = speed_endtime ();
+
+  mpz_clear (w);
+  mpz_clear (x);
+  mpz_clear (y);
+  return t;
+}
+
+
+/* An inverse (s->r) or (s->size)/2 modulo s->size limbs */
+
+double
+speed_mpz_invert (struct speed_params *s)
+{
+  mpz_t     a, m, r;
+  mp_size_t k;
+  unsigned  i;
+  double    t;
+
+  if (s->r == 0)
+    k = s->size/2;
+  else if (s->r < GMP_LIMB_HIGHBIT)
+    k = s->r;
+  else /* s->r < 0 */
+    k = s->size - (-s->r);
+
+  SPEED_RESTRICT_COND (k > 0 && k <= s->size);
+
+  mpz_init_set_n (m, s->yp, s->size);
+  mpz_setbit (m, 0);	/* force m to odd */
+
+  mpz_init_set_n (a, s->xp, k);
+
+  mpz_init (r);
+  while (mpz_invert (r, a, m) == 0)
+    mpz_add_ui (a, a, 1);
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    mpz_invert (r, a, m);
+  while (--i != 0);
+  t = speed_endtime ();
+
+  mpz_clear (r);
+  mpz_clear (a);
+  mpz_clear (m);
+  return t;
+  }
+
+/* If r==0, calculate binomial(size,size/2),
+   otherwise calculate binomial(size,r). */
+
+double
+speed_mpz_bin_uiui (struct speed_params *s)
+{
+  mpz_t          w;
+  unsigned long  k;
+  unsigned  i;
+  double    t;
+
+  mpz_init (w);
+  if (s->r != 0)
+    k = s->r;
+  else
+    k = s->size/2;
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    {
+      mpz_bin_uiui (w, s->size, k);
+    }
+  while (--i != 0);
+  t = speed_endtime ();
+
+  mpz_clear (w);
+  return t;
+}
+
+/* If r==0, calculate binomial(2^size,size),
+   otherwise calculate binomial(2^size,r). */
+
+double
+speed_mpz_bin_ui (struct speed_params *s)
+{
+  mpz_t          w, x;
+  unsigned long  k;
+  unsigned  i;
+  double    t;
+
+  mpz_init (w);
+  mpz_init_set_ui (x, 0);
+
+  mpz_setbit (x, s->size);
+
+  if (s->r != 0)
+    k = s->r;
+  else
+    k = s->size;
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    {
+      mpz_bin_ui (w, x, k);
+    }
+  while (--i != 0);
+  t = speed_endtime ();
+
+  mpz_clear (w);
+  mpz_clear (x);
+  return t;
+}
+
+/* If r==0, calculate mfac(size,log(size)),
+   otherwise calculate mfac(size,r). */
+
+double
+speed_mpz_mfac_uiui (struct speed_params *s)
+{
+  mpz_t          w;
+  unsigned long  k;
+  unsigned  i;
+  double    t;
+
+  mpz_init (w);
+  if (s->r != 0)
+    k = s->r;
+  else
+    for (k = 1; s->size >> k; ++k);
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    {
+      mpz_mfac_uiui (w, s->size, k);
+    }
+  while (--i != 0);
+  t = speed_endtime ();
+
+  mpz_clear (w);
+  return t;
+}
+
+/* The multiplies are successively dependent so the latency is measured, not
+   the issue rate.  There's only 10 per loop so the code doesn't get too big
+   since umul_ppmm is several instructions on some cpus.
+
+   Putting the arguments as "h,l,l,h" gets slightly better code from gcc
+   2.95.2 on x86, it puts only one mov between each mul, not two.  That mov
+   though will probably show up as a bogus extra cycle though.
+
+   The measuring function macros are into three parts to avoid overflowing
+   preprocessor expansion space if umul_ppmm is big.
+
+   Limitations:
+
+   The default umul_ppmm doing h*l will be getting increasing numbers of
+   high zero bits in the calculation.  CPUs with data-dependent multipliers
+   will want to use umul_ppmm.1 to get some randomization into the
+   calculation.  The extra xors and fetches will be a slowdown of course.  */
+
+#define SPEED_MACRO_UMUL_PPMM_A \
+  {                             \
+    mp_limb_t  h, l;            \
+    unsigned   i;               \
+    double     t;               \
+				\
+    s->time_divisor = 10;       \
+				\
+    h = s->xp[0];               \
+    l = s->yp[0];               \
+				\
+    if (s->r == 1)              \
+      {                         \
+	speed_starttime ();     \
+	i = s->reps;            \
+	do                      \
+	  {
+
+#define SPEED_MACRO_UMUL_PPMM_B \
+	  }                     \
+	while (--i != 0);       \
+	t = speed_endtime ();   \
+      }                         \
+    else                        \
+      {                         \
+	speed_starttime ();     \
+	i = s->reps;            \
+	do                      \
+	  {
+
+#define SPEED_MACRO_UMUL_PPMM_C                                         \
+	  }                                                             \
+	while (--i != 0);                                               \
+	t = speed_endtime ();                                           \
+      }                                                                 \
+									\
+    /* stop the compiler optimizing away the whole calculation! */      \
+    noop_1 (h);                                                         \
+    noop_1 (l);                                                         \
+									\
+    return t;                                                           \
+  }
+
+
+double
+speed_umul_ppmm (struct speed_params *s)
+{
+  SPEED_MACRO_UMUL_PPMM_A;
+  {
+    umul_ppmm (h, l, l, h);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
+     umul_ppmm (h, l, l, h); h ^= s->xp_block[1]; l ^= s->yp_block[1];
+     umul_ppmm (h, l, l, h); h ^= s->xp_block[2]; l ^= s->yp_block[2];
+    umul_ppmm (h, l, l, h);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
+     umul_ppmm (h, l, l, h); h ^= s->xp_block[4]; l ^= s->yp_block[4];
+     umul_ppmm (h, l, l, h); h ^= s->xp_block[5]; l ^= s->yp_block[5];
+    umul_ppmm (h, l, l, h);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
+     umul_ppmm (h, l, l, h); h ^= s->xp_block[7]; l ^= s->yp_block[7];
+     umul_ppmm (h, l, l, h); h ^= s->xp_block[8]; l ^= s->yp_block[8];
+    umul_ppmm (h, l, l, h);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
+  }
+  SPEED_MACRO_UMUL_PPMM_B;
+  {
+    umul_ppmm (h, l, l, h);
+     umul_ppmm (h, l, l, h);
+     umul_ppmm (h, l, l, h);
+    umul_ppmm (h, l, l, h);
+     umul_ppmm (h, l, l, h);
+     umul_ppmm (h, l, l, h);
+    umul_ppmm (h, l, l, h);
+     umul_ppmm (h, l, l, h);
+     umul_ppmm (h, l, l, h);
+    umul_ppmm (h, l, l, h);
+  }
+  SPEED_MACRO_UMUL_PPMM_C;
+}
+
+
+#if HAVE_NATIVE_mpn_umul_ppmm
+double
+speed_mpn_umul_ppmm (struct speed_params *s)
+{
+  SPEED_MACRO_UMUL_PPMM_A;
+  {
+    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
+     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[1]; l ^= s->yp_block[1];
+     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[2]; l ^= s->yp_block[2];
+    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
+     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[4]; l ^= s->yp_block[4];
+     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[5]; l ^= s->yp_block[5];
+    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
+     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[7]; l ^= s->yp_block[7];
+     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[8]; l ^= s->yp_block[8];
+    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
+  }
+  SPEED_MACRO_UMUL_PPMM_B;
+  {
+    h = mpn_umul_ppmm (&l, h, l);
+     h = mpn_umul_ppmm (&l, h, l);
+     h = mpn_umul_ppmm (&l, h, l);
+    h = mpn_umul_ppmm (&l, h, l);
+     h = mpn_umul_ppmm (&l, h, l);
+     h = mpn_umul_ppmm (&l, h, l);
+    h = mpn_umul_ppmm (&l, h, l);
+     h = mpn_umul_ppmm (&l, h, l);
+     h = mpn_umul_ppmm (&l, h, l);
+    h = mpn_umul_ppmm (&l, h, l);
+  }
+  SPEED_MACRO_UMUL_PPMM_C;
+}
+#endif
+
+#if HAVE_NATIVE_mpn_umul_ppmm_r
+double
+speed_mpn_umul_ppmm_r (struct speed_params *s)
+{
+  SPEED_MACRO_UMUL_PPMM_A;
+  {
+    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
+     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[1]; l ^= s->yp_block[1];
+     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[2]; l ^= s->yp_block[2];
+    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
+     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[4]; l ^= s->yp_block[4];
+     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[5]; l ^= s->yp_block[5];
+    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
+     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[7]; l ^= s->yp_block[7];
+     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[8]; l ^= s->yp_block[8];
+    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
+  }
+  SPEED_MACRO_UMUL_PPMM_B;
+  {
+    h = mpn_umul_ppmm_r (h, l, &l);
+     h = mpn_umul_ppmm_r (h, l, &l);
+     h = mpn_umul_ppmm_r (h, l, &l);
+    h = mpn_umul_ppmm_r (h, l, &l);
+     h = mpn_umul_ppmm_r (h, l, &l);
+     h = mpn_umul_ppmm_r (h, l, &l);
+    h = mpn_umul_ppmm_r (h, l, &l);
+     h = mpn_umul_ppmm_r (h, l, &l);
+     h = mpn_umul_ppmm_r (h, l, &l);
+    h = mpn_umul_ppmm_r (h, l, &l);
+  }
+  SPEED_MACRO_UMUL_PPMM_C;
+}
+#endif
+
+
+/* The divisions are successively dependent so latency is measured, not
+   issue rate.  There's only 10 per loop so the code doesn't get too big,
+   especially for udiv_qrnnd_preinv and preinv2norm, which are several
+   instructions each.
+
+   Note that it's only the division which is measured here, there's no data
+   fetching and no shifting if the divisor gets normalized.
+
+   In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d"
+   generate x86 div instructions with nothing in between.
+
+   The measuring function macros are in two parts to avoid overflowing
+   preprocessor expansion space if udiv_qrnnd etc are big.
+
+   Limitations:
+
+   Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code
+   generated first.
+
+   CPUs with data-dependent divisions may want more attention paid to the
+   randomness of the data used.  Probably the measurement wanted is over
+   uniformly distributed numbers, but what's here might not be giving that.  */
+
+#define SPEED_ROUTINE_UDIV_QRNND_A(normalize)           \
+  {                                                     \
+    double     t;                                       \
+    unsigned   i;                                       \
+    mp_limb_t  q, r, d;                                 \
+    mp_limb_t  dinv;                                    \
+							\
+    s->time_divisor = 10;                               \
+							\
+    /* divisor from "r" parameter, or a default */      \
+    d = s->r;                                           \
+    if (d == 0)                                         \
+      d = mp_bases[10].big_base;                        \
+							\
+    if (normalize)                                      \
+      {                                                 \
+	unsigned  norm;                                 \
+	count_leading_zeros (norm, d);                  \
+	d <<= norm;                                     \
+	invert_limb (dinv, d);                          \
+      }                                                 \
+							\
+    q = s->xp[0];                                       \
+    r = s->yp[0] % d;                                   \
+							\
+    speed_starttime ();                                 \
+    i = s->reps;                                        \
+    do                                                  \
+      {
+
+#define SPEED_ROUTINE_UDIV_QRNND_B                                      \
+      }                                                                 \
+    while (--i != 0);                                                   \
+    t = speed_endtime ();                                               \
+									\
+    /* stop the compiler optimizing away the whole calculation! */      \
+    noop_1 (q);                                                         \
+    noop_1 (r);                                                         \
+									\
+    return t;                                                           \
+  }
+
+double
+speed_udiv_qrnnd (struct speed_params *s)
+{
+  SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION);
+  {
+    udiv_qrnnd (q, r, r, q, d);
+     udiv_qrnnd (q, r, r, q, d);
+     udiv_qrnnd (q, r, r, q, d);
+    udiv_qrnnd (q, r, r, q, d);
+     udiv_qrnnd (q, r, r, q, d);
+     udiv_qrnnd (q, r, r, q, d);
+    udiv_qrnnd (q, r, r, q, d);
+     udiv_qrnnd (q, r, r, q, d);
+     udiv_qrnnd (q, r, r, q, d);
+    udiv_qrnnd (q, r, r, q, d);
+  }
+  SPEED_ROUTINE_UDIV_QRNND_B;
+}
+
+double
+speed_udiv_qrnnd_c (struct speed_params *s)
+{
+  SPEED_ROUTINE_UDIV_QRNND_A (1);
+  {
+    __udiv_qrnnd_c (q, r, r, q, d);
+     __udiv_qrnnd_c (q, r, r, q, d);
+     __udiv_qrnnd_c (q, r, r, q, d);
+    __udiv_qrnnd_c (q, r, r, q, d);
+     __udiv_qrnnd_c (q, r, r, q, d);
+     __udiv_qrnnd_c (q, r, r, q, d);
+    __udiv_qrnnd_c (q, r, r, q, d);
+     __udiv_qrnnd_c (q, r, r, q, d);
+     __udiv_qrnnd_c (q, r, r, q, d);
+    __udiv_qrnnd_c (q, r, r, q, d);
+  }
+  SPEED_ROUTINE_UDIV_QRNND_B;
+}
+
+#if HAVE_NATIVE_mpn_udiv_qrnnd
+double
+speed_mpn_udiv_qrnnd (struct speed_params *s)
+{
+  SPEED_ROUTINE_UDIV_QRNND_A (1);
+  {
+    q = mpn_udiv_qrnnd (&r, r, q, d);
+     q = mpn_udiv_qrnnd (&r, r, q, d);
+     q = mpn_udiv_qrnnd (&r, r, q, d);
+    q = mpn_udiv_qrnnd (&r, r, q, d);
+     q = mpn_udiv_qrnnd (&r, r, q, d);
+     q = mpn_udiv_qrnnd (&r, r, q, d);
+    q = mpn_udiv_qrnnd (&r, r, q, d);
+     q = mpn_udiv_qrnnd (&r, r, q, d);
+     q = mpn_udiv_qrnnd (&r, r, q, d);
+    q = mpn_udiv_qrnnd (&r, r, q, d);
+  }
+  SPEED_ROUTINE_UDIV_QRNND_B;
+}
+#endif
+
+#if HAVE_NATIVE_mpn_udiv_qrnnd_r
+double
+speed_mpn_udiv_qrnnd_r (struct speed_params *s)
+{
+  SPEED_ROUTINE_UDIV_QRNND_A (1);
+  {
+    q = mpn_udiv_qrnnd_r (r, q, d, &r);
+     q = mpn_udiv_qrnnd_r (r, q, d, &r);
+     q = mpn_udiv_qrnnd_r (r, q, d, &r);
+    q = mpn_udiv_qrnnd_r (r, q, d, &r);
+     q = mpn_udiv_qrnnd_r (r, q, d, &r);
+     q = mpn_udiv_qrnnd_r (r, q, d, &r);
+    q = mpn_udiv_qrnnd_r (r, q, d, &r);
+     q = mpn_udiv_qrnnd_r (r, q, d, &r);
+     q = mpn_udiv_qrnnd_r (r, q, d, &r);
+    q = mpn_udiv_qrnnd_r (r, q, d, &r);
+  }
+  SPEED_ROUTINE_UDIV_QRNND_B;
+}
+#endif
+
+
+double
+speed_invert_limb (struct speed_params *s)
+{
+  SPEED_ROUTINE_INVERT_LIMB_CALL (invert_limb (dinv, d));
+}
+
+
+/* xp[0] might not be particularly random, but should give an indication how
+   "/" runs.  Same for speed_operator_mod below.  */
+double
+speed_operator_div (struct speed_params *s)
+{
+  double     t;
+  unsigned   i;
+  mp_limb_t  x, q, d;
+
+  s->time_divisor = 10;
+
+  /* divisor from "r" parameter, or a default */
+  d = s->r;
+  if (d == 0)
+    d = mp_bases[10].big_base;
+
+  x = s->xp[0];
+  q = 0;
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    {
+      q ^= x; q /= d;
+       q ^= x; q /= d;
+       q ^= x; q /= d;
+      q ^= x; q /= d;
+       q ^= x; q /= d;
+       q ^= x; q /= d;
+      q ^= x; q /= d;
+       q ^= x; q /= d;
+       q ^= x; q /= d;
+      q ^= x; q /= d;
+    }
+  while (--i != 0);
+  t = speed_endtime ();
+
+  /* stop the compiler optimizing away the whole calculation! */
+  noop_1 (q);
+
+  return t;
+}
+
+double
+speed_operator_mod (struct speed_params *s)
+{
+  double     t;
+  unsigned   i;
+  mp_limb_t  x, r, d;
+
+  s->time_divisor = 10;
+
+  /* divisor from "r" parameter, or a default */
+  d = s->r;
+  if (d == 0)
+    d = mp_bases[10].big_base;
+
+  x = s->xp[0];
+  r = 0;
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    {
+      r ^= x; r %= d;
+       r ^= x; r %= d;
+       r ^= x; r %= d;
+      r ^= x; r %= d;
+       r ^= x; r %= d;
+       r ^= x; r %= d;
+      r ^= x; r %= d;
+       r ^= x; r %= d;
+       r ^= x; r %= d;
+      r ^= x; r %= d;
+    }
+  while (--i != 0);
+  t = speed_endtime ();
+
+  /* stop the compiler optimizing away the whole calculation! */
+  noop_1 (r);
+
+  return t;
+}
+
+
+/* r==0 measures on data with the values uniformly distributed.  This will
+   be typical for count_trailing_zeros in a GCD etc.
+
+   r==1 measures on data with the resultant count uniformly distributed
+   between 0 and GMP_LIMB_BITS-1.  This is probably sensible for
+   count_leading_zeros on the high limbs of divisors.  */
+
+int
+speed_routine_count_zeros_setup (struct speed_params *s,
+				 mp_ptr xp, int leading, int zero)
+{
+  int        i, c;
+  mp_limb_t  n;
+
+  if (s->r == 0)
+    {
+      /* Make uniformly distributed data.  If zero isn't allowed then change
+	 it to 1 for leading, or 0x800..00 for trailing.  */
+      MPN_COPY (xp, s->xp_block, SPEED_BLOCK_SIZE);
+      if (! zero)
+	for (i = 0; i < SPEED_BLOCK_SIZE; i++)
+	  if (xp[i] == 0)
+	    xp[i] = leading ? 1 : GMP_LIMB_HIGHBIT;
+    }
+  else if (s->r == 1)
+    {
+      /* Make counts uniformly distributed.  A randomly chosen bit is set, and
+	 for leading the rest above it are cleared, or for trailing then the
+	 rest below.  */
+      for (i = 0; i < SPEED_BLOCK_SIZE; i++)
+	{
+	  mp_limb_t  set = CNST_LIMB(1) << (s->yp_block[i] % GMP_LIMB_BITS);
+	  mp_limb_t  keep_below = set-1;
+	  mp_limb_t  keep_above = MP_LIMB_T_MAX ^ keep_below;
+	  mp_limb_t  keep = (leading ? keep_below : keep_above);
+	  xp[i] = (s->xp_block[i] & keep) | set;
+	}
+    }
+  else
+    {
+      return 0;
+    }
+
+  /* Account for the effect of n^=c. */
+  c = 0;
+  for (i = 0; i < SPEED_BLOCK_SIZE; i++)
+    {
+      n = xp[i];
+      xp[i] ^= c;
+
+      if (leading)
+	count_leading_zeros (c, n);
+      else
+	count_trailing_zeros (c, n);
+    }
+
+  return 1;
+}
+
+double
+speed_count_leading_zeros (struct speed_params *s)
+{
+#ifdef COUNT_LEADING_ZEROS_0
+#define COUNT_LEADING_ZEROS_0_ALLOWED   1
+#else
+#define COUNT_LEADING_ZEROS_0_ALLOWED   0
+#endif
+
+  SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED);
+  count_leading_zeros (c, n);
+  SPEED_ROUTINE_COUNT_ZEROS_B ();
+}
+double
+speed_count_trailing_zeros (struct speed_params *s)
+{
+  SPEED_ROUTINE_COUNT_ZEROS_A (0, 0);
+  count_trailing_zeros (c, n);
+  SPEED_ROUTINE_COUNT_ZEROS_B ();
+}
+
+
+double
+speed_mpn_get_str (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_GET_STR (mpn_get_str);
+}
+
+double
+speed_mpn_set_str (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_set_str (wp, xp, s->size, base));
+}
+double
+speed_mpn_bc_set_str (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_bc_set_str (wp, xp, s->size, base));
+}
+
+double
+speed_MPN_ZERO (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_ZERO_CALL (MPN_ZERO (wp, s->size));
+}
+
+
+int
+speed_randinit (struct speed_params *s, gmp_randstate_ptr rstate)
+{
+  if (s->r == 0)
+    gmp_randinit_default (rstate);
+  else if (s->r == 1)
+    gmp_randinit_mt (rstate);
+  else
+    {
+      return gmp_randinit_lc_2exp_size (rstate, s->r);
+    }
+  return 1;
+}
+
+double
+speed_gmp_randseed (struct speed_params *s)
+{
+  gmp_randstate_t  rstate;
+  unsigned  i;
+  double    t;
+  mpz_t     x;
+
+  SPEED_RESTRICT_COND (s->size >= 1);
+  SPEED_RESTRICT_COND (speed_randinit (s, rstate));
+
+  /* s->size bits of seed */
+  mpz_init_set_n (x, s->xp, s->size);
+  mpz_fdiv_r_2exp (x, x, (unsigned long) s->size);
+
+  /* cache priming */
+  gmp_randseed (rstate, x);
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    gmp_randseed (rstate, x);
+  while (--i != 0);
+  t = speed_endtime ();
+
+  gmp_randclear (rstate);
+  mpz_clear (x);
+  return t;
+}
+
+double
+speed_gmp_randseed_ui (struct speed_params *s)
+{
+  gmp_randstate_t  rstate;
+  unsigned  i, j;
+  double    t;
+
+  SPEED_RESTRICT_COND (speed_randinit (s, rstate));
+
+  /* cache priming */
+  gmp_randseed_ui (rstate, 123L);
+
+  speed_starttime ();
+  i = s->reps;
+  j = 0;
+  do
+    {
+      gmp_randseed_ui (rstate, (unsigned long) s->xp_block[j]);
+      j++;
+      if (j >= SPEED_BLOCK_SIZE)
+	j = 0;
+    }
+  while (--i != 0);
+  t = speed_endtime ();
+
+  gmp_randclear (rstate);
+  return t;
+}
+
+double
+speed_mpz_urandomb (struct speed_params *s)
+{
+  gmp_randstate_t  rstate;
+  mpz_t     z;
+  unsigned  i;
+  double    t;
+
+  SPEED_RESTRICT_COND (s->size >= 0);
+  SPEED_RESTRICT_COND (speed_randinit (s, rstate));
+
+  mpz_init (z);
+
+  /* cache priming */
+  mpz_urandomb (z, rstate, (unsigned long) s->size);
+  mpz_urandomb (z, rstate, (unsigned long) s->size);
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    mpz_urandomb (z, rstate, (unsigned long) s->size);
+  while (--i != 0);
+  t = speed_endtime ();
+
+  mpz_clear (z);
+  gmp_randclear (rstate);
+  return t;
+}

diff --git a/third_party/gmp/tune/div_qr_1_tune.c b/third_party/gmp/tune/div_qr_1_tune.c
new file mode 100644
index 0000000..ef680ee
--- /dev/null
+++ b/third_party/gmp/tune/div_qr_1_tune.c

@@ -0,0 +1,46 @@
+/* mpn/generic/div_qr_1, using tuned threshold and method.
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define TUNE_PROGRAM_BUILD 1
+
+#include "gmp-impl.h"
+
+mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+
+#if !HAVE_NATIVE_mpn_div_qr_1n_pi1
+#define __gmpn_div_qr_1n_pi1 \
+  (div_qr_1n_pi1_method == 1 ? mpn_div_qr_1n_pi1_1 : mpn_div_qr_1n_pi1_2)
+#endif
+
+#undef mpn_div_qr_1
+#define mpn_div_qr_1 mpn_div_qr_1_tune
+
+#include "mpn/generic/div_qr_1.c"

diff --git a/third_party/gmp/tune/div_qr_1n_pi1_1.c b/third_party/gmp/tune/div_qr_1n_pi1_1.c
new file mode 100644
index 0000000..e64a3c7
--- /dev/null
+++ b/third_party/gmp/tune/div_qr_1n_pi1_1.c

@@ -0,0 +1,38 @@
+/* mpn/generic/div_qr_1n_pi1.c method 1.
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef DIV_QR_1N_METHOD
+#define DIV_QR_1N_METHOD 1
+#undef mpn_div_qr_1n_pi1
+#define mpn_div_qr_1n_pi1 mpn_div_qr_1n_pi1_1
+
+#include "mpn/generic/div_qr_1n_pi1.c"

diff --git a/third_party/gmp/tune/div_qr_1n_pi1_2.c b/third_party/gmp/tune/div_qr_1n_pi1_2.c
new file mode 100644
index 0000000..c5432ea
--- /dev/null
+++ b/third_party/gmp/tune/div_qr_1n_pi1_2.c

@@ -0,0 +1,38 @@
+/* mpn/generic/div_qr_1n_pi1.c method 2.
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef DIV_QR_1N_METHOD
+#define DIV_QR_1N_METHOD 2
+#undef mpn_div_qr_1n_pi1
+#define mpn_div_qr_1n_pi1 mpn_div_qr_1n_pi1_2
+
+#include "mpn/generic/div_qr_1n_pi1.c"

diff --git a/third_party/gmp/tune/divrem1div.c b/third_party/gmp/tune/divrem1div.c
new file mode 100644
index 0000000..0089971
--- /dev/null
+++ b/third_party/gmp/tune/divrem1div.c

@@ -0,0 +1,41 @@
+/* mpn/generic/divrem_1.c forced to use plain udiv_qrnnd.
+
+Copyright 2000, 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define OPERATION_divrem_1
+
+#include "gmp-impl.h"
+
+#undef DIVREM_1_NORM_THRESHOLD
+#undef DIVREM_1_UNNORM_THRESHOLD
+#define DIVREM_1_NORM_THRESHOLD    MP_SIZE_T_MAX
+#define DIVREM_1_UNNORM_THRESHOLD  MP_SIZE_T_MAX
+#define __gmpn_divrem_1  mpn_divrem_1_div
+
+#include "mpn/generic/divrem_1.c"

diff --git a/third_party/gmp/tune/divrem1inv.c b/third_party/gmp/tune/divrem1inv.c
new file mode 100644
index 0000000..82c8528
--- /dev/null
+++ b/third_party/gmp/tune/divrem1inv.c

@@ -0,0 +1,41 @@
+/* mpn/generic/divrem_1.c forced to use mul-by-inverse udiv_qrnnd_preinv.
+
+Copyright 2000, 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define OPERATION_divrem_1
+
+#include "gmp-impl.h"
+
+#undef DIVREM_1_NORM_THRESHOLD
+#undef DIVREM_1_UNNORM_THRESHOLD
+#define DIVREM_1_NORM_THRESHOLD    0
+#define DIVREM_1_UNNORM_THRESHOLD  0
+#define __gmpn_divrem_1  mpn_divrem_1_inv
+
+#include "mpn/generic/divrem_1.c"

diff --git a/third_party/gmp/tune/divrem2div.c b/third_party/gmp/tune/divrem2div.c
new file mode 100644
index 0000000..8331d8f
--- /dev/null
+++ b/third_party/gmp/tune/divrem2div.c

@@ -0,0 +1,40 @@
+/* mpn/generic/divrem_2.c forced to use plain udiv_qrnnd. */
+
+/*
+Copyright 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#ifdef DIVREM_2_THRESHOLD
+#undef DIVREM_2_THRESHOLD
+#endif
+#define DIVREM_2_THRESHOLD  MP_SIZE_T_MAX
+#define __gmpn_divrem_2     mpn_divrem_2_div
+
+#include "mpn/generic/divrem_2.c"

diff --git a/third_party/gmp/tune/divrem2inv.c b/third_party/gmp/tune/divrem2inv.c
new file mode 100644
index 0000000..8ae87f5
--- /dev/null
+++ b/third_party/gmp/tune/divrem2inv.c

@@ -0,0 +1,40 @@
+/* mpn/generic/divrem_2.c forced to use udiv_qrnnd_preinv. */
+
+/*
+Copyright 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#ifdef DIVREM_2_THRESHOLD
+#undef DIVREM_2_THRESHOLD
+#endif
+#define DIVREM_2_THRESHOLD  0
+#define __gmpn_divrem_2     mpn_divrem_2_inv
+
+#include "mpn/generic/divrem_2.c"

diff --git a/third_party/gmp/tune/freq.c b/third_party/gmp/tune/freq.c
new file mode 100644
index 0000000..ee38506
--- /dev/null
+++ b/third_party/gmp/tune/freq.c

@@ -0,0 +1,893 @@
+/* CPU frequency determination.
+
+Copyright 1999-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/* Currently we don't get a CPU frequency on the following systems,
+
+   alphaev5-cray-unicosmk2.0.6.X
+       times() has been seen at 13.33 ns (75 MHz), which is probably not the
+       cpu frequency.  Measuring the cycle counter against that would be
+       possible though.  But currently we don't use the cycle counter due to
+       unicos having int==8bytes where tune/alpha.asm assumes int==4bytes.
+
+   m68040-unknown-netbsd1.4.1
+       Not sure if the system even knows the cpu frequency.  There's no
+       cycle counter to measure, though we could perhaps make a loop taking
+       a known number of cycles and measure that.
+
+   power-ibm-aix4.2.1.0
+   power2-ibm-aix4.3.1.0
+   powerpc604-ibm-aix4.3.1.0
+   powerpc604-ibm-aix4.3.3.0
+   powerpc630-ibm-aix4.3.3.0
+   powerpc-unknown-netbsd1.6
+       Don't know where any info hides on these.  mftb is not related to the
+       cpu frequency so doesn't help.
+
+   sparc-unknown-linux-gnu [maybe]
+       Don't know where any info hides on this.
+
+   t90-cray-unicos10.0.X
+       The times() call seems to be for instance 2.22 nanoseconds, which
+       might be the cpu frequency (450 mhz), but need to confirm that.
+
+*/
+
+#include "config.h"
+
+#if HAVE_INVENT_H
+#include <invent.h> /* for IRIX invent_cpuinfo_t */
+#endif
+
+#include <stdio.h>
+#include <stdlib.h> /* for getenv, qsort */
+#include <string.h> /* for memcmp */
+
+#if HAVE_UNISTD_H
+#include <unistd.h> /* for sysconf */
+#endif
+
+#include <sys/types.h>
+
+#if HAVE_SYS_ATTRIBUTES_H
+#include <sys/attributes.h>   /* for IRIX attr_get(), needs sys/types.h */
+#endif
+
+#if HAVE_SYS_IOGRAPH_H
+#include <sys/iograph.h>      /* for IRIX INFO_LBL_DETAIL_INVENT */
+#endif
+
+#if HAVE_SYS_PARAM_H     /* for constants needed by NetBSD <sys/sysctl.h> */
+#include <sys/param.h>   /* and needed by HPUX <sys/pstat.h> */
+#endif
+
+#if HAVE_SYS_PSTAT_H
+#include <sys/pstat.h>   /* for HPUX pstat_getprocessor() */
+#endif
+
+#if HAVE_SYS_SYSCTL_H
+#include <sys/sysctl.h>  /* for sysctlbyname() */
+#endif
+
+#if TIME_WITH_SYS_TIME
+# include <sys/time.h>  /* for struct timeval */
+# include <time.h>
+#else
+# if HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+
+#if HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>  /* for struct rusage */
+#endif
+
+#if HAVE_SYS_PROCESSOR_H
+#include <sys/processor.h>  /* for solaris processor_info_t */
+#endif
+
+/* On AIX 5.1 with gcc 2.9-aix51-020209 in -maix64 mode, <sys/sysinfo.h>
+   gets an error about "fill" in "struct cpuinfo" having a negative size,
+   apparently due to __64BIT_KERNEL not being defined because _KERNEL is not
+   defined.  Avoid this file if we don't actually need it, which we don't on
+   AIX since there's no getsysinfo there.  */
+#if HAVE_SYS_SYSINFO_H && HAVE_GETSYSINFO
+#include <sys/sysinfo.h>  /* for OSF getsysinfo */
+#endif
+
+#if HAVE_MACHINE_HAL_SYSINFO_H
+#include <machine/hal_sysinfo.h>  /* for OSF GSI_CPU_INFO, struct cpu_info */
+#endif
+
+/* Remove definitions from NetBSD <sys/param.h>, to avoid conflicts with
+   gmp-impl.h. */
+#ifdef MIN
+#undef MIN
+#endif
+#ifdef MAX
+#undef MAX
+#endif
+
+#include "gmp-impl.h"
+
+#include "speed.h"
+
+
+#define HELP(str)                       \
+  if (help)                             \
+    {                                   \
+      printf ("    - %s\n", str);       \
+      return 0;                         \
+    }
+
+
+/* GMP_CPU_FREQUENCY environment variable.  Should be in Hertz and can be
+   floating point, for example "450e6". */
+static int
+freq_environment (int help)
+{
+  char  *e;
+
+  HELP ("environment variable GMP_CPU_FREQUENCY (in Hertz)");
+
+  e = getenv ("GMP_CPU_FREQUENCY");
+  if (e == NULL)
+    return 0;
+
+  speed_cycletime = 1.0 / atof (e);
+
+  if (speed_option_verbose)
+    printf ("Using GMP_CPU_FREQUENCY %.2f for cycle time %.3g\n",
+            atof (e), speed_cycletime);
+
+  return 1;
+}
+
+
+/* getsysinfo is available on OSF, or 4.0 and up at least.
+   The man page (on 4.0) suggests a 0 return indicates information not
+   available, but that seems to be the normal return for GSI_CPU_INFO.  */
+static int
+freq_getsysinfo (int help)
+{
+#if HAVE_GETSYSINFO
+  struct cpu_info  c;
+  int              start;
+
+  HELP ("getsysinfo() GSI_CPU_INFO");
+
+  start = 0;
+  if (getsysinfo (GSI_CPU_INFO, (caddr_t) &c, sizeof (c),
+                  &start, NULL, NULL) != -1)
+    {
+      speed_cycletime = 1e-6 / (double) c.mhz;
+      if (speed_option_verbose)
+        printf ("Using getsysinfo() GSI_CPU_INFO %u for cycle time %.3g\n",
+                c.mhz, speed_cycletime);
+      return 1;
+    }
+#endif
+  return 0;
+}
+
+
+/* In HPUX 10 and up, pstat_getprocessor() psp_iticksperclktick is the
+   number of CPU cycles (ie. the CR16 register) per CLK_TCK.  HPUX 9 doesn't
+   have that field in pst_processor though, and has no apparent
+   equivalent.  */
+
+static int
+freq_pstat_getprocessor (int help)
+{
+#if HAVE_PSTAT_GETPROCESSOR && HAVE_PSP_ITICKSPERCLKTICK
+  struct pst_processor  p;
+
+  HELP ("pstat_getprocessor() psp_iticksperclktick");
+
+  if (pstat_getprocessor (&p, sizeof(p), 1, 0) != -1)
+    {
+      long  c = clk_tck();
+      speed_cycletime = 1.0 / (c * p.psp_iticksperclktick);
+      if (speed_option_verbose)
+        printf ("Using pstat_getprocessor() psp_iticksperclktick %lu and clk_tck %ld for cycle time %.3g\n",
+                (unsigned long) p.psp_iticksperclktick, c,
+                speed_cycletime);
+      return 1;
+    }
+#endif
+  return 0;
+}
+
+
+/* i386 FreeBSD 2.2.8 sysctlbyname machdep.i586_freq is in Hertz.
+   There's no obvious defines available to get this from plain sysctl.  */
+static int
+freq_sysctlbyname_i586_freq (int help)
+{
+#if HAVE_SYSCTLBYNAME
+  unsigned  val;
+  size_t    size;
+
+  HELP ("sysctlbyname() machdep.i586_freq");
+
+  size = sizeof(val);
+  if (sysctlbyname ("machdep.i586_freq", &val, &size, NULL, 0) == 0
+      && size == sizeof(val))
+    {
+      speed_cycletime = 1.0 / (double) val;
+      if (speed_option_verbose)
+        printf ("Using sysctlbyname() machdep.i586_freq %u for cycle time %.3g\n",
+                val, speed_cycletime);
+      return 1;
+    }
+#endif
+  return 0;
+}
+
+
+/* i368 FreeBSD 3.3 sysctlbyname machdep.tsc_freq is in Hertz.
+   There's no obvious defines to get this from plain sysctl.  */
+
+static int
+freq_sysctlbyname_tsc_freq (int help)
+{
+#if HAVE_SYSCTLBYNAME
+  unsigned  val;
+  size_t    size;
+
+  HELP ("sysctlbyname() machdep.tsc_freq");
+
+  size = sizeof(val);
+  if (sysctlbyname ("machdep.tsc_freq", &val, &size, NULL, 0) == 0
+      && size == sizeof(val))
+    {
+      speed_cycletime = 1.0 / (double) val;
+      if (speed_option_verbose)
+        printf ("Using sysctlbyname() machdep.tsc_freq %u for cycle time %.3g\n",
+                val, speed_cycletime);
+      return 1;
+    }
+#endif
+  return 0;
+}
+
+
+/* Apple powerpc Darwin 1.3 sysctl hw.cpufrequency is in hertz.  For some
+   reason only seems to be available from sysctl(), not sysctlbyname().  */
+
+static int
+freq_sysctl_hw_cpufrequency (int help)
+{
+#if HAVE_SYSCTL && defined (CTL_HW) && defined (HW_CPU_FREQ)
+  int       mib[2];
+  unsigned  val;
+  size_t    size;
+
+  HELP ("sysctl() hw.cpufrequency");
+
+  mib[0] = CTL_HW;
+  mib[1] = HW_CPU_FREQ;
+  size = sizeof(val);
+  if (sysctl (mib, 2, &val, &size, NULL, 0) == 0)
+    {
+      speed_cycletime = 1.0 / (double) val;
+      if (speed_option_verbose)
+        printf ("Using sysctl() hw.cpufrequency %u for cycle time %.3g\n",
+                val, speed_cycletime);
+      return 1;
+    }
+#endif
+  return 0;
+}
+
+
+/* The following ssyctl hw.model strings have been observed,
+
+       Alpha FreeBSD 4.1:   Digital AlphaPC 164LX 599 MHz
+       NetBSD 1.4:          Digital AlphaPC 164LX 599 MHz
+       NetBSD 1.6.1:        CY7C601 @ 40 MHz, TMS390C602A FPU
+
+   NetBSD 1.4 doesn't seem to have sysctlbyname, so sysctl() is used.  */
+
+static int
+freq_sysctl_hw_model (int help)
+{
+#if HAVE_SYSCTL && defined (CTL_HW) && defined (HW_MODEL)
+  int       mib[2];
+  char      str[128];
+  unsigned  val;
+  size_t    size;
+  char      *p;
+  int       end;
+
+  HELP ("sysctl() hw.model");
+
+  mib[0] = CTL_HW;
+  mib[1] = HW_MODEL;
+  size = sizeof(str);
+  if (sysctl (mib, 2, str, &size, NULL, 0) == 0)
+    {
+      for (p = str; *p != '\0'; p++)
+        {
+          end = 0;
+          if (sscanf (p, "%u MHz%n", &val, &end) == 1 && end != 0)
+            {
+              speed_cycletime = 1e-6 / (double) val;
+              if (speed_option_verbose)
+                printf ("Using sysctl() hw.model %u for cycle time %.3g\n",
+                        val, speed_cycletime);
+              return 1;
+            }
+        }
+    }
+#endif
+  return 0;
+}
+
+
+/* /proc/cpuinfo for linux kernel.
+
+   Linux doesn't seem to have any system call to get the CPU frequency, at
+   least not in 2.0.x or 2.2.x, so it's necessary to read /proc/cpuinfo.
+
+   i386 2.0.36 - "bogomips" is the CPU frequency.
+
+   i386 2.2.13 - has both "cpu MHz" and "bogomips", and it's "cpu MHz" which
+                 is the frequency.
+
+   alpha 2.2.5 - "cycle frequency [Hz]" seems to be right, "BogoMIPS" is
+                 very slightly different.
+
+   alpha 2.2.18pre21 - "cycle frequency [Hz]" is 0 on at least one system,
+                 "BogoMIPS" seems near enough.
+
+   powerpc 2.2.19 - "clock" is the frequency, bogomips is something weird
+  */
+
+static int
+freq_proc_cpuinfo (int help)
+{
+  FILE    *fp;
+  char    buf[128];
+  double  val;
+  int     ret = 0;
+  int     end;
+
+  HELP ("linux kernel /proc/cpuinfo file, cpu MHz or bogomips");
+
+  if ((fp = fopen ("/proc/cpuinfo", "r")) != NULL)
+    {
+      while (fgets (buf, sizeof (buf), fp) != NULL)
+        {
+          if (sscanf (buf, "cycle frequency [Hz]    : %lf", &val) == 1
+              && val != 0.0)
+            {
+              speed_cycletime = 1.0 / val;
+              if (speed_option_verbose)
+                printf ("Using /proc/cpuinfo \"cycle frequency\" %.2f for cycle time %.3g\n", val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+          if (sscanf (buf, "cpu MHz : %lf\n", &val) == 1)
+            {
+              speed_cycletime = 1e-6 / val;
+              if (speed_option_verbose)
+                printf ("Using /proc/cpuinfo \"cpu MHz\" %.2f for cycle time %.3g\n", val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+          end = 0;
+          if (sscanf (buf, "clock : %lfMHz\n%n", &val, &end) == 1 && end != 0)
+            {
+              speed_cycletime = 1e-6 / val;
+              if (speed_option_verbose)
+                printf ("Using /proc/cpuinfo \"clock\" %.2f for cycle time %.3g\n", val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+          if (sscanf (buf, "bogomips : %lf\n", &val) == 1
+              || sscanf (buf, "BogoMIPS : %lf\n", &val) == 1)
+            {
+              speed_cycletime = 1e-6 / val;
+              if (speed_option_verbose)
+                printf ("Using /proc/cpuinfo \"bogomips\" %.2f for cycle time %.3g\n", val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+        }
+      fclose (fp);
+    }
+  return ret;
+}
+
+
+/* /bin/sysinfo for SunOS 4.
+   Prints a line like: cpu0 is a "75 MHz TI,TMS390Z55" CPU */
+static int
+freq_sunos_sysinfo (int help)
+{
+  int     ret = 0;
+#if HAVE_POPEN
+  FILE    *fp;
+  char    buf[128];
+  double  val;
+  int     end;
+
+  HELP ("SunOS /bin/sysinfo program output, cpu0");
+
+  /* Error messages are sent to /dev/null in case /bin/sysinfo doesn't
+     exist.  The brackets are necessary for some shells. */
+  if ((fp = popen ("(/bin/sysinfo) 2>/dev/null", "r")) != NULL)
+    {
+      while (fgets (buf, sizeof (buf), fp) != NULL)
+        {
+          end = 0;
+          if (sscanf (buf, " cpu0 is a \"%lf MHz%n", &val, &end) == 1
+              && end != 0)
+            {
+              speed_cycletime = 1e-6 / val;
+              if (speed_option_verbose)
+                printf ("Using /bin/sysinfo \"cpu0 MHz\" %.2f for cycle time %.3g\n", val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+        }
+      pclose (fp);
+    }
+#endif
+  return ret;
+}
+
+
+/* "/etc/hw -r cpu" for SCO OpenUnix 8, printing a line like
+	The speed of the CPU is approximately 450MHz
+ */
+static int
+freq_sco_etchw (int help)
+{
+  int     ret = 0;
+#if HAVE_POPEN
+  FILE    *fp;
+  char    buf[128];
+  double  val;
+  int     end;
+
+  HELP ("SCO /etc/hw program output");
+
+  /* Error messages are sent to /dev/null in case /etc/hw doesn't exist.
+     The brackets are necessary for some shells. */
+  if ((fp = popen ("(/etc/hw -r cpu) 2>/dev/null", "r")) != NULL)
+    {
+      while (fgets (buf, sizeof (buf), fp) != NULL)
+        {
+          end = 0;
+          if (sscanf (buf, " The speed of the CPU is approximately %lfMHz%n",
+                      &val, &end) == 1 && end != 0)
+            {
+              speed_cycletime = 1e-6 / val;
+              if (speed_option_verbose)
+                printf ("Using /etc/hw %.2f MHz, for cycle time %.3g\n",
+                        val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+        }
+      pclose (fp);
+    }
+#endif
+  return ret;
+}
+
+
+/* attr_get("/hw/cpunum/0",INFO_LBL_DETAIL_INVENT) ic_cpu_info.cpufq for
+   IRIX 6.5.  Past versions don't have INFO_LBL_DETAIL_INVENT,
+   invent_cpuinfo_t, or /hw/cpunum/0.
+
+   The same information is available from the "hinv -c processor" command,
+   but it seems better to make a system call where possible. */
+
+static int
+freq_attr_get_invent (int help)
+{
+  int     ret = 0;
+#if HAVE_ATTR_GET && HAVE_INVENT_H && defined (INFO_LBL_DETAIL_INVENT)
+  invent_cpuinfo_t  inv;
+  int               len, val;
+
+  HELP ("attr_get(\"/hw/cpunum/0\") ic_cpu_info.cpufq");
+
+  len = sizeof (inv);
+  if (attr_get ("/hw/cpunum/0", INFO_LBL_DETAIL_INVENT,
+                (char *) &inv, &len, 0) == 0
+      && len == sizeof (inv)
+      && inv.ic_gen.ig_invclass == INV_PROCESSOR)
+    {
+      val = inv.ic_cpu_info.cpufq;
+      speed_cycletime = 1e-6 / val;
+      if (speed_option_verbose)
+        printf ("Using attr_get(\"/hw/cpunum/0\") ic_cpu_info.cpufq %d MHz for cycle time %.3g\n", val, speed_cycletime);
+      ret = 1;
+    }
+#endif
+  return ret;
+}
+
+
+/* FreeBSD on i386 gives a line like the following at bootup, and which can
+   be read back from /var/run/dmesg.boot.
+
+       CPU: AMD Athlon(tm) Processor (755.29-MHz 686-class CPU)
+       CPU: Pentium 4 (1707.56-MHz 686-class CPU)
+       CPU: i486 DX4 (486-class CPU)
+
+   This is useful on FreeBSD 4.x, where there's no sysctl machdep.tsc_freq
+   or machdep.i586_freq.
+
+   It's better to use /var/run/dmesg.boot than to run /sbin/dmesg, since the
+   latter prints the current system message buffer, which is a limited size
+   and can wrap around if the system is up for a long time.  */
+
+static int
+freq_bsd_dmesg (int help)
+{
+  FILE    *fp;
+  char    buf[256], *p;
+  double  val;
+  int     ret = 0;
+  int     end;
+
+  HELP ("BSD /var/run/dmesg.boot file");
+
+  if ((fp = fopen ("/var/run/dmesg.boot", "r")) != NULL)
+    {
+      while (fgets (buf, sizeof (buf), fp) != NULL)
+        {
+          if (memcmp (buf, "CPU:", 4) == 0)
+            {
+              for (p = buf; *p != '\0'; p++)
+                {
+                  end = 0;
+                  if (sscanf (p, "(%lf-MHz%n", &val, &end) == 1 && end != 0)
+                    {
+                      speed_cycletime = 1e-6 / val;
+                      if (speed_option_verbose)
+                        printf ("Using /var/run/dmesg.boot CPU: %.2f MHz for cycle time %.3g\n", val, speed_cycletime);
+                      ret = 1;
+                      break;
+                    }
+                }
+            }
+        }
+      fclose (fp);
+    }
+  return ret;
+}
+
+
+/* "hinv -c processor" for IRIX.  The following lines have been seen,
+
+              1 150 MHZ IP20 Processor
+              2 195 MHZ IP27 Processors
+              Processor 0: 500 MHZ IP35
+
+   This information is available from attr_get() on IRIX 6.5 (see above),
+   but on IRIX 6.2 it's not clear where to look, so fall back on
+   parsing.  */
+
+static int
+freq_irix_hinv (int help)
+{
+  int     ret = 0;
+#if HAVE_POPEN
+  FILE    *fp;
+  char    buf[128];
+  double  val;
+  int     nproc, end;
+
+  HELP ("IRIX \"hinv -c processor\" output");
+
+  /* Error messages are sent to /dev/null in case hinv doesn't exist.  The
+     brackets are necessary for some shells. */
+  if ((fp = popen ("(hinv -c processor) 2>/dev/null", "r")) != NULL)
+    {
+      while (fgets (buf, sizeof (buf), fp) != NULL)
+        {
+          end = 0;
+          if (sscanf (buf, "Processor 0: %lf MHZ%n", &val, &end) == 1
+              && end != 0)
+            {
+            found:
+              speed_cycletime = 1e-6 / val;
+              if (speed_option_verbose)
+                printf ("Using hinv -c processor \"%.2f MHZ\" for cycle time %.3g\n", val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+          end = 0;
+          if (sscanf (buf, "%d %lf MHZ%n", &nproc, &val, &end) == 2
+              && end != 0)
+            goto found;
+        }
+      pclose (fp);
+    }
+#endif
+  return ret;
+}
+
+
+/* processor_info() for Solaris.  "psrinfo" is the command-line interface to
+   this.  "prtconf -vp" gives similar information.
+
+   Apple Darwin has a processor_info, but in an incompatible style.  It
+   doesn't have <sys/processor.h>, so test for that.  */
+
+static int
+freq_processor_info (int help)
+{
+#if HAVE_PROCESSOR_INFO && HAVE_SYS_PROCESSOR_H
+  processor_info_t  p;
+  int  i, n, mhz = 0;
+
+  HELP ("processor_info() pi_clock");
+
+  n = sysconf (_SC_NPROCESSORS_CONF);
+  for (i = 0; i < n; i++)
+    {
+      if (processor_info (i, &p) != 0)
+        continue;
+      if (p.pi_state != P_ONLINE)
+        continue;
+
+      if (mhz != 0 && p.pi_clock != mhz)
+        {
+          fprintf (stderr,
+                   "freq_processor_info(): There's more than one CPU and they have different clock speeds\n");
+          return 0;
+        }
+
+      mhz = p.pi_clock;
+    }
+
+  speed_cycletime = 1.0e-6 / (double) mhz;
+
+  if (speed_option_verbose)
+    printf ("Using processor_info() %d mhz for cycle time %.3g\n",
+            mhz, speed_cycletime);
+  return 1;
+
+#else
+  return 0;
+#endif
+}
+
+
+#if HAVE_SPEED_CYCLECOUNTER && HAVE_GETTIMEOFDAY
+static double
+freq_measure_gettimeofday_one (void)
+{
+#define call_gettimeofday(t)   gettimeofday (&(t), NULL)
+#define timeval_tv_sec(t)      ((t).tv_sec)
+#define timeval_tv_usec(t)     ((t).tv_usec)
+  FREQ_MEASURE_ONE ("gettimeofday", struct timeval,
+                    call_gettimeofday, speed_cyclecounter,
+                    timeval_tv_sec, timeval_tv_usec);
+}
+#endif
+
+#if HAVE_SPEED_CYCLECOUNTER && HAVE_GETRUSAGE
+static double
+freq_measure_getrusage_one (void)
+{
+#define call_getrusage(t)   getrusage (0, &(t))
+#define rusage_tv_sec(t)    ((t).ru_utime.tv_sec)
+#define rusage_tv_usec(t)   ((t).ru_utime.tv_usec)
+  FREQ_MEASURE_ONE ("getrusage", struct rusage,
+                    call_getrusage, speed_cyclecounter,
+                    rusage_tv_sec, rusage_tv_usec);
+}
+#endif
+
+
+/* MEASURE_MATCH is how many readings within MEASURE_TOLERANCE of each other
+   are required.  This must be at least 2.  */
+#define MEASURE_MAX_ATTEMPTS   20
+#define MEASURE_TOLERANCE      1.005  /* 0.5% */
+#define MEASURE_MATCH          3
+
+double
+freq_measure (const char *name, double (*one) (void))
+{
+  double  t[MEASURE_MAX_ATTEMPTS];
+  int     i, j;
+
+  for (i = 0; i < numberof (t); i++)
+    {
+      t[i] = (*one) ();
+
+      qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
+      if (speed_option_verbose >= 3)
+        for (j = 0; j <= i; j++)
+          printf ("   t[%d] is %.6g\n", j, t[j]);
+
+      for (j = 0; j+MEASURE_MATCH-1 <= i; j++)
+        {
+          if (t[j+MEASURE_MATCH-1] <= t[j] * MEASURE_TOLERANCE)
+            {
+              /* use the average of the range found */
+                return (t[j+MEASURE_MATCH-1] + t[j]) / 2.0;
+            }
+        }
+    }
+  return -1.0;
+}
+
+static int
+freq_measure_getrusage (int help)
+{
+#if HAVE_SPEED_CYCLECOUNTER && HAVE_GETRUSAGE
+  double  cycletime;
+
+  if (! getrusage_microseconds_p ())
+    return 0;
+  if (! cycles_works_p ())
+    return 0;
+
+  HELP ("cycle counter measured with microsecond getrusage()");
+
+  cycletime = freq_measure ("getrusage", freq_measure_getrusage_one);
+  if (cycletime == -1.0)
+    return 0;
+
+  speed_cycletime = cycletime;
+  if (speed_option_verbose)
+    printf ("Using getrusage() measured cycle counter %.4g (%.2f MHz)\n",
+            speed_cycletime, 1e-6/speed_cycletime);
+  return 1;
+
+#else
+  return 0;
+#endif
+}
+
+static int
+freq_measure_gettimeofday (int help)
+{
+#if HAVE_SPEED_CYCLECOUNTER && HAVE_GETTIMEOFDAY
+  double  cycletime;
+
+  if (! gettimeofday_microseconds_p ())
+    return 0;
+  if (! cycles_works_p ())
+    return 0;
+
+  HELP ("cycle counter measured with microsecond gettimeofday()");
+
+  cycletime = freq_measure ("gettimeofday", freq_measure_gettimeofday_one);
+  if (cycletime == -1.0)
+    return 0;
+
+  speed_cycletime = cycletime;
+  if (speed_option_verbose)
+    printf ("Using gettimeofday() measured cycle counter %.4g (%.2f MHz)\n",
+            speed_cycletime, 1e-6/speed_cycletime);
+  return 1;
+#else
+  return 0;
+#endif
+}
+
+
+/* Each function returns 1 if it succeeds in setting speed_cycletime, or 0
+   if not.
+
+   In general system call tests are first since they're fast, then file
+   tests, then tests running programs.  Necessary exceptions to this rule
+   are noted.  The measuring is last since it's time consuming, and rather
+   wasteful of cpu.  */
+
+static int
+freq_all (int help)
+{
+  return
+    /* This should be first, so an environment variable can override
+       anything the system gives. */
+    freq_environment (help)
+
+    || freq_attr_get_invent (help)
+    || freq_getsysinfo (help)
+    || freq_pstat_getprocessor (help)
+    || freq_sysctl_hw_model (help)
+    || freq_sysctl_hw_cpufrequency (help)
+    || freq_sysctlbyname_i586_freq (help)
+    || freq_sysctlbyname_tsc_freq (help)
+
+    /* SCO openunix 8 puts a dummy pi_clock==16 in processor_info, so be
+       sure to check /etc/hw before that function. */
+    || freq_sco_etchw (help)
+
+    || freq_processor_info (help)
+    || freq_proc_cpuinfo (help)
+    || freq_bsd_dmesg (help)
+    || freq_irix_hinv (help)
+    || freq_sunos_sysinfo (help)
+    || freq_measure_getrusage (help)
+    || freq_measure_gettimeofday (help);
+}
+
+
+void
+speed_cycletime_init (void)
+{
+  static int  attempted = 0;
+
+  if (attempted)
+    return;
+  attempted = 1;
+
+  if (freq_all (0))
+    return;
+
+  if (speed_option_verbose)
+    printf ("CPU frequency couldn't be determined\n");
+}
+
+
+void
+speed_cycletime_fail (const char *str)
+{
+  fprintf (stderr, "Measuring with: %s\n", speed_time_string);
+  fprintf (stderr, "%s,\n", str);
+  fprintf (stderr, "but none of the following are available,\n");
+  freq_all (1);
+  abort ();
+}
+
+/* speed_time_init leaves speed_cycletime set to either 0.0 or 1.0 when the
+   CPU frequency is unknown.  0.0 is when the time base is in seconds, so
+   that's no good if cycles are wanted.  1.0 is when the time base is in
+   cycles, which conversely is no good if seconds are wanted.  */
+void
+speed_cycletime_need_cycles (void)
+{
+  speed_time_init ();
+  if (speed_cycletime == 0.0)
+    speed_cycletime_fail
+      ("Need to know CPU frequency to give times in cycles");
+}
+void
+speed_cycletime_need_seconds (void)
+{
+  speed_time_init ();
+  if (speed_cycletime == 1.0)
+    speed_cycletime_fail
+      ("Need to know CPU frequency to convert cycles to seconds");
+}

diff --git a/third_party/gmp/tune/gcdext_double.c b/third_party/gmp/tune/gcdext_double.c
new file mode 100644
index 0000000..2b2ba15
--- /dev/null
+++ b/third_party/gmp/tune/gcdext_double.c

@@ -0,0 +1,38 @@
+/* mpn/generic/gcdext.c forced to use double limb calculations. */
+
+/*
+Copyright 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD  0
+#define __gmpn_gcdext  mpn_gcdext_double
+
+#include "../mpn/generic/gcdext.c"

diff --git a/third_party/gmp/tune/gcdext_single.c b/third_party/gmp/tune/gcdext_single.c
new file mode 100644
index 0000000..3c1d28c
--- /dev/null
+++ b/third_party/gmp/tune/gcdext_single.c

@@ -0,0 +1,38 @@
+/* mpn/generic/gcdext.c forced to use single limb calculations. */
+
+/*
+Copyright 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD  MP_SIZE_T_MAX
+#define __gmpn_gcdext  mpn_gcdext_single
+
+#include "../mpn/generic/gcdext.c"

diff --git a/third_party/gmp/tune/gcdextod.c b/third_party/gmp/tune/gcdextod.c
new file mode 100644
index 0000000..f40cae6
--- /dev/null
+++ b/third_party/gmp/tune/gcdextod.c

@@ -0,0 +1,39 @@
+/* mpn/generic/gcdext.c forced to one double limb step. */
+
+/*
+Copyright 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD  0
+#define WANT_GCDEXT_ONE_STEP 1
+#define __gmpn_gcdext  mpn_gcdext_one_double
+
+#include "../mpn/generic/gcdext.c"

diff --git a/third_party/gmp/tune/gcdextos.c b/third_party/gmp/tune/gcdextos.c
new file mode 100644
index 0000000..f51ff52
--- /dev/null
+++ b/third_party/gmp/tune/gcdextos.c

@@ -0,0 +1,39 @@
+/* mpn/generic/gcdext.c forced to one single limb step. */
+
+/*
+Copyright 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD  MP_SIZE_T_MAX
+#define WANT_GCDEXT_ONE_STEP 1
+#define __gmpn_gcdext  mpn_gcdext_one_single
+
+#include "../mpn/generic/gcdext.c"

diff --git a/third_party/gmp/tune/hgcd2-1.c b/third_party/gmp/tune/hgcd2-1.c
new file mode 100644
index 0000000..1e8948c
--- /dev/null
+++ b/third_party/gmp/tune/hgcd2-1.c

@@ -0,0 +1,39 @@
+/* mpn/generic/hgcd2.c method 1.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef HGCD2_DIV1_METHOD
+#define HGCD2_DIV1_METHOD 1
+#define __gmpn_hgcd2 mpn_hgcd2_1
+/* Not used, but renamed to not get duplicate definitions */
+#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_1
+
+#include "mpn/generic/hgcd2.c"

diff --git a/third_party/gmp/tune/hgcd2-2.c b/third_party/gmp/tune/hgcd2-2.c
new file mode 100644
index 0000000..bbb123b
--- /dev/null
+++ b/third_party/gmp/tune/hgcd2-2.c

@@ -0,0 +1,39 @@
+/* mpn/generic/hgcd2.c method 2.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef HGCD2_DIV1_METHOD
+#define HGCD2_DIV1_METHOD 2
+#define __gmpn_hgcd2 mpn_hgcd2_2
+/* Not used, but renamed to not get duplicate definitions */
+#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_2
+
+#include "mpn/generic/hgcd2.c"

diff --git a/third_party/gmp/tune/hgcd2-3.c b/third_party/gmp/tune/hgcd2-3.c
new file mode 100644
index 0000000..ac62108
--- /dev/null
+++ b/third_party/gmp/tune/hgcd2-3.c

@@ -0,0 +1,39 @@
+/* mpn/generic/hgcd2.c method 3.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef HGCD2_DIV1_METHOD
+#define HGCD2_DIV1_METHOD 3
+#define __gmpn_hgcd2 mpn_hgcd2_3
+/* Not used, but renamed to not get duplicate definitions */
+#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_3
+
+#include "mpn/generic/hgcd2.c"

diff --git a/third_party/gmp/tune/hgcd2-4.c b/third_party/gmp/tune/hgcd2-4.c
new file mode 100644
index 0000000..ec7f927
--- /dev/null
+++ b/third_party/gmp/tune/hgcd2-4.c

@@ -0,0 +1,39 @@
+/* mpn/generic/hgcd2.c method 4.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef HGCD2_DIV1_METHOD
+#define HGCD2_DIV1_METHOD 4
+#define __gmpn_hgcd2 mpn_hgcd2_4
+/* Not used, but renamed to not get duplicate definitions */
+#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_4
+
+#include "mpn/generic/hgcd2.c"

diff --git a/third_party/gmp/tune/hgcd2-5.c b/third_party/gmp/tune/hgcd2-5.c
new file mode 100644
index 0000000..ed66171
--- /dev/null
+++ b/third_party/gmp/tune/hgcd2-5.c

@@ -0,0 +1,39 @@
+/* mpn/generic/hgcd2.c method 5.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef HGCD2_DIV1_METHOD
+#define HGCD2_DIV1_METHOD 5
+#define __gmpn_hgcd2 mpn_hgcd2_5
+/* Not used, but renamed to not get duplicate definitions */
+#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_5
+
+#include "mpn/generic/hgcd2.c"

diff --git a/third_party/gmp/tune/hgcd2.c b/third_party/gmp/tune/hgcd2.c
new file mode 100644
index 0000000..146af72
--- /dev/null
+++ b/third_party/gmp/tune/hgcd2.c

@@ -0,0 +1,49 @@
+/* mpn/generic/hgcd2.c for tuning
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define TUNE_PROGRAM_BUILD 1
+
+#include "gmp-impl.h"
+
+hgcd2_func_t mpn_hgcd2_default;
+
+hgcd2_func_t *hgcd2_func = &mpn_hgcd2_default;
+
+int
+mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
+	   struct hgcd_matrix1 *M)
+{
+  return hgcd2_func(ah, al, bh, bl, M);
+}
+
+#undef mpn_hgcd2
+#define mpn_hgcd2 mpn_hgcd2_default
+
+#include "mpn/generic/hgcd2.c"

diff --git a/third_party/gmp/tune/hgcd_appr_lehmer.c b/third_party/gmp/tune/hgcd_appr_lehmer.c
new file mode 100644
index 0000000..aa43a07
--- /dev/null
+++ b/third_party/gmp/tune/hgcd_appr_lehmer.c

@@ -0,0 +1,39 @@
+/* mpn/generic/hgcd_appr.c forced to use Lehmer's quadratic algorithm. */
+
+/*
+Copyright 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef  HGCD_APPR_THRESHOLD
+#define HGCD_APPR_THRESHOLD MP_SIZE_T_MAX
+#define __gmpn_hgcd_appr  mpn_hgcd_appr_lehmer
+#define __gmpn_hgcd_appr_itch mpn_hgcd_appr_lehmer_itch
+
+#include "../mpn/generic/hgcd_appr.c"

diff --git a/third_party/gmp/tune/hgcd_lehmer.c b/third_party/gmp/tune/hgcd_lehmer.c
new file mode 100644
index 0000000..364749d
--- /dev/null
+++ b/third_party/gmp/tune/hgcd_lehmer.c

@@ -0,0 +1,39 @@
+/* mpn/generic/hgcd.c forced to use Lehmer's quadratic algorithm. */
+
+/*
+Copyright 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef  HGCD_THRESHOLD
+#define HGCD_THRESHOLD MP_SIZE_T_MAX
+#define __gmpn_hgcd  mpn_hgcd_lehmer
+#define __gmpn_hgcd_itch mpn_hgcd_lehmer_itch
+
+#include "../mpn/generic/hgcd.c"

diff --git a/third_party/gmp/tune/hgcd_reduce_1.c b/third_party/gmp/tune/hgcd_reduce_1.c
new file mode 100644
index 0000000..5052233
--- /dev/null
+++ b/third_party/gmp/tune/hgcd_reduce_1.c

@@ -0,0 +1,40 @@
+/* mpn/generic/hgcd_reduce.c forced to use hgcd. */
+
+/*
+Copyright 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef  HGCD_REDUCE_THRESHOLD
+#define HGCD_REDUCE_THRESHOLD MP_SIZE_T_MAX
+#define __gmpn_hgcd_reduce  mpn_hgcd_reduce_1
+#define __gmpn_hgcd_reduce_itch  mpn_hgcd_reduce_1_itch
+
+
+#include "../mpn/generic/hgcd_reduce.c"

diff --git a/third_party/gmp/tune/hgcd_reduce_2.c b/third_party/gmp/tune/hgcd_reduce_2.c
new file mode 100644
index 0000000..5d802e0
--- /dev/null
+++ b/third_party/gmp/tune/hgcd_reduce_2.c

@@ -0,0 +1,39 @@
+/* mpn/generic/hgcd_reduce.c forced to use hgcd_appr. */
+
+/*
+Copyright 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef  HGCD_REDUCE_THRESHOLD
+#define HGCD_REDUCE_THRESHOLD 0
+#define __gmpn_hgcd_reduce mpn_hgcd_reduce_2
+#define __gmpn_hgcd_reduce_itch mpn_hgcd_reduce_2_itch
+
+#include "../mpn/generic/hgcd_reduce.c"

diff --git a/third_party/gmp/tune/hppa.asm b/third_party/gmp/tune/hppa.asm
new file mode 100644
index 0000000..fc9d62e
--- /dev/null
+++ b/third_party/gmp/tune/hppa.asm

@@ -0,0 +1,42 @@
+dnl  HPPA 32-bit time stamp counter access routine.
+
+dnl  Copyright 2000, 2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl void speed_cyclecounter (unsigned p[2]);
+dnl
+dnl Get the HPPA interval timer.
+
+PROLOGUE(speed_cyclecounter)
+	mfctl	%cr16,%r28
+	stw	%r28,0(0,%r26)
+	bv	0(%r2)
+	stw	%r0,4(0,%r26)
+EPILOGUE(speed_cyclecounter)

diff --git a/third_party/gmp/tune/hppa2.asm b/third_party/gmp/tune/hppa2.asm
new file mode 100644
index 0000000..57ef4c4
--- /dev/null
+++ b/third_party/gmp/tune/hppa2.asm

@@ -0,0 +1,44 @@
+dnl  HPPA 64-bit time stamp counter access routine.
+
+dnl  Copyright 2000, 2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl void speed_cyclecounter (unsigned p[2]);
+dnl
+dnl Get the HPPA interval timer.
+
+	.level 2.0
+PROLOGUE(speed_cyclecounter)
+	mfctl	%cr16,%r28
+	stw	%r28,0(0,%r26)		; low word
+	extrd,u	%r28,31,32,%r28
+	bve	(%r2)
+	stw	%r28,4(0,%r26)		; high word
+EPILOGUE(speed_cyclecounter)

diff --git a/third_party/gmp/tune/hppa2w.asm b/third_party/gmp/tune/hppa2w.asm
new file mode 100644
index 0000000..215a0cc
--- /dev/null
+++ b/third_party/gmp/tune/hppa2w.asm

@@ -0,0 +1,44 @@
+dnl  HPPA 64-bit time stamp counter access routine.
+
+dnl  Copyright 2000, 2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl void speed_cyclecounter (unsigned p[2]);
+dnl
+dnl Get the HPPA interval timer.
+
+	.level 2.0w
+PROLOGUE(speed_cyclecounter)
+	mfctl	%cr16,%r28
+	stw	%r28,0(0,%r26)		; low word
+	extrd,u	%r28,31,32,%r28
+	bve	(%r2)
+	stw	%r28,4(0,%r26)		; high word
+EPILOGUE(speed_cyclecounter)

diff --git a/third_party/gmp/tune/ia64.asm b/third_party/gmp/tune/ia64.asm
new file mode 100644
index 0000000..0651111
--- /dev/null
+++ b/third_party/gmp/tune/ia64.asm

@@ -0,0 +1,47 @@
+dnl  IA-64 time stamp counter access routine.
+
+dnl  Copyright 2000, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C void speed_cyclecounter (unsigned int p[2]);
+C
+
+ASM_START()
+PROLOGUE(speed_cyclecounter)
+	mov	r14 = ar.itc
+	;;
+	st4	[r32] = r14, 4
+	shr.u	r14 = r14, 32
+	;;
+	st4	[r32] = r14
+	br.ret.sptk.many b0
+EPILOGUE(speed_cyclecounter)
+ASM_END()

diff --git a/third_party/gmp/tune/jacbase1.c b/third_party/gmp/tune/jacbase1.c
new file mode 100644
index 0000000..89a584d
--- /dev/null
+++ b/third_party/gmp/tune/jacbase1.c

@@ -0,0 +1,37 @@
+/* mpn/generic/jacbase.c method 1.
+
+Copyright 2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef JACOBI_BASE_METHOD
+#define JACOBI_BASE_METHOD 1
+#define __gmpn_jacobi_base mpn_jacobi_base_1
+
+#include "mpn/generic/jacbase.c"

diff --git a/third_party/gmp/tune/jacbase2.c b/third_party/gmp/tune/jacbase2.c
new file mode 100644
index 0000000..253d835
--- /dev/null
+++ b/third_party/gmp/tune/jacbase2.c

@@ -0,0 +1,37 @@
+/* mpn/generic/jacbase.c method 2.
+
+Copyright 2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef JACOBI_BASE_METHOD
+#define JACOBI_BASE_METHOD 2
+#define __gmpn_jacobi_base mpn_jacobi_base_2
+
+#include "mpn/generic/jacbase.c"

diff --git a/third_party/gmp/tune/jacbase3.c b/third_party/gmp/tune/jacbase3.c
new file mode 100644
index 0000000..4440f31
--- /dev/null
+++ b/third_party/gmp/tune/jacbase3.c

@@ -0,0 +1,37 @@
+/* mpn/generic/jacbase.c method 3.
+
+Copyright 2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef JACOBI_BASE_METHOD
+#define JACOBI_BASE_METHOD 3
+#define __gmpn_jacobi_base mpn_jacobi_base_3
+
+#include "mpn/generic/jacbase.c"

diff --git a/third_party/gmp/tune/jacbase4.c b/third_party/gmp/tune/jacbase4.c
new file mode 100644
index 0000000..daea3bb
--- /dev/null
+++ b/third_party/gmp/tune/jacbase4.c

@@ -0,0 +1,37 @@
+/* mpn/generic/jacbase.c method 4.
+
+Copyright 2002, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef JACOBI_BASE_METHOD
+#define JACOBI_BASE_METHOD 4
+#define __gmpn_jacobi_base mpn_jacobi_base_4
+
+#include "mpn/generic/jacbase.c"

diff --git a/third_party/gmp/tune/many.pl b/third_party/gmp/tune/many.pl
new file mode 100644
index 0000000..524a67d
--- /dev/null
+++ b/third_party/gmp/tune/many.pl

@@ -0,0 +1,1334 @@
+#! /usr/bin/perl -w
+
+# Copyright 2000-2002 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage:  cd $builddir/tune
+#	  perl $srcdir/tune/many.pl [-t] <files/dirs>...
+#
+# Output: speed-many.c
+#         try-many.c
+#         Makefile.many
+#
+# Make alternate versions of various mpn routines available for measuring
+# and testing.
+#
+# The $srcdir and $builddir in the invocation above just means the script
+# lives in the tune source directory, but should be run in the tune build
+# directory.  When not using a separate object directory this just becomes
+#
+#	cd tune
+#	perl many.pl [-t] <files/dirs>...
+#
+#
+# SINGLE FILES
+#
+# Suppose $HOME/newcode/mul_1_experiment.asm is a new implementation of
+# mpn_mul_1, then
+#
+#	cd $builddir/tune
+#	perl $srcdir/tune/many.pl $HOME/newcode/mul_1_experiment.asm
+#
+# will produce rules and renaming so that a speed program incorporating it
+# can be built,
+#
+#	make -f Makefile.many speed-many
+#
+# then for example it can be compared to the standard mul_1,
+#
+#	./speed-many -s 1-30 mpn_mul_1 mpn_mul_1_experiment
+#
+# An expanded try program can be used to check correctness,
+#
+#	make -f Makefile.many try-many
+#
+# and run
+#
+#	./try-many mpn_mul_1_experiment
+#
+# Files can be ".c", ".S" or ".asm".  ".s" files can't be used because they
+# don't get any preprocessing so there's no way to do renaming of their
+# functions.
+#
+#
+# WHOLE DIRECTORIES
+#
+# If a directory is given, then all files in it will be made available.
+# For example,
+#
+#	cd $builddir/tune
+#	perl $srcdir/tune/many.pl $HOME/newcode
+#
+# Each file should have a suffix, like "_experiment" above.
+#
+#
+# MPN DIRECTORIES
+#
+# mpn directories from the GMP source tree can be included, and this is a
+# convenient way to compare multiple implementations suiting different chips
+# in a CPU family.  For example the following would make all x86 routines
+# available,
+#
+#	cd $builddir/tune
+#	perl $srcdir/tune/many.pl `find $srcdir/mpn/x86 -type d`
+#
+# On a new x86 chip a comparison could then be made to see how existing code
+# runs.  For example,
+#
+#	make -f Makefile.many speed-many
+#	./speed-many -s 1-30 -c \
+#		mpn_add_n_x86 mpn_add_n_pentium mpn_add_n_k6 mpn_add_n_k7
+#
+# Files in "mpn" subdirectories don't need the "_experiment" style suffix
+# described above, instead a suffix is constructed from the subdirectory.
+# For example "mpn/x86/k7/mmx/mod_1.asm" will generate a function
+# mpn_mod_1_k7_mmx.  The rule is to take the last directory name after the
+# "mpn", or the last two if there's three or more.  (Check the generated
+# speed-many.c if in doubt.)
+#
+#
+# GENERIC C
+#
+# The mpn/generic directory can be included too, just like any processor
+# specific directory.  This is a good way to compare assembler and generic C
+# implementations.  For example,
+#
+#	cd $builddir/tune
+#	perl $srcdir/tune/many.pl $srcdir/mpn/generic
+#
+# or if just a few routines are of interest, then for example
+#
+#	cd $builddir/tune
+#	perl $srcdir/tune/many.pl \
+#		$srcdir/mpn/generic/lshift.c \
+#		$srcdir/mpn/generic/mod_1.c \
+#		$srcdir/mpn/generic/aorsmul_1.c
+#
+# giving mpn_lshift_generic etc.
+#
+#
+# TESTS/DEVEL PROGRAMS
+#
+# Makefile.many also has rules to build the tests/devel programs with suitable
+# renaming, and with some parameters for correctness or speed.  This is less
+# convenient than the speed and try programs, but provides an independent
+# check.  For example,
+#
+#	make -f Makefile.many tests_mul_1_experimental
+#	./tests_mul_1_experimental
+#
+# and for speed
+#
+#	make -f Makefile.many tests_mul_1_experimental_sp
+#	./tests_mul_1_experimental_sp
+#
+# Not all the programs support speed measuring, in which case only the
+# correctness test will be useful.
+#
+# The parameters for repetitions and host clock speed are -D defines.  Some
+# defaults are provided at the end of Makefile.many, but probably these will
+# want to be overridden.  For example,
+#
+#	rm tests_mul_1_experimental.o
+#	make -f Makefile.many \
+#	   CFLAGS_TESTS="-DSIZE=50 -DTIMES=1000 -DRANDOM -DCLOCK=175000000" \
+#	   tests_mul_1_experimental
+#	./tests_mul_1_experimental
+#
+#
+# OTHER NOTES
+#
+# The mappings of file names to functions, and the macros to then use for
+# speed measuring etc are driven by @table below.  The scheme isn't
+# completely general, it's only got as many variations as have been needed
+# so far.
+#
+# Some functions are only made available in speed-many, or others only in
+# try-many.  An @table entry speed=>none means no speed measuring is
+# available, or try=>none no try program testing.  These can be removed
+# if/when the respective programs get the necessary support.
+#
+# If a file has "1c" or "nc" carry-in entrypoints, they're renamed and made
+# available too.  These are recognised from PROLOGUE or MULFUNC_PROLOGUE in
+# .S and .asm files, or from a line starting with "mpn_foo_1c" in a .c file
+# (possibly via a #define), and on that basis are entirely optional.  This
+# entrypoint matching is done for the standard entrypoints too, but it would
+# be very unusual to have for instance a mul_1c without a mul_1.
+#
+# Some mpz files are recognized.  For example an experimental copy of
+# mpz/powm.c could be included as powm_new.c and would be called
+# mpz_powm_new.  So far only speed measuring is available for these.
+#
+# For the ".S" and ".asm" files, both PIC and non-PIC objects are built.
+# The PIC functions have a "_pic" suffix, for example "mpn_mod_1_k7_mmx_pic".
+# This can be ignored for routines that don't differ for PIC, or for CPUs
+# where everything is PIC anyway.
+#
+# K&R compilers are supported via the same ansi2knr mechanism used by
+# automake, though it's hard to believe anyone will have much interest in
+# measuring a compiler so old that it doesn't even have an ANSI mode.
+#
+# The "-t" option can be used to print a trace of the files found and what's
+# done with them.  A great deal of obscure output is produced, but it can
+# indicate where or why some files aren't being recognised etc.  For
+# example,
+#
+#	cd $builddir/tune
+#	perl $srcdir/tune/many.pl -t $HOME/newcode/add_n_weird.asm
+#
+# In general, when including new code, all that's really necessary is that
+# it will compile or assemble under the current configuration.  It's fine if
+# some code doesn't actually run due to bugs, or to needing a newer CPU or
+# whatever, simply don't ask for the offending routines when invoking
+# speed-many or try-many, or don't try to run them on sizes they don't yet
+# support, or whatever.
+#
+#
+# CPU SPECIFICS
+#
+# x86 - All the x86 code will assemble on any system, but code for newer
+#       chips might not run on older chips.  Expect SIGILLs from new
+#       instructions on old chips.
+#
+#       A few "new" instructions, like cmov for instance, are done as macros
+#       and will generate some equivalent plain i386 code when HAVE_HOST_CPU
+#       in config.m4 indicates an old CPU.  It won't run fast, but it does
+#       make it possible to test correctness.
+#
+#
+# INTERNALS
+#
+# The nonsense involving $ENV is some hooks used during development to add
+# additional functions temporarily.
+#
+#
+# FUTURE
+#
+# Maybe the C files should be compiled pic and non-pic too.  Wait until
+# there's a difference that might be of interest.
+#
+# Warn if a file provides no functions.
+#
+# Allow mpz and mpn files of the same name.  Currently the mpn fib2_ui
+# matching hides the mpz version of that.  Will need to check the file
+# contents to see which it is.  Would be worth allowing an "mpz_" or "mpn_"
+# prefix on the filenames to have working versions of both in one directory.
+#
+#
+# LIMITATIONS
+#
+# Some of the command lines can become very long when a lot of files are
+# included.  If this is a problem on a given system the only suggestion is
+# to run many.pl for just those that are actually wanted at a particular
+# time.
+#
+# DOS 8.3 or SysV 14 char filesystems won't work, since the long filenames
+# generated will almost certainly fail to be unique.
+
+
+use strict;
+use File::Basename;
+use Getopt::Std;
+
+my %opt;
+getopts('t', \%opt);
+
+my @DIRECTORIES = @ARGV;
+if (defined $ENV{directories}) { push @DIRECTORIES, @{$ENV{directories}} }
+
+
+# regexp - matched against the start of the filename.  If a grouping "(...)"
+#          is present then only the first such part is used.
+#
+# mulfunc - filenames to be generated from a multi-function file.
+#
+# funs - functions provided by the file, defaulting to the filename with mpn
+#          (or mpX).
+#
+# mpX - prefix like "mpz", defaulting to "mpn".
+#
+# ret - return value type.
+#
+# args, args_<fun> - arguments for the given function.  If an args_<fun> is
+#          set then it's used, otherwise plain args is used.  "mp_limb_t
+#          carry" is appended for carry-in variants.
+#
+# try - try.c TYPE_ to use, defaulting to TYPE_fun with the function name
+#          in upper case.  "C" is appended for carry-in variants.  Can be
+#          'none' for no try program entry.
+#
+# speed - SPEED_ROUTINE_ to use, handled like "try".
+#
+# speed_flags - SPEED_ROUTINE_ to use, handled like "try".
+
+
+my @table =
+    (
+     {
+       'regexp'=> 'add_n|sub_n|addlsh1_n|sublsh1_n|rsh1add_n|rsh1sub_n',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size',
+       'speed' => 'SPEED_ROUTINE_MPN_BINARY_N',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+     },
+     {
+       'regexp'=> 'aors_n',
+       'mulfunc'=> ['add_n','sub_n'],
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size',
+       'speed' => 'SPEED_ROUTINE_MPN_BINARY_N',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+     },
+
+     {
+       'regexp'=> 'addmul_1|submul_1',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_limb_t mult',
+       'speed' => 'SPEED_ROUTINE_MPN_UNARY_1',
+       'speed_flags'=> 'FLAG_R',
+     },
+     {
+       'regexp'=> 'aorsmul_1',
+       'mulfunc'=> ['addmul_1','submul_1'],
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_limb_t mult',
+       'speed' => 'SPEED_ROUTINE_MPN_UNARY_1',
+       'speed_flags'=> 'FLAG_R',
+     },
+
+     {
+       'regexp'=> 'addmul_2|submul_2',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp',
+       'speed' => 'SPEED_ROUTINE_MPN_UNARY_2',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+       'try-minsize' => 2,
+     },
+     {
+       'regexp'=> 'addmul_3|submul_3',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp',
+       'speed' => 'SPEED_ROUTINE_MPN_UNARY_3',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+       'try-minsize' => 3,
+     },
+     {
+       'regexp'=> 'addmul_4|submul_4',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp',
+       'speed' => 'SPEED_ROUTINE_MPN_UNARY_4',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+       'try-minsize' => 4,
+     },
+     {
+       'regexp'=> 'addmul_5|submul_5',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp',
+       'speed' => 'SPEED_ROUTINE_MPN_UNARY_5',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+       'try-minsize' => 5,
+     },
+     {
+       'regexp'=> 'addmul_6|submul_6',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp',
+       'speed' => 'SPEED_ROUTINE_MPN_UNARY_6',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+       'try-minsize' => 6,
+     },
+     {
+       'regexp'=> 'addmul_7|submul_7',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp',
+       'speed' => 'SPEED_ROUTINE_MPN_UNARY_7',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+       'try-minsize' => 7,
+     },
+     {
+       'regexp'=> 'addmul_8|submul_8',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp',
+       'speed' => 'SPEED_ROUTINE_MPN_UNARY_8',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+       'try-minsize' => 8,
+     },
+
+     {
+       'regexp'=> 'add_n_sub_n',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr sum, mp_ptr diff, mp_srcptr xp, mp_srcptr yp, mp_size_t size',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+     },
+
+     {
+       'regexp'=> 'com|copyi|copyd',
+       'ret'   => 'void',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size',
+       'speed' => 'SPEED_ROUTINE_MPN_COPY',
+     },
+
+     {
+       'regexp'=> 'dive_1',
+       'funs'  => ['divexact_1'],
+       'ret'   => 'void',
+       'args'  => 'mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor',
+       'speed_flags'=> 'FLAG_R',
+     },
+     {
+       'regexp'=> 'diveby3',
+       'funs'  => ['divexact_by3c'],
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr dst, mp_srcptr src, mp_size_t size',
+       'carrys'=> [''],
+       'speed' => 'SPEED_ROUTINE_MPN_COPY',
+     },
+
+     # mpn_preinv_divrem_1 is an optional extra entrypoint
+     {
+       'regexp'=> 'divrem_1',
+       'funs'  => ['divrem_1', 'preinv_divrem_1'],
+       'ret'   => 'mp_limb_t',
+       'args_divrem_1' => 'mp_ptr rp, mp_size_t xsize, mp_srcptr sp, mp_size_t size, mp_limb_t divisor',
+       'args_preinv_divrem_1' => 'mp_ptr rp, mp_size_t xsize, mp_srcptr sp, mp_size_t size, mp_limb_t divisor, mp_limb_t inverse, unsigned shift',
+       'speed_flags'=> 'FLAG_R',
+       'speed_suffixes' => ['f'],
+     },
+     {
+       'regexp'=> 'pre_divrem_1',
+       'funs'  => ['preinv_divrem_1'],
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr qp, mp_size_t qxn, mp_srcptr ap, mp_size_t asize, mp_limb_t divisor, mp_limb_t inverse, int shift',
+       'speed_flags' => 'FLAG_R',
+     },
+
+     {
+       'regexp'=> 'divrem_2',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr qp, mp_size_t qxn, mp_srcptr np, mp_size_t nsize, mp_srcptr dp',
+       'try'   => 'none',
+     },
+
+     {
+       'regexp'=> 'sb_divrem_mn',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr qp, mp_ptr np, mp_size_t nsize, mp_srcptr dp, mp_size_t dsize',
+       'speed' => 'SPEED_ROUTINE_MPN_DC_DIVREM_SB',
+       'try-minsize' => 3,
+     },
+     {
+       'regexp'=> 'tdiv_qr',
+       'ret'   => 'void',
+       'args'  => 'mp_ptr qp, mp_size_t qxn, mp_ptr np, mp_size_t nsize, mp_srcptr dp, mp_size_t dsize',
+       'speed' => 'none',
+     },
+
+     {
+       'regexp'=> 'get_str',
+       'ret'   => 'size_t',
+       'args'  => 'unsigned char *str, int base, mp_ptr mptr, mp_size_t msize',
+       'speed_flags' => 'FLAG_R_OPTIONAL',
+       'try'   => 'none',
+     },
+     {
+       'regexp'=> 'set_str',
+       'ret'   => 'mp_size_t',
+       'args'  => 'mp_ptr xp, const unsigned char *str, size_t str_len, int base',
+       'speed_flags' => 'FLAG_R_OPTIONAL',
+       'try'   => 'none',
+     },
+
+     {
+       'regexp'=> 'fac_ui',
+       'mpX'   => 'mpz',
+       'ret'   => 'void',
+       'args'  => 'mpz_ptr r, unsigned long n',
+       'speed_flags' => 'FLAG_NODATA',
+       'try'   => 'none',
+     },
+
+     {
+       'regexp'=> 'fib2_ui',
+       'ret'   => 'void',
+       'args'  => 'mp_ptr fp, mp_ptr f1p, unsigned long n',
+       'rename'=> ['__gmp_fib_table'],
+       'speed_flags' => 'FLAG_NODATA',
+       'try'   => 'none',
+     },
+     {
+       'regexp'=> 'fib_ui',
+       'mpX'   => 'mpz',
+       'ret'   => 'void',
+       'args'  => 'mpz_ptr fn, unsigned long n',
+       'speed_flags' => 'FLAG_NODATA',
+       'try'   => 'none',
+     },
+     {
+       'regexp'=> 'fib2_ui',
+       'mpX'   => 'mpz',
+       'ret'   => 'void',
+       'args'  => 'mpz_ptr fn, mpz_ptr fnsub1, unsigned long n',
+       'speed_flags' => 'FLAG_NODATA',
+       'try'   => 'none',
+     },
+
+     {
+       'regexp'=> 'lucnum_ui',
+       'mpX'   => 'mpz',
+       'ret'   => 'void',
+       'args'  => 'mpz_ptr ln, unsigned long n',
+       'speed_flags' => 'FLAG_NODATA',
+       'try'   => 'none',
+     },
+     {
+       'regexp'=> 'lucnum2_ui',
+       'mpX'   => 'mpz',
+       'ret'   => 'void',
+       'args'  => 'mpz_ptr ln, mpz_ptr lnsub1, unsigned long n',
+       'speed_flags' => 'FLAG_NODATA',
+       'try'   => 'none',
+     },
+
+     {
+       'regexp'=> 'gcd_1',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr xp, mp_size_t xsize, mp_limb_t y',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+       'speed_suffixes' => ['N'],
+     },
+     {
+       'regexp'=> '(gcd)(?!(_1|ext|_finda))',
+       'ret'   => 'mp_size_t',
+       'args'  => 'mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize',
+     },
+     {
+       'regexp'=> 'gcd_finda',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_srcptr cp',
+     },
+
+
+     {
+       'regexp'=> 'jacobi',
+       'funs'  => ['jacobi', 'legendre', 'kronecker'],
+       'mpX'   => 'mpz',
+       'ret'   => 'int',
+       'args'  => 'mpz_srcptr a, mpz_srcptr b',
+       'try-legendre' => 'TYPE_MPZ_JACOBI',
+     },
+     {
+       'regexp'=> 'jacbase',
+       'funs'  => ['jacobi_base'],
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_limb_t a, mp_limb_t b, int bit1',
+       'speed' => 'SPEED_ROUTINE_MPN_JACBASE',
+       'try'   => 'none',
+     },
+
+     {
+       'regexp'=> 'logops_n',
+       'mulfunc'=> ['and_n','andn_n','nand_n','ior_n','iorn_n','nior_n','xor_n','xnor_n'],
+       'ret'   => 'void',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size',
+       'speed' => 'SPEED_ROUTINE_MPN_BINARY_N',
+     },
+
+     {
+       'regexp'=> '[lr]shift',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, unsigned shift',
+       'speed' => 'SPEED_ROUTINE_MPN_UNARY_1',
+       'speed_flags'=> 'FLAG_R',
+     },
+
+     # mpn_preinv_mod_1 is an optional extra entrypoint
+     {
+       'regexp'=> '(mod_1)(?!_rs)',
+       'funs'  => ['mod_1','preinv_mod_1'],
+       'ret'   => 'mp_limb_t',
+       'args_mod_1'       => 'mp_srcptr xp, mp_size_t size, mp_limb_t divisor',
+       'args_preinv_mod_1'=> 'mp_srcptr xp, mp_size_t size, mp_limb_t divisor, mp_limb_t inverse',
+       'speed_flags'=> 'FLAG_R',
+     },
+     {
+       'regexp'=> 'pre_mod_1',
+       'funs'  => ['preinv_mod_1'],
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_srcptr xp, mp_size_t size, mp_limb_t divisor, mp_limb_t inverse',
+       'speed_flags'=> 'FLAG_R',
+     },
+     {
+       'regexp'=> 'mod_34lsub1',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_srcptr src, mp_size_t len',
+     },
+     {
+       'regexp'=> 'invert_limb',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_limb_t divisor',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+       'try'   => 'none',
+     },
+
+     {
+       # not for use with hppa reversed argument versions of mpn_umul_ppmm
+       'regexp'=> 'udiv',
+       'funs'  => ['udiv_qrnnd','udiv_qrnnd_r'],
+       'ret'   => 'mp_limb_t',
+       'args_udiv_qrnnd'   => 'mp_limb_t *, mp_limb_t, mp_limb_t, mp_limb_t',
+       'args_udiv_qrnnd_r' => 'mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t *',
+       'speed' => 'none',
+       'try-minsize' => 2,
+     },
+
+     {
+       'regexp'=> 'mode1o',
+       'funs'  => ['modexact_1_odd'],
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_srcptr src, mp_size_t size, mp_limb_t divisor',
+       'speed_flags'=> 'FLAG_R',
+     },
+     {
+       'regexp'=> 'modlinv',
+       'funs'  => ['modlimb_invert'],
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_limb_t v',
+       'carrys'=> [''],
+       'try'   => 'none',
+     },
+
+     {
+       'regexp'=> 'mul_1',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_limb_t mult',
+       'speed' => 'SPEED_ROUTINE_MPN_UNARY_1',
+       'speed_flags'=> 'FLAG_R',
+     },
+     {
+       'regexp'=> 'mul_2',
+       'ret'   => 'mp_limb_t',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr mult',
+       'speed' => 'SPEED_ROUTINE_MPN_UNARY_2',
+       'speed_flags'=> 'FLAG_R',
+     },
+
+     {
+       'regexp'=> 'mul_basecase',
+       'ret'   => 'void',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t xsize, mp_srcptr yp, mp_size_t ysize',
+       'speed_flags' => 'FLAG_R_OPTIONAL | FLAG_RSIZE',
+     },
+     {
+       'regexp'=> '(mul_n)[_.]',
+       'ret'   => 'void',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size',
+       'rename'=> ['kara_mul_n','kara_sqr_n','toom3_mul_n','toom3_sqr_n'],
+     },
+     {
+       'regexp'=> 'umul',
+       'funs'  => ['umul_ppmm','umul_ppmm_r'],
+       'ret'   => 'mp_limb_t',
+       'args_umul_ppmm'   => 'mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2',
+       'args_umul_ppmm_r' => 'mp_limb_t m1, mp_limb_t m2, mp_limb_t *lowptr',
+       'speed' => 'none',
+       'try-minsize' => 3,
+     },
+
+
+     {
+       'regexp'=> 'popham',
+       'mulfunc'=> ['popcount','hamdist'],
+       'ret'   => 'unsigned long',
+       'args_popcount'=> 'mp_srcptr xp, mp_size_t size',
+       'args_hamdist' => 'mp_srcptr xp, mp_srcptr yp, mp_size_t size',
+     },
+     {
+       'regexp'=> 'popcount',
+       'ret'   => 'unsigned long',
+       'args'  => 'mp_srcptr xp, mp_size_t size',
+     },
+     {
+       'regexp'=> 'hamdist',
+       'ret'   => 'unsigned long',
+       'args'  => 'mp_srcptr xp, mp_srcptr yp, mp_size_t size',
+       # extra renaming to support sharing a data table with mpn_popcount
+       'rename'=> ['popcount'],
+     },
+
+     {
+       'regexp'=> 'sqr_basecase',
+       'ret'   => 'void',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size',
+       'speed' => 'SPEED_ROUTINE_MPN_SQR',
+       'try'   => 'TYPE_SQR',
+     },
+     {
+       'regexp'=> 'sqr_diagonal',
+       'ret'   => 'void',
+       'args'  => 'mp_ptr wp, mp_srcptr xp, mp_size_t size',
+       'try'   => 'none',
+     },
+
+     {
+       'regexp'=> 'sqrtrem',
+       'ret'   => 'mp_size_t',
+       'args'  => 'mp_ptr root, mp_ptr rem, mp_srcptr src, mp_size_t size',
+       'try'   => 'none',
+     },
+
+     {
+       'regexp'=> 'cntlz',
+       'funs'  => ['count_leading_zeros'],
+       'ret'   => 'unsigned',
+       'args'  => 'mp_limb_t',
+       'macro-before' => "#undef COUNT_LEADING_ZEROS_0",
+       'macro-speed'  =>
+'#ifdef COUNT_LEADING_ZEROS_0
+#define COUNT_LEADING_ZEROS_0_ALLOWED   1
+#else
+#define COUNT_LEADING_ZEROS_0_ALLOWED   0
+#endif
+  SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED);
+  $fun (c, n);
+  SPEED_ROUTINE_COUNT_ZEROS_B ()',
+       'speed_flags'=> 'FLAG_R_OPTIONAL',
+       'try'   => 'none',
+     },
+     {
+       'regexp'=> 'cnttz',
+       'funs'  => ['count_trailing_zeros'],
+       'ret'   => 'unsigned',
+       'args'  => 'mp_limb_t',
+       'macro-speed' => '
+  SPEED_ROUTINE_COUNT_ZEROS_A (0, 0);
+  $fun (c, n);
+  SPEED_ROUTINE_COUNT_ZEROS_B ()',
+       'speed_flags' => 'FLAG_R_OPTIONAL',
+       'try'   => 'none',
+     },
+
+     {
+       'regexp'=> 'zero',
+       'ret'   => 'void',
+       'args'  => 'mp_ptr ptr, mp_size_t size',
+     },
+
+     {
+       'regexp'=> '(powm)(?!_ui)',
+       'mpX'   => 'mpz',
+       'ret'   => 'void',
+       'args'  => 'mpz_ptr r, mpz_srcptr b, mpz_srcptr e, mpz_srcptr m',
+       'try'   => 'none',
+     },
+     {
+       'regexp'=> 'powm_ui',
+       'mpX'   => 'mpz',
+       'ret'   => 'void',
+       'args'  => 'mpz_ptr r, mpz_srcptr b, unsigned long e, mpz_srcptr m',
+       'try'   => 'none',
+     },
+
+     # special for use during development
+     {
+       'regexp'=> 'back',
+       'funs'  => ['back_to_back'],
+       'ret'   => 'void',
+       'args'  => 'void',
+       'pic'   => 'no',
+       'try'   => 'none',
+       'speed_flags'=> 'FLAG_NODATA',
+     },
+     );
+
+if (defined $ENV{table2}) {
+  my @newtable = @{$ENV{table2}};
+  push @newtable, @table;
+  @table = @newtable;
+}
+
+
+my %pictable =
+    (
+     'yes' => {
+       'suffix' =>  '_pic',
+       'asmflags'=> '$(ASMFLAGS_PIC)',
+       'cflags' =>  '$(CFLAGS_PIC)',
+     },
+     'no' => {
+       'suffix' =>  '',
+       'asmflags'=> '',
+       'cflags' =>  '',
+     },
+     );
+
+
+my $builddir = $ENV{builddir};
+$builddir = "." if (! defined $builddir);
+
+my $top_builddir = "${builddir}/..";
+
+
+open(MAKEFILE, "<${builddir}/Makefile")
+  or die "Cannot open ${builddir}/Makefile: $!\n"
+       . "Is this a tune build directory?";
+my ($srcdir, $top_srcdir);
+while (<MAKEFILE>) {
+  if (/^srcdir = (.*)/) {     $srcdir = $1;     }
+  if (/^top_srcdir = (.*)/) { $top_srcdir = $1; }
+}
+die "Cannot find \$srcdir in Makefile\n" if (! defined $srcdir);
+die "Cannot find \$top_srcdir in Makefile\n" if (! defined $top_srcdir);
+print "srcdir $srcdir\n" if $opt{'t'};
+print "top_srcdir $top_srcdir\n" if $opt{'t'};
+close(MAKEFILE);
+
+
+open(SPEED, ">speed-many.c") or die;
+print SPEED
+"/* speed-many.c generated by many.pl - DO NOT EDIT, CHANGES WILL BE LOST */
+
+";
+my $SPEED_EXTRA_ROUTINES = "#define SPEED_EXTRA_ROUTINES \\\n";
+my $SPEED_EXTRA_PROTOS = "#define SPEED_EXTRA_PROTOS \\\n";
+my $SPEED_CODE = "";
+
+open(TRY, ">try-many.c") or die;
+print TRY
+    "/* try-many.c generated by many.pl - DO NOT EDIT, CHANGES WILL BE LOST */\n" .
+    "\n";
+my $TRY_EXTRA_ROUTINES = "#define EXTRA_ROUTINES \\\n";
+my $TRY_EXTRA_PROTOS = "#define EXTRA_PROTOS \\\n";
+
+open(FD,"<${top_builddir}/libtool") or die "Cannot open \"${top_builddir}/libtool\": $!\n";
+my $pic_flag;
+while (<FD>) {
+  if (/^pic_flag="?([^"]*)"?$/) {
+    $pic_flag=$1;
+    last;
+  }
+}
+close FD;
+if (! defined $pic_flag) {
+  die "Cannot find pic_flag in ${top_builddir}/libtool";
+}
+
+my $CFLAGS_PIC = $pic_flag;
+
+my $ASMFLAGS_PIC = "";
+foreach (split /[ \t]/, $pic_flag) {
+  if (/^-D/) {
+    $ASMFLAGS_PIC .= " " . $_;
+  }
+}
+
+open(MAKEFILE, ">Makefile.many") or die;
+print MAKEFILE
+    "# Makefile.many generated by many.pl - DO NOT EDIT, CHANGES WILL BE LOST\n" .
+    "\n" .
+    "all: speed-many try-many\n" .
+    "\n" .
+    "#--------- begin included copy of basic Makefile ----------\n" .
+    "\n";
+open(FD,"<${builddir}/Makefile") or die "Cannot open \"${builddir}/Makefile\": $!\n";
+print MAKEFILE <FD>;
+close FD;
+print MAKEFILE
+    "\n" .
+    "#--------- end included copy of basic Makefile ----------\n" .
+    "\n" .
+    "CFLAGS_PIC = $CFLAGS_PIC\n" .
+    "ASMFLAGS_PIC = $ASMFLAGS_PIC\n" .
+    "\n";
+
+my $CLEAN="";
+my $MANY_OBJS="";
+
+
+sub print_ansi2knr {
+  my ($base,$file,$includes) = @_;
+  if (! defined $file)     { $file = "$base.c"; }
+  if (! defined $includes) { $includes = ""; }
+
+  print MAKEFILE <<EOF;
+${base}_.c: $file \$(ANSI2KNR)
+	\$(CPP) \$(DEFS) \$(INCLUDES) $includes \$(AM_CPPFLAGS) \$(CPPFLAGS) $file | sed 's/^# \([0-9]\)/#line \\1/' | \$(ANSI2KNR) >${base}_.c
+
+EOF
+}
+
+
+# Spawning a glob is a touch slow when there's lots of files.
+my @files = ();
+foreach my $dir (@DIRECTORIES) {
+  print "dir $dir\n" if $opt{'t'};
+  if (-f $dir) {
+    push @files,$dir;
+  } else {
+    if (! opendir DD,$dir) {
+      print "Cannot open $dir: $!\n";
+    } else {
+      push @files, map {$_="$dir/$_"} grep /\.(c|asm|S|h)$/, readdir DD;
+      closedir DD;
+    }
+  }
+}
+@files = sort @files;
+print "@files ",join(" ",@files),"\n" if $opt{'t'};
+
+my $count_files = 0;
+my $count_functions = 0;
+my %seen_obj;
+my %seen_file;
+
+foreach my $file_full (@files) {
+  if (! -f $file_full) {
+    print "Not a file: $file_full\n";
+    next;
+  }
+  if (defined $seen_file{$file_full}) {
+    print "Skipping duplicate file: $file_full\n";
+    next;
+  }
+  $seen_file{$file_full} = 1;
+
+  my ($FILE,$path,$lang) = fileparse($file_full,"\.[a-zA-Z]+");
+  $path =~ s/\/$//;
+  print "file $FILE path $path lang $lang\n" if $opt{'t'};
+
+  my @pic_choices;
+  if ($lang eq '.asm')  { @pic_choices=('no','yes'); }
+  elsif ($lang eq '.c') { @pic_choices=('no'); }
+  elsif ($lang eq '.S') { @pic_choices=('no','yes'); }
+  elsif ($lang eq '.h') { @pic_choices=('no'); }
+  else { next };
+
+  my ($t, $file_match);
+  foreach my $p (@table) {
+    # print " ",$p->{'regexp'},"\n" if $opt{'t'};
+    if ($FILE =~ "^($p->{'regexp'})") {
+      $t = $p;
+      $file_match = $1;
+      $file_match = $2 if defined $2;
+      last;
+    }
+  }
+  next if ! defined $t;
+  print "match $t->{'regexp'} $FILE ($file_full)\n" if $opt{'t'};
+
+  if (! open FD,"<$file_full") { print "Can't open $file_full: $!\n"; next }
+  my @file_contents = <FD>;
+  close FD;
+
+  my $objs;
+  if (defined $t->{'mulfunc'}) { $objs = $t->{'mulfunc'}; }
+  else                         { $objs = [$file_match]; }
+  print "objs @$objs\n" if $opt{'t'};
+
+  my $ret = $t->{'ret'};
+  if (! defined $ret && $lang eq '.h') { $ret = ''; }
+  if (! defined $ret) { die "$FILE return type not defined\n" };
+  print "ret $ret\n" if $opt{'t'};
+
+  my $mpX = $t->{'mpX'};
+  if (! defined $mpX) { $mpX = ($lang eq '.h' ? '' : 'mpn'); }
+  $mpX = "${mpX}_" if $mpX ne '';
+  print "mpX $mpX\n" if $opt{'t'};
+
+  my $carrys;
+  if (defined $t->{'carrys'}) { $carrys = $t->{'carrys'}; }
+  else                        { $carrys = ['','c'];       }
+  print "carrys $carrys @$carrys\n" if $opt{'t'};
+
+  # some restriction functions are implemented, but they're not very useful
+  my $restriction='';
+
+  my $suffix;
+  if ($FILE =~ ("${file_match}_(.+)")) {
+    $suffix = $1;
+  } elsif ($path =~ /\/mp[zn]\/(.*)$/) {
+    # derive the suffix from the path
+    $suffix = $1;
+    $suffix =~ s/\//_/g;
+    # use last directory name, or if there's 3 or more then the last two
+    if ($suffix =~ /([^_]*_)+([^_]+_[^_]+)$/) {
+      $suffix = $2;
+    } elsif ($suffix =~ /([^_]*_)*([^_]+)$/) {
+      $suffix = $2;
+    }
+  } else {
+    die "Can't determine suffix for: $file_full (path $path)\n";
+  }
+  print "suffix $suffix\n" if $opt{'t'};
+
+  $count_files++;
+
+  foreach my $obj (@{$objs}) {
+    print "obj $obj\n" if $opt{'t'};
+
+    my $obj_with_suffix = "${obj}_$suffix";
+    if (defined $seen_obj{$obj_with_suffix}) {
+      print "Skipping duplicate object: $obj_with_suffix\n";
+      print "   first from: $seen_obj{$obj_with_suffix}\n";
+      print "   now from:   $file_full\n";
+      next;
+    }
+    $seen_obj{$obj_with_suffix} = $file_full;
+
+    my $funs = $t->{'funs'};
+    $funs = [$obj] if ! defined $funs;
+    print "funs @$funs\n" if $opt{'t'};
+
+    if (defined $t->{'pic'}) { @pic_choices = ('no'); }
+
+    foreach my $pic (map {$pictable{$_}} @pic_choices) {
+      print "pic $pic->{'suffix'}\n" if $opt{'t'};
+
+      my $objbase = "${obj}_$suffix$pic->{'suffix'}";
+      print "objbase $objbase\n" if $opt{'t'};
+
+      if ($path !~ "." && -f "${objbase}.c") {
+	die "Already have ${objbase}.c";
+      }
+
+      my $tmp_file = "tmp-$objbase.c";
+
+      my $renaming;
+      foreach my $fun (@{$funs}) {
+        if ($mpX eq 'mpn_' && $lang eq '.c') {
+          $renaming .= "\t\t-DHAVE_NATIVE_mpn_$fun=1 \\\n";
+        }
+
+        # The carry-in variant is with a "c" appended, unless there's a "_1"
+        # somewhere, eg. "modexact_1_odd", in which case that becomes "_1c".
+	my $fun_carry = $fun;
+	if (! ($fun_carry =~ s/_1/_1c/)) { $fun_carry = "${fun}c"; }
+
+	$renaming .=
+	    "\t\t-D__g$mpX$fun=$mpX${fun}_$suffix$pic->{'suffix'} \\\n" .
+	    "\t\t-D__g$mpX$fun_carry=$mpX${fun_carry}_$suffix$pic->{'suffix'} \\\n";
+      }
+      foreach my $r (@{$t->{'rename'}}) {
+	if ($r =~ /^__gmp/) {
+	  $renaming .= "\\\n" .
+	      "\t\t-D$r=${r}_$suffix$pic->{'suffix'}";
+	} else {
+	  $renaming .= "\\\n" .
+	      "\t\t-D__g$mpX$r=$mpX${r}_$suffix$pic->{'suffix'}";
+	}
+      }
+      print "renaming $renaming\n" if $opt{'t'};
+
+      print MAKEFILE "\n";
+      if ($lang eq '.asm') {
+	print MAKEFILE
+	    "$objbase.o: $file_full \$(ASM_HEADERS)\n" .
+	    "	\$(M4) \$(M4FLAGS) -DOPERATION_$obj $pic->{'asmflags'} \\\n" .
+  	    "$renaming" .
+	    "		$file_full >tmp-$objbase.s\n" .
+            "	\$(CCAS) \$(COMPILE_FLAGS) $pic->{'cflags'} tmp-$objbase.s -o $objbase.o\n" .
+            "	\$(RM_TMP) tmp-$objbase.s\n";
+	$MANY_OBJS .= " $objbase.o";
+
+      } elsif ($lang eq '.c') {
+	print MAKEFILE
+	    "$objbase.o: $file_full\n" .
+	    "	\$(COMPILE) -DOPERATION_$obj $pic->{'cflags'} \\\n" .
+  	    "$renaming" .
+	    "		-c $file_full -o $objbase.o\n";
+	print_ansi2knr($objbase,
+		       $file_full,
+		       " -DOPERATION_$obj\\\n$renaming\t\t");
+	$MANY_OBJS .= " $objbase\$U.o";
+
+      } elsif ($lang eq '.S') {
+	print MAKEFILE
+	    "$objbase.o: $file_full\n" .
+            "	\$(COMPILE) -g $pic->{'asmflags'} \\\n" .
+  	    "$renaming" .
+            "	-c $file_full -o $objbase.o\n";
+	$MANY_OBJS .= " $objbase.o";
+
+      } elsif ($lang eq '.h') {
+	print MAKEFILE
+	    "$objbase.o: tmp-$objbase.c $file_full\n" .
+	    "	\$(COMPILE) -DOPERATION_$obj $pic->{'cflags'} \\\n" .
+  	    "$renaming" .
+	    "		-c tmp-$objbase.c -o $objbase.o\n";
+	print_ansi2knr($objbase,
+		       "tmp-$objbase.c",
+		       " -DOPERATION_$obj\\\n$renaming\t\t");
+	$MANY_OBJS .= " $objbase\$U.o";
+
+        $CLEAN .= " tmp-$objbase.c";
+	open(TMP_C,">tmp-$objbase.c")
+	    or die "Can't create tmp-$objbase.c: $!\n";
+	print TMP_C
+"/* tmp-$objbase.c generated by many.pl - DO NOT EDIT, CHANGES WILL BE LOST */
+
+#include \"gmp.h\"
+#include \"gmp-impl.h\"
+#include \"longlong.h\"
+#include \"speed.h\"
+
+";
+      }
+
+      my $tests_program = "$top_srcdir/tests/devel/$obj.c";
+      if (-f $tests_program) {
+	$tests_program = "\$(top_srcdir)/tests/devel/$obj.c";
+	print_ansi2knr("tests_${objbase}",
+		       $tests_program,
+		       "\\\n$renaming\t\t\$(CFLAGS_TESTS_SP)");
+	print_ansi2knr("tests_${objbase}_sp",
+		       $tests_program,
+		       "\\\n$renaming\t\t\$(CFLAGS_TESTS_SP)");
+
+	print MAKEFILE <<EOF;
+tests_$objbase.o: $tests_program
+	\$(COMPILE) \$(CFLAGS_TESTS) \\
+$renaming		-c $tests_program -o tests_$objbase.o
+
+tests_$objbase: $objbase\$U.o tests_$objbase\$U.o ../libgmp.la
+	\$(LINK) tests_$objbase\$U.o $objbase\$U.o ../libgmp.la -o tests_$objbase
+
+tests_${objbase}_sp.o: $tests_program
+	\$(COMPILE) \$(CFLAGS_TESTS_SP) \\
+$renaming		-c $tests_program -o tests_${objbase}_sp.o
+
+tests_${objbase}_sp: $objbase\$U.o tests_${objbase}_sp\$U.o ../libgmp.la
+	\$(LINK) tests_${objbase}_sp\$U.o $objbase\$U.o ../libgmp.la -o tests_${objbase}_sp
+
+EOF
+        $CLEAN .= " tests_$objbase tests_${objbase}_sp";
+      }
+
+      foreach my $fun (@{$funs}) {
+	print "fun $fun\n" if $opt{'t'};
+
+	if ($lang eq '.h') {
+          my $macro_before = $t->{'macro_before'};
+          $macro_before = "" if ! defined $macro_before;
+	  print TMP_C
+"$macro_before
+#undef $fun
+#include \"$file_full\"
+
+";
+	}
+
+	my $args = $t->{"args_$fun"};
+	if (! defined $args) { $args = $t->{'args'}; }
+	if (! defined $args) { die "Need args for $fun\n"; }
+	print "args $args\n" if $opt{'t'};
+
+	foreach my $carry (@$carrys) {
+	  print "carry $carry\n" if $opt{'t'};
+
+	  my $fun_carry = $fun;
+	  if (! ($fun_carry =~ s/_1/_1$carry/)) { $fun_carry = "$fun$carry"; }
+          print "fun_carry $fun_carry\n" if $opt{'t'};
+
+	  if ($lang =~ /\.(asm|S)/
+	      && ! grep(m"PROLOGUE\((.* )?$mpX$fun_carry[ ,)]",@file_contents)) {
+	    print "no PROLOGUE $mpX$fun_carry\n" if $opt{'t'};
+	    next;
+	  }
+	  if ($lang eq '.c'
+	      && ! grep(m"^(#define FUNCTION\s+)?$mpX$fun_carry\W", @file_contents)) {
+	    print "no mention of $mpX$fun_carry\n" if $opt{'t'};
+	    next;
+	  }
+	  if ($lang eq '.h'
+	      && ! grep(m"^#define $fun_carry\W", @file_contents)) {
+	    print "no mention of #define $fun_carry\n" if $opt{'t'};
+	    next;
+	  }
+
+	  $count_functions++;
+
+	  my $carryarg;
+	  if (defined $t->{'carryarg'}) { $carryarg = $t->{'carryarg'}; }
+	  if ($carry eq '')             { $carryarg = ''; }
+	  else                          { $carryarg = ', mp_limb_t carry'; }
+	  print "carryarg $carryarg\n" if $opt{'t'};
+
+	  my $funfull="$mpX${fun_carry}_$suffix$pic->{'suffix'}";
+	  print "funfull $funfull\n" if $opt{'t'};
+
+	  if ($lang ne '.h') {
+	    my $proto = "$t->{'ret'} $funfull _PROTO (($args$carryarg)); \\\n";
+	    $SPEED_EXTRA_PROTOS .= $proto;
+	    $TRY_EXTRA_PROTOS .= $proto;
+	  }
+
+	  my $try_type = $t->{"try-$fun"};
+	  $try_type = $t->{'try'} if ! defined $try_type;
+	  if (! defined $try_type) {
+	    if ($mpX eq 'mpn_') {
+	      $try_type = "TYPE_\U$fun_carry";
+	    } else {
+	      $try_type = "TYPE_\U$mpX\U$fun_carry";
+	    }
+	  }
+	  print "try_type $try_type\n" if $opt{'t'};
+
+	  my $try_minsize = $t->{'try-minsize'};
+	  if (defined $try_minsize) {
+	    $try_minsize = ", " . $try_minsize;
+	  } else {
+	    $try_minsize = "";
+	  }
+	  print "try_minsize $try_minsize\n" if $opt{'t'};
+
+	  if ($try_type ne 'none') {
+	    $TRY_EXTRA_ROUTINES .=
+		"  { TRY($mpX${fun_carry}_$suffix$pic->{'suffix'}), $try_type$try_minsize }, \\\n";
+	  }
+
+	  my $speed_flags = $t->{'speed_flags'};
+	  $speed_flags = '0' if ! defined $speed_flags;
+	  print "speed_flags $speed_flags\n" if $opt{'t'};
+
+	  my $speed_routine = $t->{'speed'};
+	  $speed_routine = "SPEED_ROUTINE_\U$mpX\U$fun"
+	      if !defined $speed_routine;
+	  if (! ($speed_routine =~ s/_1/_1\U$carry/)) {
+	    $speed_routine = "$speed_routine\U$carry";
+	  }
+	  print "speed_routine $speed_routine\n" if $opt{'t'};
+
+	  my @speed_suffixes = ();
+	  push (@speed_suffixes, '') if $speed_routine ne 'none';
+	  push (@speed_suffixes, @{$t->{'speed_suffixes'}})
+	      if defined $t->{'speed_suffixes'};
+
+          my $macro_speed = $t->{'macro-speed'};
+          $macro_speed = "$speed_routine ($fun_carry)" if ! defined $macro_speed;
+          $macro_speed =~ s/\$fun/$fun_carry/g;
+
+	  foreach my $S (@speed_suffixes) {
+	    my $Sfunfull="$mpX${fun_carry}${S}_$suffix$pic->{'suffix'}";
+
+	    $SPEED_EXTRA_PROTOS .=
+	      "double speed_$Sfunfull _PROTO ((struct speed_params *s)); \\\n";
+	    $SPEED_EXTRA_ROUTINES .=
+	      "  { \"$Sfunfull\", speed_$Sfunfull, $speed_flags }, \\\n";
+	    if ($lang eq '.h') {
+              print TMP_C
+"double
+speed_$Sfunfull (struct speed_params *s)
+{
+$macro_speed
+}
+
+";
+            } else {
+	      $SPEED_CODE .=
+	        "double\n" .
+	        "speed_$Sfunfull (struct speed_params *s)\n" .
+                "{\n" .
+                "$restriction" .
+	        "  $speed_routine\U$S\E ($funfull)\n" .
+                "}\n";
+            }
+	  }
+	}
+      }
+    }
+  }
+}
+
+
+print SPEED $SPEED_EXTRA_PROTOS . "\n";
+print SPEED $SPEED_EXTRA_ROUTINES . "\n";
+if (defined $ENV{speedinc}) { print SPEED $ENV{speedinc} . "\n"; }
+print SPEED
+    "#include \"speed.c\"\n" .
+    "\n";
+print SPEED $SPEED_CODE;
+
+print TRY $TRY_EXTRA_ROUTINES . "\n";
+print TRY $TRY_EXTRA_PROTOS . "\n";
+my $tryinc = "";
+if (defined $ENV{tryinc}) {
+  $tryinc = $ENV{tryinc};
+  print TRY "#include \"$tryinc\"\n";
+}
+print "tryinc $tryinc\n" if $opt{'t'};
+print TRY
+    "#include \"try.c\"\n" .
+    "\n";
+
+my $extra_libraries = "";
+if (defined $ENV{extra_libraries}) { $extra_libraries = $ENV{extra_libraries};}
+
+my $trydeps = "";
+if (defined $ENV{trydeps}) { $trydeps = $ENV{trydeps}; }
+$trydeps .= " $tryinc";
+print "trydeps $trydeps\n" if $opt{'t'};
+
+print MAKEFILE <<EOF;
+
+MANY_OBJS = $MANY_OBJS
+MANY_CLEAN = \$(MANY_OBJS) \\
+	speed-many.c speed-many\$U.o speed-many\$(EXEEXT) \\
+	try-many.c try-many\$U.o try-many \\
+	$CLEAN
+MANY_DISTCLEAN = Makefile.many
+
+speed-many: \$(MANY_OBJS) speed-many\$U.o libspeed.la $extra_libraries
+	\$(LINK) \$(LDFLAGS) speed-many\$U.o \$(MANY_OBJS) \$(LDADD) \$(LIBS) $extra_libraries
+
+try-many: \$(MANY_OBJS) try-many\$U.o libspeed.la $extra_libraries
+	\$(LINK) \$(LDFLAGS) try-many\$U.o \$(MANY_OBJS)  \$(LDADD) \$(LIBS) $extra_libraries
+
+try-many.o: try-many.c \$(top_srcdir)/tests/devel/try.c $trydeps
+	\$(COMPILE) -I\$(top_srcdir)/tests/devel -c try-many.c
+
+EOF
+
+print_ansi2knr("speed-many");
+print_ansi2knr("try-many",
+	       "\$(top_srcdir)/tests/devel/try.c",
+	       "-I\$(top_srcdir)/tests/devel");
+
+print MAKEFILE <<EOF;
+RM_TMP = rm -f
+CFLAGS_TESTS = -DSIZE=50 -DTIMES=1 -DRANDOM -DCLOCK=333000000
+CFLAGS_TESTS_SP = -DSIZE=1024 -DNOCHECK -DOPS=200000000 -DCLOCK=333000000
+EOF
+
+close MAKEFILE or die;
+
+print "Total $count_files files, $count_functions functions\n";
+
+
+
+# Local variables:
+# perl-indent-level: 2
+# End:

diff --git a/third_party/gmp/tune/mod_1_1-1.c b/third_party/gmp/tune/mod_1_1-1.c
new file mode 100644
index 0000000..7de1e42
--- /dev/null
+++ b/third_party/gmp/tune/mod_1_1-1.c

@@ -0,0 +1,40 @@
+/* mpn/generic/mod_1_1.c method 1.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef MOD_1_1P_METHOD
+#define MOD_1_1P_METHOD 1
+#undef mpn_mod_1_1p
+#undef mpn_mod_1_1p_cps
+#define mpn_mod_1_1p mpn_mod_1_1p_1
+#define mpn_mod_1_1p_cps mpn_mod_1_1p_cps_1
+
+#include "mpn/generic/mod_1_1.c"

diff --git a/third_party/gmp/tune/mod_1_1-2.c b/third_party/gmp/tune/mod_1_1-2.c
new file mode 100644
index 0000000..980a64f
--- /dev/null
+++ b/third_party/gmp/tune/mod_1_1-2.c

@@ -0,0 +1,40 @@
+/* mpn/generic/mod_1_1.c method 2.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef MOD_1_1P_METHOD
+#define MOD_1_1P_METHOD 2
+#undef mpn_mod_1_1p
+#undef mpn_mod_1_1p_cps
+#define mpn_mod_1_1p mpn_mod_1_1p_2
+#define mpn_mod_1_1p_cps mpn_mod_1_1p_cps_2
+
+#include "mpn/generic/mod_1_1.c"

diff --git a/third_party/gmp/tune/mod_1_div.c b/third_party/gmp/tune/mod_1_div.c
new file mode 100644
index 0000000..6268d4e
--- /dev/null
+++ b/third_party/gmp/tune/mod_1_div.c

@@ -0,0 +1,45 @@
+/* mpn/generic/mod_1.c forced to use plain udiv_qrnnd.
+
+Copyright 2000, 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define OPERATION_mod_1
+
+#include "gmp-impl.h"
+
+#undef MOD_1_NORM_THRESHOLD
+#undef MOD_1_UNNORM_THRESHOLD
+#undef MOD_1N_TO_MOD_1_1_THRESHOLD
+#undef MOD_1U_TO_MOD_1_1_THRESHOLD
+#define MOD_1_NORM_THRESHOLD    MP_SIZE_T_MAX
+#define MOD_1_UNNORM_THRESHOLD  MP_SIZE_T_MAX
+#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX
+#define MOD_1U_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX
+#define __gmpn_mod_1  mpn_mod_1_div
+
+#include "mpn/generic/mod_1.c"

diff --git a/third_party/gmp/tune/mod_1_inv.c b/third_party/gmp/tune/mod_1_inv.c
new file mode 100644
index 0000000..3b42710
--- /dev/null
+++ b/third_party/gmp/tune/mod_1_inv.c

@@ -0,0 +1,45 @@
+/* mpn/generic/mod_1.c forced to use mul-by-inverse udiv_qrnnd_preinv.
+
+Copyright 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define OPERATION_mod_1
+
+#include "gmp-impl.h"
+
+#undef MOD_1_NORM_THRESHOLD
+#undef MOD_1_UNNORM_THRESHOLD
+#undef MOD_1N_TO_MOD_1_1_THRESHOLD
+#undef MOD_1U_TO_MOD_1_1_THRESHOLD
+#define MOD_1_NORM_THRESHOLD    0
+#define MOD_1_UNNORM_THRESHOLD  0
+#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX
+#define MOD_1U_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX
+#define __gmpn_mod_1  mpn_mod_1_inv
+
+#include "mpn/generic/mod_1.c"

diff --git a/third_party/gmp/tune/modlinv.c b/third_party/gmp/tune/modlinv.c
new file mode 100644
index 0000000..42c583a
--- /dev/null
+++ b/third_party/gmp/tune/modlinv.c

@@ -0,0 +1,177 @@
+/* Alternate implementations of binvert_limb to compare speeds. */
+
+/*
+Copyright 2000, 2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdio.h>
+#include "gmp-impl.h"
+#include "longlong.h"
+#include "speed.h"
+
+
+/* Like the standard version in gmp-impl.h, but with the expressions using a
+   "1-" form.  This has the same number of steps, but "1-" is on the
+   dependent chain, whereas the "2*" in the standard version isn't.
+   Depending on the CPU this should be the same or a touch slower.  */
+
+#if GMP_LIMB_BITS <= 32
+#define binvert_limb_mul1(inv,n)                                \
+  do {                                                          \
+    mp_limb_t  __n = (n);                                       \
+    mp_limb_t  __inv;                                           \
+    ASSERT ((__n & 1) == 1);                                    \
+    __inv = binvert_limb_table[(__n&0xFF)/2]; /*  8 */          \
+    __inv = (1 - __n * __inv) * __inv + __inv;  /* 16 */        \
+    __inv = (1 - __n * __inv) * __inv + __inv;  /* 32 */        \
+    ASSERT (__inv * __n == 1);                                  \
+    (inv) = __inv;                                              \
+  } while (0)
+#endif
+
+#if GMP_LIMB_BITS > 32 && GMP_LIMB_BITS <= 64
+#define binvert_limb_mul1(inv,n)                                \
+  do {                                                          \
+    mp_limb_t  __n = (n);                                       \
+    mp_limb_t  __inv;                                           \
+    ASSERT ((__n & 1) == 1);                                    \
+    __inv = binvert_limb_table[(__n&0xFF)/2]; /*  8 */          \
+    __inv = (1 - __n * __inv) * __inv + __inv;  /* 16 */        \
+    __inv = (1 - __n * __inv) * __inv + __inv;  /* 32 */        \
+    __inv = (1 - __n * __inv) * __inv + __inv;  /* 64 */        \
+    ASSERT (__inv * __n == 1);                                  \
+    (inv) = __inv;                                              \
+  } while (0)
+#endif
+
+
+/* The loop based version used in GMP 3.0 and earlier.  Usually slower than
+   multiplying, due to the number of steps that must be performed.  Much
+   slower when the processor has a good multiply.  */
+
+#define binvert_limb_loop(inv,n)                \
+  do {                                          \
+    mp_limb_t  __v = (n);                       \
+    mp_limb_t  __v_orig = __v;                  \
+    mp_limb_t  __make_zero = 1;                 \
+    mp_limb_t  __two_i = 1;                     \
+    mp_limb_t  __v_inv = 0;                     \
+                                                \
+    ASSERT ((__v & 1) == 1);                    \
+                                                \
+    do                                          \
+      {                                         \
+        while ((__two_i & __make_zero) == 0)    \
+          __two_i <<= 1, __v <<= 1;             \
+        __v_inv += __two_i;                     \
+        __make_zero -= __v;                     \
+      }                                         \
+    while (__make_zero);                        \
+                                                \
+    ASSERT (__v_orig * __v_inv == 1);           \
+    (inv) = __v_inv;                            \
+  } while (0)
+
+
+/* Another loop based version with conditionals, but doing a fixed number of
+   steps. */
+
+#define binvert_limb_cond(inv,n)                \
+  do {                                          \
+    mp_limb_t  __n = (n);                       \
+    mp_limb_t  __rem = (1 - __n) >> 1;          \
+    mp_limb_t  __inv = GMP_LIMB_HIGHBIT;        \
+    int        __count;                         \
+                                                \
+    ASSERT ((__n & 1) == 1);                    \
+                                                \
+    __count = GMP_LIMB_BITS-1;               \
+    do                                          \
+      {                                         \
+        __inv >>= 1;                            \
+        if (__rem & 1)                          \
+          {                                     \
+            __inv |= GMP_LIMB_HIGHBIT;          \
+            __rem -= __n;                       \
+          }                                     \
+        __rem >>= 1;                            \
+      }                                         \
+    while (-- __count);                         \
+                                                \
+    ASSERT (__inv * __n == 1);                  \
+    (inv) = __inv;                              \
+  } while (0)
+
+
+/* Another loop based bitwise version, but purely arithmetic, no
+   conditionals. */
+
+#define binvert_limb_arith(inv,n)                                       \
+  do {                                                                  \
+    mp_limb_t  __n = (n);                                               \
+    mp_limb_t  __rem = (1 - __n) >> 1;                                  \
+    mp_limb_t  __inv = GMP_LIMB_HIGHBIT;                                \
+    mp_limb_t  __lowbit;                                                \
+    int        __count;                                                 \
+                                                                        \
+    ASSERT ((__n & 1) == 1);                                            \
+                                                                        \
+    __count = GMP_LIMB_BITS-1;                                       \
+    do                                                                  \
+      {                                                                 \
+        __lowbit = __rem & 1;                                           \
+        __inv = (__inv >> 1) | (__lowbit << (GMP_LIMB_BITS-1));      \
+        __rem = (__rem - (__n & -__lowbit)) >> 1;                       \
+      }                                                                 \
+    while (-- __count);                                                 \
+                                                                        \
+    ASSERT (__inv * __n == 1);                                          \
+    (inv) = __inv;                                                      \
+  } while (0)
+
+
+double
+speed_binvert_limb_mul1 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_mul1);
+}
+double
+speed_binvert_limb_loop (struct speed_params *s)
+{
+  SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_loop);
+}
+double
+speed_binvert_limb_cond (struct speed_params *s)
+{
+  SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_cond);
+}
+double
+speed_binvert_limb_arith (struct speed_params *s)
+{
+  SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_arith);
+}

diff --git a/third_party/gmp/tune/noop.c b/third_party/gmp/tune/noop.c
new file mode 100644
index 0000000..c127b73
--- /dev/null
+++ b/third_party/gmp/tune/noop.c

@@ -0,0 +1,67 @@
+/* Noop routines.
+
+   These are in a separate file to stop gcc recognising do-nothing functions
+   and optimizing away calls to them.  */
+
+/*
+Copyright 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#include "speed.h"
+
+
+void
+noop (void)
+{
+}
+
+/*ARGSUSED*/
+void
+noop_1 (mp_limb_t n)
+{
+}
+
+/*ARGSUSED*/
+void
+noop_wxs (mp_ptr wp, mp_srcptr xp, mp_size_t size)
+{
+}
+
+/*ARGSUSED*/
+void
+noop_wxys (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size)
+{
+}
+
+/*ARGSUSED*/
+void
+mpn_cache_fill_dummy (mp_limb_t n)
+{
+}

diff --git a/third_party/gmp/tune/pentium.asm b/third_party/gmp/tune/pentium.asm
new file mode 100644
index 0000000..fb1e833
--- /dev/null
+++ b/third_party/gmp/tune/pentium.asm

@@ -0,0 +1,60 @@
+dnl  x86 pentium time stamp counter access routine.
+
+dnl  Copyright 1999, 2000, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C void speed_cyclecounter (unsigned p[2]);
+C
+C Get the pentium rdtsc cycle counter, storing the least significant word in
+C p[0] and the most significant in p[1].
+C
+C cpuid is used to serialize execution.  On big measurements this won't be
+C significant but it may help make small single measurements more accurate.
+
+	.text
+	ALIGN(8)
+
+defframe(PARAM_P,4)
+
+PROLOGUE(speed_cyclecounter)
+deflit(`FRAME',0)
+	pushl	%ebx
+FRAME_pushl()
+	xorl	%eax, %eax
+	cpuid
+	rdtsc
+	movl	PARAM_P, %ebx
+	movl	%eax, (%ebx)
+	movl	%edx, 4(%ebx)
+	popl	%ebx
+	ret
+EPILOGUE()

diff --git a/third_party/gmp/tune/powerpc.asm b/third_party/gmp/tune/powerpc.asm
new file mode 100644
index 0000000..2f4ac27
--- /dev/null
+++ b/third_party/gmp/tune/powerpc.asm

@@ -0,0 +1,53 @@
+dnl  PowerPC mftb_function -- read time base registers.
+
+dnl  Copyright 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C void mftb_function (unsigned a[2]);
+C
+
+ASM_START()
+PROLOGUE(mftb_function)
+
+	C r3	a
+
+L(again):
+	mftbu	r4
+	mftb	r5
+	mftbu	r6
+	cmpw	cr0, r4, r6
+	bne	L(again)
+
+	stw	r5, 0(r3)
+	stw	r4, 4(r3)
+	blr
+
+EPILOGUE()

diff --git a/third_party/gmp/tune/powerpc64.asm b/third_party/gmp/tune/powerpc64.asm
new file mode 100644
index 0000000..1ade996
--- /dev/null
+++ b/third_party/gmp/tune/powerpc64.asm

@@ -0,0 +1,49 @@
+dnl  PowerPC mftb_function -- read time base registers, 64-bit integer.
+
+dnl  Copyright 2002-2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C void mftb_function (unsigned a[2]);
+C
+
+ASM_START()
+PROLOGUE(mftb_function)
+
+	C r3	a
+
+	mftb	r5
+
+	srdi	r4, r5, 32
+	stw	r5, 0(r3)
+	stw	r4, 4(r3)
+	blr
+
+EPILOGUE()

diff --git a/third_party/gmp/tune/powm_mod.c b/third_party/gmp/tune/powm_mod.c
new file mode 100644
index 0000000..765fd7b
--- /dev/null
+++ b/third_party/gmp/tune/powm_mod.c

@@ -0,0 +1,38 @@
+/* mpz/powm.c forced to use division. */
+
+/*
+Copyright 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef POWM_THRESHOLD
+#define POWM_THRESHOLD  1
+#define __gmpz_powm  mpz_powm_mod
+
+#include "../mpz/powm.c"

diff --git a/third_party/gmp/tune/powm_redc.c b/third_party/gmp/tune/powm_redc.c
new file mode 100644
index 0000000..8584614
--- /dev/null
+++ b/third_party/gmp/tune/powm_redc.c

@@ -0,0 +1,40 @@
+/* mpz/powm.c forced to use REDC. */
+
+/*
+Copyright 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/* WANT_GLOBAL_REDC makes redc() available for speed and tune program use. */
+#undef POWM_THRESHOLD
+#define POWM_THRESHOLD    MP_SIZE_T_MAX
+#define WANT_REDC_GLOBAL  1
+#define __gmpz_powm  mpz_powm_redc
+
+#include "../mpz/powm.c"

diff --git a/third_party/gmp/tune/pre_divrem_1.c b/third_party/gmp/tune/pre_divrem_1.c
new file mode 100644
index 0000000..66d00da
--- /dev/null
+++ b/third_party/gmp/tune/pre_divrem_1.c

@@ -0,0 +1,40 @@
+/* mpn_preinv_divrem_1 -- if not already in libgmp.
+
+Copyright 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#if ! USE_PREINV_DIVREM_1
+
+#undef USE_PREINV_DIVREM_1
+#define USE_PREINV_DIVREM_1 1
+
+#include "mpn/generic/pre_divrem_1.c"
+
+#endif

diff --git a/third_party/gmp/tune/set_strb.c b/third_party/gmp/tune/set_strb.c
new file mode 100644
index 0000000..128c41b
--- /dev/null
+++ b/third_party/gmp/tune/set_strb.c

@@ -0,0 +1,46 @@
+/* mpn_set_str_basecase -- mpn_set_str forced to its basecase.
+
+Copyright 2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define __gmpn_set_str mpn_set_str_basecase
+#define __gmpn_bc_set_str mpn_bc_set_str_basecase
+#define __gmpn_dc_set_str mpn_dc_set_str_basecase
+
+#include "gmp-impl.h"
+
+#ifndef SIZE_T_MAX
+#define SIZE_T_MAX  ((size_t) ULONG_MAX)
+#endif
+
+#undef SET_STR_DC_THRESHOLD
+#define SET_STR_DC_THRESHOLD           SIZE_T_MAX /* always */
+#undef SET_STR_PRECOMPUTE_THRESHOLD
+#define SET_STR_PRECOMPUTE_THRESHOLD   SIZE_T_MAX /* always */
+
+#include "mpn/generic/set_str.c"

diff --git a/third_party/gmp/tune/set_strp.c b/third_party/gmp/tune/set_strp.c
new file mode 100644
index 0000000..3053b60
--- /dev/null
+++ b/third_party/gmp/tune/set_strp.c

@@ -0,0 +1,42 @@
+/* mpn_set_str_subquad -- mpn_set_str forced to the sub-quadratic case.
+
+Copyright 2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
+
+#include "gmp-impl.h"
+
+void
+mpn_pre_set_str (mp_ptr wp, unsigned char *str, size_t str_len, powers_t *powtab, mp_ptr tp)
+{
+  if (BELOW_THRESHOLD (str_len, set_str_dc_threshold))
+    mpn_bc_set_str (wp, str, str_len, powtab->base);
+  else
+    mpn_dc_set_str (wp, str, str_len, powtab, tp);
+}

diff --git a/third_party/gmp/tune/set_strs.c b/third_party/gmp/tune/set_strs.c
new file mode 100644
index 0000000..d2a9fc2
--- /dev/null
+++ b/third_party/gmp/tune/set_strs.c

@@ -0,0 +1,42 @@
+/* mpn_set_str_subquad -- mpn_set_str forced to the sub-quadratic case.
+
+Copyright 2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define __gmpn_set_str mpn_set_str_subquad
+#define __gmpn_bc_set_str mpn_bc_set_str_subquad
+#define __gmpn_dc_set_str mpn_dc_set_str_subquad
+
+#include "gmp-impl.h"
+
+#undef SET_STR_DC_THRESHOLD
+#define SET_STR_DC_THRESHOLD  2 /* never */
+#undef SET_STR_PRECOMPUTE_THRESHOLD
+#define SET_STR_PRECOMPUTE_THRESHOLD  2 /* never */
+
+#include "mpn/generic/set_str.c"

diff --git a/third_party/gmp/tune/sparcv9.asm b/third_party/gmp/tune/sparcv9.asm
new file mode 100644
index 0000000..f0981c7
--- /dev/null
+++ b/third_party/gmp/tune/sparcv9.asm

@@ -0,0 +1,45 @@
+dnl  Sparc v9 32-bit time stamp counter access routine.
+
+dnl  Copyright 2000, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C void speed_cyclecounter (unsigned p[2]);
+C
+C Get the sparc v9 tick counter.
+
+ASM_START()
+PROLOGUE(speed_cyclecounter)
+	rd	%tick,%g1
+	st	%g1,[%o0]		C low 32 bits
+	srlx	%g1,32,%g4
+	retl
+	st	%g4,[%o0+4]		C high 32 bits
+EPILOGUE(speed_cyclecounter)

diff --git a/third_party/gmp/tune/speed-ext.c b/third_party/gmp/tune/speed-ext.c
new file mode 100644
index 0000000..e7fb8b9
--- /dev/null
+++ b/third_party/gmp/tune/speed-ext.c

@@ -0,0 +1,233 @@
+/* An example of extending the speed program to measure routines not in GMP.
+
+Copyright 1999, 2000, 2002, 2003, 2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/* The extension here is three versions of an mpn arithmetic mean.  These
+   aren't meant to be particularly useful, just examples.
+
+   You can run something like the following to compare their speeds.
+
+           ./speed-ext -s 1-20 -c mean_calls mean_open mean_open2
+
+   On RISC chips, mean_open() might be fastest if the compiler is doing a
+   good job.  On the register starved x86s, mean_calls will be fastest.
+
+
+   Notes:
+
+   SPEED_EXTRA_PROTOS and SPEED_EXTRA_ROUTINES are macros that get expanded
+   by speed.c in useful places.  SPEED_EXTRA_PROTOS goes after the header
+   files, and SPEED_EXTRA_ROUTINES goes in the array of available routines.
+
+   The advantage of this #include "speed.c" scheme is that there's no
+   editing of a copy of that file, and new features in new versions of it
+   will be immediately available.
+
+   In a real program the routines mean_calls() etc would probably be in
+   separate C or assembler source files, and just the measuring
+   speed_mean_calls() etc would be here.  Linking against other libraries
+   for things to measure is perfectly possible too.
+
+   When attempting to compare two versions of the same named routine, say
+   like the generic and assembler versions of mpn_add_n(), creative use of
+   cc -D or #define is suggested, so one or both can be renamed and linked
+   into the same program.  It'll be much easier to compare them side by side
+   than with separate programs for each.
+
+   common.c has notes on writing speed measuring routines.
+
+   Remember to link against tune/libspeed.la (or tune/.libs/libspeed.a if
+   not using libtool) to get common.o and other objects needed by speed.c.  */
+
+
+#define SPEED_EXTRA_PROTOS                                              \
+  double speed_mean_calls (struct speed_params *s);			\
+  double speed_mean_open  (struct speed_params *s);			\
+  double speed_mean_open2 (struct speed_params *s);
+
+#define SPEED_EXTRA_ROUTINES            \
+  { "mean_calls",  speed_mean_calls  }, \
+  { "mean_open",   speed_mean_open   }, \
+  { "mean_open2",  speed_mean_open2  },
+
+#include "speed.c"
+
+
+/* A straightforward implementation calling mpn subroutines.
+
+   wp,size is set to (xp,size + yp,size) / 2.  The return value is the
+   remainder from the division.  The other versions are the same.  */
+
+mp_limb_t
+mean_calls (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size)
+{
+  mp_limb_t  c, ret;
+
+  ASSERT (size >= 1);
+
+  c = mpn_add_n (wp, xp, yp, size);
+  ret = mpn_rshift (wp, wp, size, 1) >> (GMP_LIMB_BITS-1);
+  wp[size-1] |= (c << (GMP_LIMB_BITS-1));
+  return ret;
+}
+
+
+/* An open-coded version, making one pass over the data.  The right shift is
+   done as the added limbs are produced.  The addition code follows
+   mpn/generic/add_n.c. */
+
+mp_limb_t
+mean_open (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size)
+{
+  mp_limb_t  w, wprev, x, y, c, ret;
+  mp_size_t  i;
+
+  ASSERT (size >= 1);
+
+  x = xp[0];
+  y = yp[0];
+
+  wprev = x + y;
+  c = (wprev < x);
+  ret = (wprev & 1);
+
+#define RSHIFT(hi,lo)   (((lo) >> 1) | ((hi) << (GMP_LIMB_BITS-1)))
+
+  for (i = 1; i < size; i++)
+    {
+      x = xp[i];
+      y = yp[i];
+
+      w = x + c;
+      c = (w < x);
+      w += y;
+      c += (w < y);
+
+      wp[i-1] = RSHIFT (w, wprev);
+      wprev = w;
+    }
+
+  wp[i-1] = RSHIFT (c, wprev);
+
+  return ret;
+}
+
+
+/* Another one-pass version, but right shifting the source limbs rather than
+   the result limbs.  There's not much chance of this being better than the
+   above, but it's an alternative at least. */
+
+mp_limb_t
+mean_open2 (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size)
+{
+  mp_limb_t  w, x, y, xnext, ynext, c, ret;
+  mp_size_t  i;
+
+  ASSERT (size >= 1);
+
+  x = xp[0];
+  y = yp[0];
+
+  /* ret is the low bit of x+y, c is the carry out of that low bit add */
+  ret = (x ^ y) & 1;
+  c   = (x & y) & 1;
+
+  for (i = 0; i < size-1; i++)
+    {
+      xnext = xp[i+1];
+      ynext = yp[i+1];
+      x = RSHIFT (xnext, x);
+      y = RSHIFT (ynext, y);
+
+      w = x + c;
+      c = (w < x);
+      w += y;
+      c += (w < y);
+      wp[i] = w;
+
+      x = xnext;
+      y = ynext;
+    }
+
+  wp[i] = (x >> 1) + (y >> 1) + c;
+
+  return ret;
+}
+
+
+/* The speed measuring routines are the same apart from which function they
+   run, so a macro is used.  Actually this macro is the same as
+   SPEED_ROUTINE_MPN_BINARY_N.  */
+
+#define SPEED_ROUTINE_MEAN(mean_fun)                    \
+  {                                                     \
+    unsigned  i;                                        \
+    mp_ptr    wp;                                       \
+    double    t;                                        \
+    TMP_DECL;                                  \
+                                                        \
+    SPEED_RESTRICT_COND (s->size >= 1);                 \
+                                                        \
+    TMP_MARK;                                  \
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);   \
+                                                        \
+    speed_operand_src (s, s->xp, s->size);              \
+    speed_operand_src (s, s->yp, s->size);              \
+    speed_operand_dst (s, wp, s->size);                 \
+    speed_cache_fill (s);                               \
+                                                        \
+    speed_starttime ();                                 \
+    i = s->reps;                                        \
+    do                                                  \
+      mean_fun (wp, s->xp, s->yp, s->size);             \
+    while (--i != 0);                                   \
+    t = speed_endtime ();                               \
+                                                        \
+    TMP_FREE;                                  \
+    return t;                                           \
+  }
+
+double
+speed_mean_calls (struct speed_params *s)
+{
+  SPEED_ROUTINE_MEAN (mean_calls);
+}
+
+double
+speed_mean_open (struct speed_params *s)
+{
+  SPEED_ROUTINE_MEAN (mean_open);
+}
+
+double
+speed_mean_open2 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MEAN (mean_open2);
+}

diff --git a/third_party/gmp/tune/speed.c b/third_party/gmp/tune/speed.c
new file mode 100644
index 0000000..1708e4c
--- /dev/null
+++ b/third_party/gmp/tune/speed.c

@@ -0,0 +1,1405 @@
+/* Speed measuring program.
+
+Copyright 1999-2003, 2005, 2006, 2008-2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* Usage message is in the code below, run with no arguments to print it.
+   See README for interesting applications.
+
+   To add a new routine foo(), create a speed_foo() function in the style of
+   the existing ones and add an entry in the routine[] array.  Put FLAG_R if
+   speed_foo() wants an "r" parameter.
+
+   The routines don't have help messages or descriptions, but most have
+   suggestive names.  See the source code for full details.
+
+*/
+
+#include "config.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if HAVE_UNISTD_H
+#include <unistd.h>  /* for getpid, R_OK */
+#endif
+
+#if TIME_WITH_SYS_TIME
+# include <sys/time.h>  /* for struct timeval */
+# include <time.h>
+#else
+# if HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+
+#if HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>  /* for getrusage() */
+#endif
+
+
+#include "gmp-impl.h"
+#include "longlong.h"  /* for the benefit of speed-many.c */
+#include "tests.h"
+#include "speed.h"
+
+
+#if !HAVE_DECL_OPTARG
+extern char *optarg;
+extern int optind, opterr;
+#endif
+
+#if !HAVE_STRTOUL
+#define strtoul(p,e,b)  (unsigned long) strtol(p,e,b)
+#endif
+
+#ifdef SPEED_EXTRA_PROTOS
+SPEED_EXTRA_PROTOS
+#endif
+#ifdef SPEED_EXTRA_PROTOS2
+SPEED_EXTRA_PROTOS2
+#endif
+
+
+#if GMP_LIMB_BITS == 32
+#define GMP_NUMB_0xAA  (CNST_LIMB(0xAAAAAAAA) & GMP_NUMB_MASK)
+#endif
+#if GMP_LIMB_BITS == 64
+#define GMP_NUMB_0xAA  (CNST_LIMB(0xAAAAAAAAAAAAAAAA) & GMP_NUMB_MASK)
+#endif
+
+
+#define CMP_ABSOLUTE     1
+#define CMP_RATIO        2
+#define CMP_DIFFERENCE   3
+#define CMP_DIFFPREV     4
+int  option_cmp = CMP_ABSOLUTE;
+
+#define UNIT_SECONDS        1
+#define UNIT_CYCLES         2
+#define UNIT_CYCLESPERLIMB  3
+int  option_unit = UNIT_SECONDS;
+
+#define DATA_RANDOM   1
+#define DATA_RANDOM2  2
+#define DATA_ZEROS    3
+#define DATA_AAS      4
+#define DATA_FFS      5
+#define DATA_2FD      6
+int  option_data = DATA_RANDOM;
+
+int        option_square = 0;
+double     option_factor = 0.0;
+mp_size_t  option_step = 1;
+int        option_gnuplot = 0;
+char      *option_gnuplot_basename;
+struct size_array_t {
+  mp_size_t start, end;
+} *size_array = NULL;
+mp_size_t  size_num = 0;
+mp_size_t  size_allocnum = 0;
+int        option_resource_usage = 0;
+long       option_seed = 123456789;
+
+struct speed_params  sp;
+
+#define COLUMN_WIDTH  13  /* for the free-form output */
+
+#define FLAG_R            (1<<0)  /* require ".r" */
+#define FLAG_R_OPTIONAL   (1<<1)  /* optional ".r" */
+#define FLAG_RSIZE        (1<<2)
+#define FLAG_NODATA       (1<<3)  /* don't alloc xp, yp */
+
+const struct routine_t {
+  /* constants */
+  const char        *name;
+  speed_function_t  fun;
+  int               flag;
+} routine[] = {
+
+  { "noop",              speed_noop                 },
+  { "noop_wxs",          speed_noop_wxs             },
+  { "noop_wxys",         speed_noop_wxys            },
+
+  { "mpn_add_n",         speed_mpn_add_n,     FLAG_R_OPTIONAL },
+  { "mpn_sub_n",         speed_mpn_sub_n,     FLAG_R_OPTIONAL },
+  { "mpn_add_1",         speed_mpn_add_1,     FLAG_R },
+  { "mpn_add_1_inplace", speed_mpn_add_1_inplace, FLAG_R },
+  { "mpn_sub_1",         speed_mpn_sub_1,     FLAG_R },
+  { "mpn_sub_1_inplace", speed_mpn_sub_1_inplace, FLAG_R },
+
+  { "mpn_add_err1_n",    speed_mpn_add_err1_n    },
+  { "mpn_add_err2_n",    speed_mpn_add_err2_n    },
+  { "mpn_add_err3_n",    speed_mpn_add_err3_n    },
+  { "mpn_sub_err1_n",    speed_mpn_sub_err1_n    },
+  { "mpn_sub_err2_n",    speed_mpn_sub_err2_n    },
+  { "mpn_sub_err3_n",    speed_mpn_sub_err3_n    },
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  { "mpn_add_n_sub_n",      speed_mpn_add_n_sub_n,     FLAG_R_OPTIONAL },
+#endif
+
+  { "mpn_addmul_1",      speed_mpn_addmul_1,  FLAG_R },
+  { "mpn_submul_1",      speed_mpn_submul_1,  FLAG_R },
+#if HAVE_NATIVE_mpn_addmul_2
+  { "mpn_addmul_2",      speed_mpn_addmul_2,  FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_addmul_3
+  { "mpn_addmul_3",      speed_mpn_addmul_3,  FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_addmul_4
+  { "mpn_addmul_4",      speed_mpn_addmul_4,  FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_addmul_5
+  { "mpn_addmul_5",      speed_mpn_addmul_5,  FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_addmul_6
+  { "mpn_addmul_6",      speed_mpn_addmul_6,  FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_addmul_7
+  { "mpn_addmul_7",      speed_mpn_addmul_7,  FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_addmul_8
+  { "mpn_addmul_8",      speed_mpn_addmul_8,  FLAG_R_OPTIONAL },
+#endif
+  { "mpn_mul_1",         speed_mpn_mul_1,     FLAG_R },
+  { "mpn_mul_1_inplace", speed_mpn_mul_1_inplace, FLAG_R },
+#if HAVE_NATIVE_mpn_mul_2
+  { "mpn_mul_2",         speed_mpn_mul_2,     FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_mul_3
+  { "mpn_mul_3",         speed_mpn_mul_3,     FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_mul_4
+  { "mpn_mul_4",         speed_mpn_mul_4,     FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_mul_5
+  { "mpn_mul_5",         speed_mpn_mul_5,     FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_mul_6
+  { "mpn_mul_6",         speed_mpn_mul_6,     FLAG_R_OPTIONAL },
+#endif
+
+  { "mpn_divrem_1",      speed_mpn_divrem_1,  FLAG_R },
+  { "mpn_divrem_1f",     speed_mpn_divrem_1f, FLAG_R },
+#if HAVE_NATIVE_mpn_divrem_1c
+  { "mpn_divrem_1c",     speed_mpn_divrem_1c, FLAG_R },
+  { "mpn_divrem_1cf",    speed_mpn_divrem_1cf,FLAG_R },
+#endif
+  { "mpn_mod_1",         speed_mpn_mod_1,     FLAG_R },
+#if HAVE_NATIVE_mpn_mod_1c
+  { "mpn_mod_1c",        speed_mpn_mod_1c,    FLAG_R },
+#endif
+  { "mpn_preinv_divrem_1",  speed_mpn_preinv_divrem_1,  FLAG_R },
+  { "mpn_preinv_divrem_1f", speed_mpn_preinv_divrem_1f, FLAG_R },
+  { "mpn_preinv_mod_1",  speed_mpn_preinv_mod_1, FLAG_R },
+
+  { "mpn_mod_1_1",       speed_mpn_mod_1_1,       FLAG_R },
+  { "mpn_mod_1_1_1",     speed_mpn_mod_1_1_1,     FLAG_R },
+  { "mpn_mod_1_1_2",     speed_mpn_mod_1_1_2,     FLAG_R },
+  { "mpn_mod_1s_2",      speed_mpn_mod_1_2,       FLAG_R },
+  { "mpn_mod_1s_3",      speed_mpn_mod_1_3,       FLAG_R },
+  { "mpn_mod_1s_4",      speed_mpn_mod_1_4,       FLAG_R },
+
+  { "mpn_divrem_1_div",  speed_mpn_divrem_1_div,  FLAG_R },
+  { "mpn_divrem_1_inv",  speed_mpn_divrem_1_inv,  FLAG_R },
+  { "mpn_divrem_1f_div", speed_mpn_divrem_1f_div, FLAG_R },
+  { "mpn_divrem_1f_inv", speed_mpn_divrem_1f_inv, FLAG_R },
+  { "mpn_mod_1_div",     speed_mpn_mod_1_div,     FLAG_R },
+  { "mpn_mod_1_inv",     speed_mpn_mod_1_inv,     FLAG_R },
+
+  { "mpn_divrem_2",      speed_mpn_divrem_2,        },
+  { "mpn_divrem_2_div",  speed_mpn_divrem_2_div,    },
+  { "mpn_divrem_2_inv",  speed_mpn_divrem_2_inv,    },
+
+  { "mpn_div_qr_1n_pi1", speed_mpn_div_qr_1n_pi1, FLAG_R  },
+  { "mpn_div_qr_1n_pi1_1",speed_mpn_div_qr_1n_pi1_1, FLAG_R  },
+  { "mpn_div_qr_1n_pi1_2",speed_mpn_div_qr_1n_pi1_2, FLAG_R  },
+  { "mpn_div_qr_1",      speed_mpn_div_qr_1,      FLAG_R },
+
+  { "mpn_div_qr_2n",     speed_mpn_div_qr_2n,       },
+  { "mpn_div_qr_2u",     speed_mpn_div_qr_2u,       },
+
+  { "mpn_divexact_1",    speed_mpn_divexact_1,    FLAG_R },
+  { "mpn_divexact_by3",  speed_mpn_divexact_by3          },
+
+  { "mpn_bdiv_q_1",      speed_mpn_bdiv_q_1,      FLAG_R },
+  { "mpn_pi1_bdiv_q_1",  speed_mpn_pi1_bdiv_q_1,  FLAG_R_OPTIONAL },
+  { "mpn_bdiv_dbm1c",    speed_mpn_bdiv_dbm1c,    FLAG_R_OPTIONAL },
+
+#if HAVE_NATIVE_mpn_modexact_1_odd
+  { "mpn_modexact_1_odd",  speed_mpn_modexact_1_odd,  FLAG_R },
+#endif
+  { "mpn_modexact_1c_odd", speed_mpn_modexact_1c_odd, FLAG_R },
+
+#if GMP_NUMB_BITS % 4 == 0
+  { "mpn_mod_34lsub1",   speed_mpn_mod_34lsub1 },
+#endif
+
+  { "mpn_lshift",        speed_mpn_lshift, FLAG_R   },
+  { "mpn_lshiftc",       speed_mpn_lshiftc, FLAG_R   },
+  { "mpn_rshift",        speed_mpn_rshift, FLAG_R   },
+
+  { "mpn_and_n",         speed_mpn_and_n,  FLAG_R_OPTIONAL },
+  { "mpn_andn_n",        speed_mpn_andn_n, FLAG_R_OPTIONAL },
+  { "mpn_nand_n",        speed_mpn_nand_n, FLAG_R_OPTIONAL },
+  { "mpn_ior_n",         speed_mpn_ior_n,  FLAG_R_OPTIONAL },
+  { "mpn_iorn_n",        speed_mpn_iorn_n, FLAG_R_OPTIONAL },
+  { "mpn_nior_n",        speed_mpn_nior_n, FLAG_R_OPTIONAL },
+  { "mpn_xor_n",         speed_mpn_xor_n,  FLAG_R_OPTIONAL },
+  { "mpn_xnor_n",        speed_mpn_xnor_n, FLAG_R_OPTIONAL },
+  { "mpn_com",           speed_mpn_com              },
+  { "mpn_neg",           speed_mpn_neg              },
+
+  { "mpn_popcount",      speed_mpn_popcount         },
+  { "mpn_hamdist",       speed_mpn_hamdist          },
+
+  { "mpn_matrix22_mul",  speed_mpn_matrix22_mul     },
+
+  { "mpn_hgcd2",         speed_mpn_hgcd2, FLAG_NODATA },
+  { "mpn_hgcd2_1",       speed_mpn_hgcd2_1, FLAG_NODATA },
+  { "mpn_hgcd2_2",       speed_mpn_hgcd2_2, FLAG_NODATA },
+  { "mpn_hgcd2_3",       speed_mpn_hgcd2_3, FLAG_NODATA },
+  { "mpn_hgcd2_4",       speed_mpn_hgcd2_4, FLAG_NODATA },
+  { "mpn_hgcd2_5",       speed_mpn_hgcd2_5, FLAG_NODATA },
+  { "mpn_hgcd",          speed_mpn_hgcd             },
+  { "mpn_hgcd_lehmer",   speed_mpn_hgcd_lehmer      },
+  { "mpn_hgcd_appr",     speed_mpn_hgcd_appr        },
+  { "mpn_hgcd_appr_lehmer", speed_mpn_hgcd_appr_lehmer },
+
+  { "mpn_hgcd_reduce",   speed_mpn_hgcd_reduce      },
+  { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1    },
+  { "mpn_hgcd_reduce_2", speed_mpn_hgcd_reduce_2    },
+
+  { "mpn_gcd_1",         speed_mpn_gcd_1,  FLAG_R_OPTIONAL },
+  { "mpn_gcd_11",        speed_mpn_gcd_11, FLAG_R_OPTIONAL },
+  { "mpn_gcd_1N",        speed_mpn_gcd_1N, FLAG_R_OPTIONAL },
+  { "mpn_gcd_22",        speed_mpn_gcd_22, FLAG_R_OPTIONAL },
+
+  { "mpn_gcd",           speed_mpn_gcd                    },
+
+  { "mpn_gcdext",            speed_mpn_gcdext            },
+  { "mpn_gcdext_single",     speed_mpn_gcdext_single     },
+  { "mpn_gcdext_double",     speed_mpn_gcdext_double     },
+  { "mpn_gcdext_one_single", speed_mpn_gcdext_one_single },
+  { "mpn_gcdext_one_double", speed_mpn_gcdext_one_double },
+#if 0
+  { "mpn_gcdext_lehmer",     speed_mpn_gcdext_lehmer     },
+#endif
+
+  { "mpz_nextprime",     speed_mpz_nextprime        },
+
+  { "mpz_jacobi",        speed_mpz_jacobi           },
+  { "mpn_jacobi_base",   speed_mpn_jacobi_base      },
+  { "mpn_jacobi_base_1", speed_mpn_jacobi_base_1    },
+  { "mpn_jacobi_base_2", speed_mpn_jacobi_base_2    },
+  { "mpn_jacobi_base_3", speed_mpn_jacobi_base_3    },
+  { "mpn_jacobi_base_4", speed_mpn_jacobi_base_4    },
+
+  { "mpn_mul",           speed_mpn_mul,         FLAG_R_OPTIONAL },
+  { "mpn_mul_basecase",  speed_mpn_mul_basecase,FLAG_R_OPTIONAL },
+  { "mpn_sqr_basecase",  speed_mpn_sqr_basecase     },
+#if HAVE_NATIVE_mpn_sqr_diagonal
+  { "mpn_sqr_diagonal",  speed_mpn_sqr_diagonal     },
+#endif
+#if HAVE_NATIVE_mpn_sqr_diag_addlsh1
+  { "mpn_sqr_diag_addlsh1", speed_mpn_sqr_diag_addlsh1 },
+#endif
+
+  { "mpn_mul_n",         speed_mpn_mul_n            },
+  { "mpn_sqr",           speed_mpn_sqr              },
+
+  { "mpn_toom2_sqr",     speed_mpn_toom2_sqr        },
+  { "mpn_toom3_sqr",     speed_mpn_toom3_sqr        },
+  { "mpn_toom4_sqr",     speed_mpn_toom4_sqr        },
+  { "mpn_toom6_sqr",     speed_mpn_toom6_sqr        },
+  { "mpn_toom8_sqr",     speed_mpn_toom8_sqr        },
+  { "mpn_toom22_mul",    speed_mpn_toom22_mul       },
+  { "mpn_toom33_mul",    speed_mpn_toom33_mul       },
+  { "mpn_toom44_mul",    speed_mpn_toom44_mul       },
+  { "mpn_toom6h_mul",    speed_mpn_toom6h_mul       },
+  { "mpn_toom8h_mul",    speed_mpn_toom8h_mul       },
+  { "mpn_toom32_mul",    speed_mpn_toom32_mul       },
+  { "mpn_toom42_mul",    speed_mpn_toom42_mul       },
+  { "mpn_toom43_mul",    speed_mpn_toom43_mul       },
+  { "mpn_toom63_mul",    speed_mpn_toom63_mul       },
+  { "mpn_nussbaumer_mul",    speed_mpn_nussbaumer_mul    },
+  { "mpn_nussbaumer_mul_sqr",speed_mpn_nussbaumer_mul_sqr},
+#if WANT_OLD_FFT_FULL
+  { "mpn_mul_fft_full",      speed_mpn_mul_fft_full      },
+  { "mpn_mul_fft_full_sqr",  speed_mpn_mul_fft_full_sqr  },
+#endif
+  { "mpn_mul_fft",       speed_mpn_mul_fft,     FLAG_R_OPTIONAL },
+  { "mpn_mul_fft_sqr",   speed_mpn_mul_fft_sqr, FLAG_R_OPTIONAL },
+
+  { "mpn_sqrlo",          speed_mpn_sqrlo           },
+  { "mpn_sqrlo_basecase", speed_mpn_sqrlo_basecase  },
+  { "mpn_mullo_n",        speed_mpn_mullo_n         },
+  { "mpn_mullo_basecase", speed_mpn_mullo_basecase  },
+
+  { "mpn_mulmid_basecase",  speed_mpn_mulmid_basecase, FLAG_R_OPTIONAL },
+  { "mpn_toom42_mulmid",    speed_mpn_toom42_mulmid },
+  { "mpn_mulmid_n",         speed_mpn_mulmid_n },
+  { "mpn_mulmid",           speed_mpn_mulmid, FLAG_R_OPTIONAL },
+
+  { "mpn_bc_mulmod_bnm1",      speed_mpn_bc_mulmod_bnm1      },
+  { "mpn_mulmod_bnm1",         speed_mpn_mulmod_bnm1         },
+  { "mpn_mulmod_bnm1_rounded", speed_mpn_mulmod_bnm1_rounded },
+  { "mpn_sqrmod_bnm1",         speed_mpn_sqrmod_bnm1         },
+
+  { "mpn_invert",              speed_mpn_invert              },
+  { "mpn_invertappr",          speed_mpn_invertappr          },
+  { "mpn_ni_invertappr",       speed_mpn_ni_invertappr       },
+  { "mpn_binvert",             speed_mpn_binvert             },
+  { "mpn_sec_invert",          speed_mpn_sec_invert          },
+
+  { "mpn_sbpi1_div_qr",        speed_mpn_sbpi1_div_qr,    FLAG_R_OPTIONAL},
+  { "mpn_dcpi1_div_qr",        speed_mpn_dcpi1_div_qr,    FLAG_R_OPTIONAL},
+  { "mpn_mu_div_qr",           speed_mpn_mu_div_qr,       FLAG_R_OPTIONAL},
+  { "mpn_mupi_div_qr",         speed_mpn_mupi_div_qr,     FLAG_R_OPTIONAL},
+  { "mpn_sbpi1_divappr_q",     speed_mpn_sbpi1_divappr_q, FLAG_R_OPTIONAL},
+  { "mpn_dcpi1_divappr_q",     speed_mpn_dcpi1_divappr_q, FLAG_R_OPTIONAL},
+
+  { "mpn_sbpi1_bdiv_qr",       speed_mpn_sbpi1_bdiv_qr       },
+  { "mpn_dcpi1_bdiv_qr",       speed_mpn_dcpi1_bdiv_qr       },
+  { "mpn_sbpi1_bdiv_q",        speed_mpn_sbpi1_bdiv_q        },
+  { "mpn_dcpi1_bdiv_q",        speed_mpn_dcpi1_bdiv_q        },
+  { "mpn_sbpi1_bdiv_r",        speed_mpn_sbpi1_bdiv_r        },
+
+  { "mpn_broot",               speed_mpn_broot,    FLAG_R },
+  { "mpn_broot_invm1",         speed_mpn_broot_invm1, FLAG_R },
+  { "mpn_brootinv",            speed_mpn_brootinv, FLAG_R },
+
+  { "mpn_get_str",          speed_mpn_get_str,     FLAG_R_OPTIONAL },
+  { "mpn_set_str",          speed_mpn_set_str,     FLAG_R_OPTIONAL },
+  { "mpn_set_str_basecase", speed_mpn_bc_set_str,  FLAG_R_OPTIONAL },
+
+  { "mpn_sqrtrem",       speed_mpn_sqrtrem          },
+  { "mpn_rootrem",       speed_mpn_rootrem, FLAG_R  },
+  { "mpn_sqrt",          speed_mpn_sqrt             },
+  { "mpn_root",          speed_mpn_root, FLAG_R     },
+
+  { "mpn_perfect_power_p",  speed_mpn_perfect_power_p,       },
+  { "mpn_perfect_square_p", speed_mpn_perfect_square_p,      },
+
+  { "mpn_fib2_ui",       speed_mpn_fib2_ui,    FLAG_NODATA },
+  { "mpz_fib_ui",        speed_mpz_fib_ui,     FLAG_NODATA },
+  { "mpz_fib2_ui",       speed_mpz_fib2_ui,    FLAG_NODATA },
+  { "mpz_lucnum_ui",     speed_mpz_lucnum_ui,  FLAG_NODATA },
+  { "mpz_lucnum2_ui",    speed_mpz_lucnum2_ui, FLAG_NODATA },
+
+  { "mpz_add",           speed_mpz_add              },
+  { "mpz_invert",        speed_mpz_invert,   FLAG_R_OPTIONAL },
+  { "mpz_bin_uiui",      speed_mpz_bin_uiui, FLAG_NODATA | FLAG_R_OPTIONAL },
+  { "mpz_bin_ui",        speed_mpz_bin_ui,   FLAG_NODATA | FLAG_R_OPTIONAL },
+  { "mpz_fac_ui",        speed_mpz_fac_ui,   FLAG_NODATA   },
+  { "mpz_2fac_ui",       speed_mpz_2fac_ui,  FLAG_NODATA   },
+  { "mpz_mfac_uiui",     speed_mpz_mfac_uiui,  FLAG_NODATA | FLAG_R_OPTIONAL },
+  { "mpz_primorial_ui",  speed_mpz_primorial_ui, FLAG_NODATA },
+  { "mpz_powm",          speed_mpz_powm,     FLAG_R_OPTIONAL },
+  { "mpz_powm_mod",      speed_mpz_powm_mod         },
+  { "mpz_powm_redc",     speed_mpz_powm_redc        },
+  { "mpz_powm_sec",      speed_mpz_powm_sec        },
+  { "mpz_powm_ui",       speed_mpz_powm_ui,  FLAG_R_OPTIONAL },
+
+  { "mpz_mod",           speed_mpz_mod              },
+  { "mpn_redc_1",        speed_mpn_redc_1           },
+  { "mpn_redc_2",        speed_mpn_redc_2           },
+  { "mpn_redc_n",        speed_mpn_redc_n           },
+
+  { "MPN_COPY",          speed_MPN_COPY             },
+  { "MPN_COPY_INCR",     speed_MPN_COPY_INCR        },
+  { "MPN_COPY_DECR",     speed_MPN_COPY_DECR        },
+  { "memcpy",            speed_memcpy               },
+#if HAVE_NATIVE_mpn_copyi
+  { "mpn_copyi",         speed_mpn_copyi            },
+#endif
+#if HAVE_NATIVE_mpn_copyd
+  { "mpn_copyd",         speed_mpn_copyd            },
+#endif
+  { "mpn_sec_tabselect", speed_mpn_sec_tabselect, FLAG_R_OPTIONAL },
+#if HAVE_NATIVE_mpn_addlsh1_n == 1
+  { "mpn_addlsh1_n",     speed_mpn_addlsh1_n, FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_sublsh1_n == 1
+  { "mpn_sublsh1_n",     speed_mpn_sublsh1_n, FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1
+  { "mpn_addlsh1_n_ip1", speed_mpn_addlsh1_n_ip1    },
+#endif
+#if HAVE_NATIVE_mpn_addlsh1_n_ip2
+  { "mpn_addlsh1_n_ip2", speed_mpn_addlsh1_n_ip2    },
+#endif
+#if HAVE_NATIVE_mpn_sublsh1_n_ip1
+  { "mpn_sublsh1_n_ip1", speed_mpn_sublsh1_n_ip1    },
+#endif
+#if HAVE_NATIVE_mpn_rsblsh1_n == 1
+  { "mpn_rsblsh1_n",     speed_mpn_rsblsh1_n, FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_addlsh2_n == 1
+  { "mpn_addlsh2_n",     speed_mpn_addlsh2_n, FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_sublsh2_n == 1
+  { "mpn_sublsh2_n",     speed_mpn_sublsh2_n, FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_addlsh2_n_ip1
+  { "mpn_addlsh2_n_ip1", speed_mpn_addlsh2_n_ip1    },
+#endif
+#if HAVE_NATIVE_mpn_addlsh2_n_ip2
+  { "mpn_addlsh2_n_ip2", speed_mpn_addlsh2_n_ip2    },
+#endif
+#if HAVE_NATIVE_mpn_sublsh2_n_ip1
+  { "mpn_sublsh2_n_ip1", speed_mpn_sublsh2_n_ip1    },
+#endif
+#if HAVE_NATIVE_mpn_rsblsh2_n == 1
+  { "mpn_rsblsh2_n",     speed_mpn_rsblsh2_n, FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_addlsh_n
+  { "mpn_addlsh_n",     speed_mpn_addlsh_n, FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_sublsh_n
+  { "mpn_sublsh_n",     speed_mpn_sublsh_n, FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_addlsh_n_ip1
+  { "mpn_addlsh_n_ip1", speed_mpn_addlsh_n_ip1    },
+#endif
+#if HAVE_NATIVE_mpn_addlsh_n_ip2
+  { "mpn_addlsh_n_ip2", speed_mpn_addlsh_n_ip2    },
+#endif
+#if HAVE_NATIVE_mpn_sublsh_n_ip1
+  { "mpn_sublsh_n_ip1", speed_mpn_sublsh_n_ip1    },
+#endif
+#if HAVE_NATIVE_mpn_rsblsh_n
+  { "mpn_rsblsh_n",     speed_mpn_rsblsh_n, FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_rsh1add_n
+  { "mpn_rsh1add_n",     speed_mpn_rsh1add_n, FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_rsh1sub_n
+  { "mpn_rsh1sub_n",     speed_mpn_rsh1sub_n, FLAG_R_OPTIONAL },
+#endif
+
+  { "mpn_cnd_add_n",     speed_mpn_cnd_add_n, FLAG_R_OPTIONAL },
+  { "mpn_cnd_sub_n",     speed_mpn_cnd_sub_n, FLAG_R_OPTIONAL },
+
+  { "MPN_ZERO",          speed_MPN_ZERO             },
+
+  { "binvert_limb",       speed_binvert_limb,       FLAG_NODATA },
+  { "binvert_limb_mul1",  speed_binvert_limb_mul1,  FLAG_NODATA },
+  { "binvert_limb_loop",  speed_binvert_limb_loop,  FLAG_NODATA },
+  { "binvert_limb_cond",  speed_binvert_limb_cond,  FLAG_NODATA },
+  { "binvert_limb_arith", speed_binvert_limb_arith, FLAG_NODATA },
+
+  { "malloc_free",                  speed_malloc_free                  },
+  { "malloc_realloc_free",          speed_malloc_realloc_free          },
+  { "gmp_allocate_free",            speed_gmp_allocate_free            },
+  { "gmp_allocate_reallocate_free", speed_gmp_allocate_reallocate_free },
+  { "mpz_init_clear",               speed_mpz_init_clear               },
+  { "mpq_init_clear",               speed_mpq_init_clear               },
+  { "mpf_init_clear",               speed_mpf_init_clear               },
+  { "mpz_init_realloc_clear",       speed_mpz_init_realloc_clear       },
+
+  { "umul_ppmm",         speed_umul_ppmm,     FLAG_R_OPTIONAL },
+#if HAVE_NATIVE_mpn_umul_ppmm
+  { "mpn_umul_ppmm",     speed_mpn_umul_ppmm, FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_umul_ppmm_r
+  { "mpn_umul_ppmm_r",   speed_mpn_umul_ppmm_r, FLAG_R_OPTIONAL },
+#endif
+
+  { "count_leading_zeros",  speed_count_leading_zeros,  FLAG_NODATA | FLAG_R_OPTIONAL },
+  { "count_trailing_zeros", speed_count_trailing_zeros, FLAG_NODATA | FLAG_R_OPTIONAL },
+
+  { "udiv_qrnnd",             speed_udiv_qrnnd,             FLAG_R_OPTIONAL },
+  { "udiv_qrnnd_c",           speed_udiv_qrnnd_c,           FLAG_R_OPTIONAL },
+#if HAVE_NATIVE_mpn_udiv_qrnnd
+  { "mpn_udiv_qrnnd",         speed_mpn_udiv_qrnnd,         FLAG_R_OPTIONAL },
+#endif
+#if HAVE_NATIVE_mpn_udiv_qrnnd_r
+  { "mpn_udiv_qrnnd_r",       speed_mpn_udiv_qrnnd_r,       FLAG_R_OPTIONAL },
+#endif
+  { "invert_limb",            speed_invert_limb,            FLAG_R_OPTIONAL },
+
+  { "operator_div",           speed_operator_div,           FLAG_R_OPTIONAL },
+  { "operator_mod",           speed_operator_mod,           FLAG_R_OPTIONAL },
+
+  { "gmp_randseed",    speed_gmp_randseed,    FLAG_R_OPTIONAL               },
+  { "gmp_randseed_ui", speed_gmp_randseed_ui, FLAG_R_OPTIONAL | FLAG_NODATA },
+  { "mpz_urandomb",    speed_mpz_urandomb,    FLAG_R_OPTIONAL | FLAG_NODATA },
+
+#ifdef SPEED_EXTRA_ROUTINES
+  SPEED_EXTRA_ROUTINES
+#endif
+#ifdef SPEED_EXTRA_ROUTINES2
+  SPEED_EXTRA_ROUTINES2
+#endif
+};
+
+
+struct choice_t {
+  const struct routine_t  *p;
+  mp_limb_t               r;
+  double                  scale;
+  double                  time;
+  int                     no_time;
+  double                  prev_time;
+  const char              *name;
+};
+struct choice_t  *choice;
+int  num_choices = 0;
+
+
+void
+data_fill (mp_ptr ptr, mp_size_t size)
+{
+  switch (option_data) {
+  case DATA_RANDOM:
+    mpn_random (ptr, size);
+    break;
+  case DATA_RANDOM2:
+    mpn_random2 (ptr, size);
+    break;
+  case DATA_ZEROS:
+    MPN_ZERO (ptr, size);
+    break;
+  case DATA_AAS:
+    MPN_FILL (ptr, size, GMP_NUMB_0xAA);
+    break;
+  case DATA_FFS:
+    MPN_FILL (ptr, size, GMP_NUMB_MAX);
+    break;
+  case DATA_2FD:
+    MPN_FILL (ptr, size, GMP_NUMB_MAX);
+    ptr[0] -= 2;
+    break;
+  default:
+    abort();
+    /*NOTREACHED*/
+  }
+}
+
+/* The code here handling the various combinations of output options isn't
+   too attractive, but it works and is fairly clean.  */
+
+#define SIZE_TO_DIVISOR(n)              \
+  (option_square == 1 ? (n)*(n)         \
+  : option_square == 2 ? (n)*((n)+1)/2  \
+  : (n))
+
+void
+run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size)
+{
+  const char  *first_open_fastest, *first_open_notfastest, *first_close;
+  int         i, fastest, want_data;
+  double      fastest_time;
+  TMP_DECL;
+
+  TMP_MARK;
+
+  /* allocate data, unless all routines are NODATA */
+  want_data = 0;
+  for (i = 0; i < num_choices; i++)
+    want_data |= ((choice[i].p->flag & FLAG_NODATA) == 0);
+
+  if (want_data)
+    {
+      SPEED_TMP_ALLOC_LIMBS (sp.xp, s->size, s->align_xp);
+      SPEED_TMP_ALLOC_LIMBS (sp.yp, s->size, s->align_yp);
+
+      data_fill (s->xp, s->size);
+      data_fill (s->yp, s->size);
+    }
+  else
+    {
+      sp.xp = NULL;
+      sp.yp = NULL;
+    }
+
+  if (prev_size == -1 && option_cmp == CMP_DIFFPREV)
+    {
+      first_open_fastest = "(#";
+      first_open_notfastest = " (";
+      first_close = ")";
+    }
+  else
+    {
+      first_open_fastest = "#";
+      first_open_notfastest = " ";
+      first_close = "";
+    }
+
+  fastest = -1;
+  fastest_time = -1.0;
+  for (i = 0; i < num_choices; i++)
+    {
+      s->r = choice[i].r;
+      choice[i].time = speed_measure (choice[i].p->fun, s);
+      choice[i].no_time = (choice[i].time == -1.0);
+      if (! choice[i].no_time)
+        choice[i].time *= choice[i].scale;
+
+      /* Apply the effect of CMP_DIFFPREV, but the new choice[i].prev_time
+         is before any differences.  */
+      {
+        double     t;
+        t = choice[i].time;
+        if (t != -1.0 && option_cmp == CMP_DIFFPREV && prev_size != -1)
+          {
+            if (choice[i].prev_time == -1.0)
+              choice[i].no_time = 1;
+            else
+              choice[i].time = choice[i].time - choice[i].prev_time;
+          }
+        choice[i].prev_time = t;
+      }
+
+      if (choice[i].no_time)
+        continue;
+
+      /* Look for the fastest after CMP_DIFFPREV has been applied, but
+         before CMP_RATIO or CMP_DIFFERENCE.  There's only a fastest shown
+         if there's more than one routine.  */
+      if (num_choices > 1 && (fastest == -1 || choice[i].time < fastest_time))
+        {
+          fastest = i;
+          fastest_time = choice[i].time;
+        }
+
+      if (option_cmp == CMP_DIFFPREV)
+        {
+          /* Conversion for UNIT_CYCLESPERLIMB differs in CMP_DIFFPREV. */
+          if (option_unit == UNIT_CYCLES)
+            choice[i].time /= speed_cycletime;
+          else if (option_unit == UNIT_CYCLESPERLIMB)
+            {
+              if (prev_size == -1)
+                choice[i].time /= speed_cycletime;
+              else
+                choice[i].time /=  (speed_cycletime
+                                    * (SIZE_TO_DIVISOR(s->size)
+                                       - SIZE_TO_DIVISOR(prev_size)));
+            }
+        }
+      else
+        {
+          if (option_unit == UNIT_CYCLES)
+            choice[i].time /= speed_cycletime;
+          else if (option_unit == UNIT_CYCLESPERLIMB)
+            choice[i].time /= (speed_cycletime * SIZE_TO_DIVISOR(s->size));
+
+          if (option_cmp == CMP_RATIO && i > 0)
+            {
+              /* A ratio isn't affected by the units chosen. */
+              if (choice[0].no_time || choice[0].time == 0.0)
+                choice[i].no_time = 1;
+              else
+                choice[i].time /= choice[0].time;
+            }
+          else if (option_cmp == CMP_DIFFERENCE && i > 0)
+            {
+              if (choice[0].no_time)
+                {
+                  choice[i].no_time = 1;
+                  continue;
+                }
+              choice[i].time -= choice[0].time;
+            }
+        }
+    }
+
+  if (option_gnuplot)
+    {
+      /* In CMP_DIFFPREV, don't print anything for the first size, start
+         with the second where an actual difference is available.
+
+         In CMP_RATIO, print the first column as 1.0.
+
+         The 9 decimals printed is much more than the expected precision of
+         the measurements actually. */
+
+      if (! (option_cmp == CMP_DIFFPREV && prev_size == -1))
+        {
+          fprintf (fp, "%-6ld ", s->size);
+          for (i = 0; i < num_choices; i++)
+            fprintf (fp, "  %.9e",
+                     choice[i].no_time ? 0.0
+                     : (option_cmp == CMP_RATIO && i == 0) ? 1.0
+                     : choice[i].time);
+          fprintf (fp, "\n");
+        }
+    }
+  else
+    {
+      fprintf (fp, "%-6ld ", s->size);
+      for (i = 0; i < num_choices; i++)
+        {
+          char  buf[128];
+          int   decimals;
+
+          if (choice[i].no_time)
+            {
+              fprintf (fp, " %*s", COLUMN_WIDTH, "n/a");
+            }
+          else
+            {if (option_unit == UNIT_CYCLESPERLIMB
+                 || (option_cmp == CMP_RATIO && i > 0))
+                decimals = 4;
+              else if (option_unit == UNIT_CYCLES)
+                decimals = 2;
+              else
+                decimals = 9;
+
+              sprintf (buf, "%s%.*f%s",
+                       i == fastest ? first_open_fastest : first_open_notfastest,
+                       decimals, choice[i].time, first_close);
+              fprintf (fp, " %*s", COLUMN_WIDTH, buf);
+            }
+        }
+      fprintf (fp, "\n");
+    }
+
+  TMP_FREE;
+}
+
+void
+run_all (FILE *fp)
+{
+  mp_size_t  prev_size;
+  int        i;
+  TMP_DECL;
+
+  TMP_MARK;
+  SPEED_TMP_ALLOC_LIMBS (sp.xp_block, SPEED_BLOCK_SIZE, sp.align_xp);
+  SPEED_TMP_ALLOC_LIMBS (sp.yp_block, SPEED_BLOCK_SIZE, sp.align_yp);
+
+  data_fill (sp.xp_block, SPEED_BLOCK_SIZE);
+  data_fill (sp.yp_block, SPEED_BLOCK_SIZE);
+
+  for (i = 0; i < size_num; i++)
+    {
+      sp.size = size_array[i].start;
+      prev_size = -1;
+      for (;;)
+        {
+          mp_size_t  step;
+
+          if (option_data == DATA_2FD && sp.size >= 2)
+            sp.xp[sp.size-1] = 2;
+
+          run_one (fp, &sp, prev_size);
+          prev_size = sp.size;
+
+          if (option_data == DATA_2FD && sp.size >= 2)
+            sp.xp[sp.size-1] = MP_LIMB_T_MAX;
+
+          if (option_factor != 0.0)
+            {
+              step = (mp_size_t) (sp.size * option_factor - sp.size);
+              if (step < 1)
+                step = 1;
+            }
+          else
+            step = 1;
+          if (step < option_step)
+            step = option_step;
+
+          sp.size += step;
+          if (sp.size > size_array[i].end)
+            break;
+        }
+    }
+
+  TMP_FREE;
+}
+
+
+FILE *
+fopen_for_write (const char *filename)
+{
+  FILE  *fp;
+  if ((fp = fopen (filename, "w")) == NULL)
+    {
+      fprintf (stderr, "Cannot create %s\n", filename);
+      exit(1);
+    }
+  return fp;
+}
+
+void
+fclose_written (FILE *fp, const char *filename)
+{
+  int  err;
+
+  err = ferror (fp);
+  err |= fclose (fp);
+
+  if (err)
+    {
+      fprintf (stderr, "Error writing %s\n", filename);
+      exit(1);
+    }
+}
+
+
+void
+run_gnuplot (int argc, char *argv[])
+{
+  char  *plot_filename;
+  char  *data_filename;
+  FILE  *fp;
+  int   i;
+
+  plot_filename = (char *) (*__gmp_allocate_func)
+    (strlen (option_gnuplot_basename) + 20);
+  data_filename = (char *) (*__gmp_allocate_func)
+    (strlen (option_gnuplot_basename) + 20);
+
+  sprintf (plot_filename, "%s.gnuplot", option_gnuplot_basename);
+  sprintf (data_filename, "%s.data",    option_gnuplot_basename);
+
+  fp = fopen_for_write (plot_filename);
+
+  fprintf (fp, "# Generated with:\n");
+  fprintf (fp, "#");
+  for (i = 0; i < argc; i++)
+    fprintf (fp, " %s", argv[i]);
+  fprintf (fp, "\n");
+  fprintf (fp, "\n");
+
+  fprintf (fp, "reset\n");
+
+  /* Putting the key at the top left is usually good, and you can change it
+     interactively if it's not. */
+  fprintf (fp, "set key left\n");
+
+  /* write underscores, not subscripts */
+  fprintf (fp, "set termoption noenhanced\n");
+
+  /* designed to make it possible to see crossovers easily */
+  fprintf (fp, "set style data lines\n");
+
+  fprintf (fp, "plot ");
+  for (i = 0; i < num_choices; i++)
+    {
+      fprintf (fp, " \"%s\" using 1:%d", data_filename, i+2);
+      fprintf (fp, " title \"%s\"", choice[i].name);
+
+      if (i != num_choices-1)
+        fprintf (fp, ", \\");
+      fprintf (fp, "\n");
+    }
+
+  fprintf (fp, "load \"-\"\n");
+  fclose_written (fp, plot_filename);
+
+  fp = fopen_for_write (data_filename);
+
+  /* Unbuffered so you can see where the program was up to if it crashes or
+     you kill it. */
+  setbuf (fp, NULL);
+
+  run_all (fp);
+  fclose_written (fp, data_filename);
+}
+
+
+/* Return a limb with n many one bits (starting from the least significant) */
+
+#define LIMB_ONES(n) \
+  ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX      \
+    : (n) == 0 ? CNST_LIMB(0)                   \
+    : (CNST_LIMB(1) << (n)) - 1)
+
+mp_limb_t
+r_string (const char *s)
+{
+  const char  *s_orig = s;
+  long        n;
+
+  if (strcmp (s, "aas") == 0)
+    return GMP_NUMB_0xAA;
+
+  {
+    mpz_t      z;
+    mp_limb_t  l;
+    int        set, siz;
+
+    mpz_init (z);
+    set = mpz_set_str (z, s, 0);
+    siz = SIZ(z);
+    l = (siz == 0 ? 0 : siz > 0 ? PTR(z)[0] : -PTR(z)[0]);
+    mpz_clear (z);
+    if (set == 0)
+      {
+        if (siz > 1 || siz < -1)
+          printf ("Warning, r parameter %s truncated to %d bits\n",
+                  s_orig, GMP_LIMB_BITS);
+        return l;
+      }
+  }
+
+  if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X'))
+    n = strtoul (s+2, (char **) &s, 16);
+  else
+    n = strtol (s, (char **) &s, 10);
+
+  if (strcmp (s, "bits") == 0)
+    {
+      mp_limb_t  l;
+      if (n > GMP_LIMB_BITS)
+        {
+          fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
+                   n, GMP_LIMB_BITS);
+          exit (1);
+        }
+      mpn_random (&l, 1);
+      return (l | (CNST_LIMB(1) << (n-1))) & LIMB_ONES(n);
+    }
+  else  if (strcmp (s, "ones") == 0)
+    {
+      if (n > GMP_LIMB_BITS)
+        {
+          fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
+                   n, GMP_LIMB_BITS);
+          exit (1);
+        }
+      return LIMB_ONES (n);
+    }
+  else if (*s != '\0')
+    {
+      fprintf (stderr, "invalid r parameter: %s\n", s_orig);
+      exit (1);
+    }
+
+  return n;
+}
+
+
+void
+routine_find (struct choice_t *c, const char *s_orig)
+{
+  const char  *s;
+  int     i;
+  size_t  nlen;
+
+  c->name = s_orig;
+  s = strchr (s_orig, '*');
+  if (s != NULL)
+    {
+      c->scale = atof(s_orig);
+      s++;
+    }
+  else
+    {
+      c->scale = 1.0;
+      s = s_orig;
+    }
+
+  for (i = 0; i < numberof (routine); i++)
+    {
+      nlen = strlen (routine[i].name);
+      if (memcmp (s, routine[i].name, nlen) != 0)
+        continue;
+
+      if (s[nlen] == '.')
+        {
+          /* match, with a .r parameter */
+
+          if (! (routine[i].flag & (FLAG_R|FLAG_R_OPTIONAL)))
+            {
+              fprintf (stderr,
+                       "Choice %s bad: doesn't take a \".<r>\" parameter\n",
+                       s_orig);
+              exit (1);
+            }
+
+          c->p = &routine[i];
+          c->r = r_string (s + nlen + 1);
+          return;
+        }
+
+      if (s[nlen] == '\0')
+        {
+          /* match, with no parameter */
+
+          if (routine[i].flag & FLAG_R)
+            {
+              fprintf (stderr,
+                       "Choice %s bad: needs a \".<r>\" parameter\n",
+                       s_orig);
+              exit (1);
+            }
+
+          c->p = &routine[i];
+          c->r = 0;
+          return;
+        }
+    }
+
+  fprintf (stderr, "Choice %s unrecognised\n", s_orig);
+  exit (1);
+}
+
+
+void
+usage (void)
+{
+  int  i;
+
+  speed_time_init ();
+
+  printf ("Usage: speed [-options] -s size <routine>...\n");
+  printf ("Measure the speed of some routines.\n");
+  printf ("Times are in seconds, accuracy is shown.\n");
+  printf ("\n");
+  printf ("   -p num     set precision as number of time units each routine must run\n");
+  printf ("   -s size[-end][,size[-end]]...   sizes to measure\n");
+  printf ("              single sizes or ranges, sep with comma or use multiple -s\n");
+  printf ("   -t step    step through sizes by given amount\n");
+  printf ("   -f factor  step through sizes by given factor (eg. 1.05)\n");
+  printf ("   -r         show times as ratios of the first routine\n");
+  printf ("   -d         show times as difference from the first routine\n");
+  printf ("   -D         show times as difference from previous size shown\n");
+  printf ("   -c         show times in CPU cycles\n");
+  printf ("   -C         show times in cycles per limb\n");
+  printf ("   -u         print resource usage (memory) at end\n");
+  printf ("   -P name    output plot files \"name.gnuplot\" and \"name.data\"\n");
+  printf ("   -a <type>  use given data: random(default), random2, zeros, aas, ffs, 2fd\n");
+  printf ("   -x, -y, -w, -W <align>  specify data alignments, sources and dests\n");
+  printf ("   -o addrs   print addresses of data blocks\n");
+  printf ("\n");
+  printf ("If both -t and -f are used, it means step by the factor or the step, whichever\n");
+  printf ("is greater.\n");
+  printf ("If both -C and -D are used, it means cycles per however many limbs between a\n");
+  printf ("size and the previous size.\n");
+  printf ("\n");
+  printf ("After running with -P, plots can be viewed with Gnuplot or Quickplot.\n");
+  printf ("\"gnuplot name.gnuplot\" (use \"set logscale xy; replot\" at the prompt for\n");
+  printf ("a log/log plot).\n");
+  printf ("\"quickplot -s name.data\" (has interactive zooming, and note -s is important\n");
+  printf ("when viewing more than one routine, it means same axis scales for all data).\n");
+  printf ("\n");
+  printf ("The available routines are as follows.\n");
+  printf ("\n");
+
+  for (i = 0; i < numberof (routine); i++)
+    {
+      if (routine[i].flag & FLAG_R)
+        printf ("\t%s.r\n", routine[i].name);
+      else if (routine[i].flag & FLAG_R_OPTIONAL)
+        printf ("\t%s (optional .r)\n", routine[i].name);
+      else
+        printf ("\t%s\n", routine[i].name);
+    }
+  printf ("\n");
+  printf ("Routines with a \".r\" need an extra parameter, for example mpn_lshift.6\n");
+  printf ("r should be in decimal, or use 0xN for hexadecimal.\n");
+  printf ("\n");
+  printf ("Special forms for r are \"<N>bits\" for a random N bit number, \"<N>ones\" for\n");
+  printf ("N one bits, or \"aas\" for 0xAA..AA.\n");
+  printf ("\n");
+  printf ("Times for sizes out of the range accepted by a routine are shown as 0.\n");
+  printf ("The fastest routine at each size is marked with a # (free form output only).\n");
+  printf ("\n");
+  printf ("%s", speed_time_string);
+  printf ("\n");
+  printf ("Gnuplot home page http://www.gnuplot.info/\n");
+  printf ("Quickplot home page http://quickplot.sourceforge.net/\n");
+}
+
+void
+check_align_option (const char *name, mp_size_t align)
+{
+  if (align < 0 || align > SPEED_TMP_ALLOC_ADJUST_MASK)
+    {
+      fprintf (stderr, "Alignment request out of range: %s %ld\n",
+               name, (long) align);
+      fprintf (stderr, "  should be 0 to %d (limbs), inclusive\n",
+               SPEED_TMP_ALLOC_ADJUST_MASK);
+      exit (1);
+    }
+}
+
+int
+main (int argc, char *argv[])
+{
+  int  i;
+  int  opt;
+
+  /* Unbuffered so output goes straight out when directed to a pipe or file
+     and isn't lost on killing the program half way.  */
+  setbuf (stdout, NULL);
+
+  for (;;)
+    {
+      opt = getopt(argc, argv, "a:CcDdEFf:o:p:P:rRs:t:ux:y:w:W:z");
+      if (opt == EOF)
+        break;
+
+      switch (opt) {
+      case 'a':
+        if (strcmp (optarg, "random") == 0)       option_data = DATA_RANDOM;
+        else if (strcmp (optarg, "random2") == 0) option_data = DATA_RANDOM2;
+        else if (strcmp (optarg, "zeros") == 0)   option_data = DATA_ZEROS;
+        else if (strcmp (optarg, "aas") == 0)     option_data = DATA_AAS;
+        else if (strcmp (optarg, "ffs") == 0)     option_data = DATA_FFS;
+        else if (strcmp (optarg, "2fd") == 0)     option_data = DATA_2FD;
+        else
+          {
+            fprintf (stderr, "unrecognised data option: %s\n", optarg);
+            exit (1);
+          }
+        break;
+      case 'C':
+        if (option_unit  != UNIT_SECONDS) goto bad_unit;
+        option_unit = UNIT_CYCLESPERLIMB;
+        break;
+      case 'c':
+        if (option_unit != UNIT_SECONDS)
+          {
+          bad_unit:
+            fprintf (stderr, "cannot use more than one of -c, -C\n");
+            exit (1);
+          }
+        option_unit = UNIT_CYCLES;
+        break;
+      case 'D':
+        if (option_cmp != CMP_ABSOLUTE) goto bad_cmp;
+        option_cmp = CMP_DIFFPREV;
+        break;
+      case 'd':
+        if (option_cmp != CMP_ABSOLUTE)
+          {
+          bad_cmp:
+            fprintf (stderr, "cannot use more than one of -d, -D, -r\n");
+            exit (1);
+          }
+        option_cmp = CMP_DIFFERENCE;
+        break;
+      case 'E':
+        option_square = 1;
+        break;
+      case 'F':
+        option_square = 2;
+        break;
+      case 'f':
+        option_factor = atof (optarg);
+        if (option_factor <= 1.0)
+          {
+            fprintf (stderr, "-f factor must be > 1.0\n");
+            exit (1);
+          }
+        break;
+      case 'o':
+        speed_option_set (optarg);
+        break;
+      case 'P':
+        option_gnuplot = 1;
+        option_gnuplot_basename = optarg;
+        break;
+      case 'p':
+        speed_precision = atoi (optarg);
+        break;
+      case 'R':
+        option_seed = time (NULL);
+        break;
+      case 'r':
+        if (option_cmp != CMP_ABSOLUTE)
+          goto bad_cmp;
+        option_cmp = CMP_RATIO;
+        break;
+      case 's':
+        {
+          char  *s;
+          for (s = strtok (optarg, ","); s != NULL; s = strtok (NULL, ","))
+            {
+              if (size_num == size_allocnum)
+                {
+                  size_array = (struct size_array_t *)
+                    __gmp_allocate_or_reallocate
+                    (size_array,
+                     size_allocnum * sizeof(size_array[0]),
+                     (size_allocnum+10) * sizeof(size_array[0]));
+                  size_allocnum += 10;
+                }
+              if (sscanf (s, "%ld-%ld",
+                          &size_array[size_num].start,
+                          &size_array[size_num].end) != 2)
+                {
+                  size_array[size_num].start = size_array[size_num].end
+                    = atol (s);
+                }
+
+              if (size_array[size_num].start < 0
+                  || size_array[size_num].end < 0
+                  || size_array[size_num].start > size_array[size_num].end)
+                {
+                  fprintf (stderr, "invalid size parameter: %s\n", s);
+                  exit (1);
+                }
+
+              size_num++;
+            }
+        }
+        break;
+      case 't':
+        option_step = atol (optarg);
+        if (option_step < 1)
+          {
+            fprintf (stderr, "-t step must be >= 1\n");
+            exit (1);
+          }
+        break;
+      case 'u':
+        option_resource_usage = 1;
+        break;
+      case 'z':
+        sp.cache = 1;
+        break;
+      case 'x':
+        sp.align_xp = atol (optarg);
+        check_align_option ("-x", sp.align_xp);
+        break;
+      case 'y':
+        sp.align_yp = atol (optarg);
+        check_align_option ("-y", sp.align_yp);
+        break;
+      case 'w':
+        sp.align_wp = atol (optarg);
+        check_align_option ("-w", sp.align_wp);
+        break;
+      case 'W':
+        sp.align_wp2 = atol (optarg);
+        check_align_option ("-W", sp.align_wp2);
+        break;
+      case '?':
+        exit(1);
+      }
+    }
+
+  if (optind >= argc)
+    {
+      usage ();
+      exit (1);
+    }
+
+  if (size_num == 0)
+    {
+      fprintf (stderr, "-s <size> must be specified\n");
+      exit (1);
+    }
+
+  gmp_randinit_default (__gmp_rands);
+  __gmp_rands_initialized = 1;
+  gmp_randseed_ui (__gmp_rands, option_seed);
+
+  choice = (struct choice_t *) (*__gmp_allocate_func)
+    ((argc - optind) * sizeof(choice[0]));
+  for ( ; optind < argc; optind++)
+    {
+      struct choice_t  c;
+      routine_find (&c, argv[optind]);
+      choice[num_choices] = c;
+      num_choices++;
+    }
+
+  if ((option_cmp == CMP_RATIO || option_cmp == CMP_DIFFERENCE) &&
+      num_choices < 2)
+    {
+      fprintf (stderr, "WARNING, -d or -r does nothing when only one routine requested\n");
+    }
+
+  speed_time_init ();
+  if (option_unit == UNIT_CYCLES || option_unit == UNIT_CYCLESPERLIMB)
+    speed_cycletime_need_cycles ();
+  else
+    speed_cycletime_need_seconds ();
+
+  if (option_gnuplot)
+    {
+      run_gnuplot (argc, argv);
+    }
+  else
+    {
+      if (option_unit == UNIT_SECONDS)
+        printf ("overhead %.9f secs", speed_measure (speed_noop, NULL));
+      else
+        printf ("overhead %.2f cycles",
+                speed_measure (speed_noop, NULL) / speed_cycletime);
+      printf (", precision %d units of %.2e secs",
+              speed_precision, speed_unittime);
+
+      if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
+        printf (", CPU freq unknown\n");
+      else
+        printf (", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
+
+      printf ("       ");
+      for (i = 0; i < num_choices; i++)
+        printf (" %*s", COLUMN_WIDTH, choice[i].name);
+      printf ("\n");
+
+      run_all (stdout);
+    }
+
+  if (option_resource_usage)
+    {
+#if HAVE_GETRUSAGE
+      {
+        /* This doesn't give data sizes on linux 2.0.x, only utime. */
+        struct rusage  r;
+        if (getrusage (RUSAGE_SELF, &r) != 0)
+          perror ("getrusage");
+        else
+          printf ("getrusage(): utime %ld.%06ld data %ld stack %ld maxresident %ld\n",
+                  r.ru_utime.tv_sec, r.ru_utime.tv_usec,
+                  r.ru_idrss, r.ru_isrss, r.ru_ixrss);
+      }
+#else
+      printf ("getrusage() not available\n");
+#endif
+
+      /* Linux kernel. */
+      {
+        char  buf[128];
+        sprintf (buf, "/proc/%d/status", getpid());
+        if (access (buf, R_OK) == 0)
+          {
+            sprintf (buf, "cat /proc/%d/status", getpid());
+            system (buf);
+          }
+
+      }
+    }
+
+  return 0;
+}

diff --git a/third_party/gmp/tune/speed.h b/third_party/gmp/tune/speed.h
new file mode 100644
index 0000000..f23bacb
--- /dev/null
+++ b/third_party/gmp/tune/speed.h

@@ -0,0 +1,3800 @@
+/* Header for speed and threshold things.
+
+Copyright 1999-2003, 2005, 2006, 2008-2017, 2019 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#ifndef __SPEED_H__
+#define __SPEED_H__
+
+
+/* Pad ptr,oldsize with zero limbs (at the most significant end) to make it
+   newsize long. */
+#define MPN_ZERO_EXTEND(ptr, oldsize, newsize)		\
+  do {							\
+    ASSERT ((newsize) >= (oldsize));			\
+    MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize));	\
+  } while (0)
+
+/* A mask of the least significant n bits.  Note 1<<32 doesn't give zero on
+   x86 family CPUs, hence the separate case for GMP_LIMB_BITS. */
+#define MP_LIMB_T_LOWBITMASK(n)	\
+  ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)
+
+
+/* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */
+
+#define TMP_ALLOC_ALIGNED(bytes, align)	\
+  align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
+#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align)	\
+  ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
+
+/* CACHE_LINE_SIZE is our default alignment for speed operands, and the
+   limit on what s->align_xp etc and then request for off-alignment.  Maybe
+   this should be an option of some sort, but in any case here are some line
+   sizes,
+
+       bytes
+	 32   pentium
+	 64   athlon
+	 64   itanium-2 L1
+	128   itanium-2 L2
+*/
+#define CACHE_LINE_SIZE   64 /* bytes */
+
+#define SPEED_TMP_ALLOC_ADJUST_MASK  (CACHE_LINE_SIZE/GMP_LIMB_BYTES - 1)
+
+/* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb
+   alignment.  */
+#define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align)			\
+  do {									\
+    mp_ptr     __ptr;							\
+    mp_size_t  __ptr_align, __ptr_add;					\
+									\
+    ASSERT ((CACHE_LINE_SIZE % GMP_LIMB_BYTES) == 0);		\
+    __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK);	\
+    __ptr_align = (__ptr - (mp_ptr) NULL);				\
+    __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK;	\
+    (ptr) = __ptr + __ptr_add;						\
+  } while (0)
+
+
+/* This is the size for s->xp_block and s->yp_block, used in certain
+   routines that want to run across many different data values and use
+   s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
+
+   512 means 2kbytes of data for each of xp_block and yp_block, making 4k
+   total, which should fit easily in any L1 data cache. */
+
+#define SPEED_BLOCK_SIZE   512 /* limbs */
+
+
+extern double  speed_unittime;
+extern double  speed_cycletime;
+extern int     speed_precision;
+extern char    speed_time_string[];
+void speed_time_init (void);
+void speed_cycletime_fail (const char *str);
+void speed_cycletime_init (void);
+void speed_cycletime_need_cycles (void);
+void speed_cycletime_need_seconds (void);
+void speed_starttime (void);
+double speed_endtime (void);
+
+
+struct speed_params {
+  unsigned   reps;	/* how many times to run the routine */
+  mp_ptr     xp;	/* first argument */
+  mp_ptr     yp;	/* second argument */
+  mp_size_t  size;	/* size of both arguments */
+  mp_limb_t  r;		/* user supplied parameter */
+  mp_size_t  align_xp;	/* alignment of xp */
+  mp_size_t  align_yp;	/* alignment of yp */
+  mp_size_t  align_wp;	/* intended alignment of wp */
+  mp_size_t  align_wp2; /* intended alignment of wp2 */
+  mp_ptr     xp_block;	/* first special SPEED_BLOCK_SIZE block */
+  mp_ptr     yp_block;	/* second special SPEED_BLOCK_SIZE block */
+
+  double     time_divisor; /* optionally set by the speed routine */
+
+  /* used by the cache priming things */
+  int	     cache;
+  unsigned   src_num, dst_num;
+  struct {
+    mp_ptr    ptr;
+    mp_size_t size;
+  } src[5], dst[4];
+};
+
+typedef double (*speed_function_t) (struct speed_params *);
+
+double speed_measure (speed_function_t fun, struct speed_params *);
+
+/* Prototypes for speed measuring routines */
+
+double speed_back_to_back (struct speed_params *);
+double speed_count_leading_zeros (struct speed_params *);
+double speed_count_trailing_zeros (struct speed_params *);
+double speed_find_a (struct speed_params *);
+double speed_gmp_allocate_free (struct speed_params *);
+double speed_gmp_allocate_reallocate_free (struct speed_params *);
+double speed_invert_limb (struct speed_params *);
+double speed_malloc_free (struct speed_params *);
+double speed_malloc_realloc_free (struct speed_params *);
+double speed_memcpy (struct speed_params *);
+double speed_binvert_limb (struct speed_params *);
+double speed_binvert_limb_mul1 (struct speed_params *);
+double speed_binvert_limb_loop (struct speed_params *);
+double speed_binvert_limb_cond (struct speed_params *);
+double speed_binvert_limb_arith (struct speed_params *);
+
+double speed_mpf_init_clear (struct speed_params *);
+
+double speed_mpn_add_n (struct speed_params *);
+double speed_mpn_add_1 (struct speed_params *);
+double speed_mpn_add_1_inplace (struct speed_params *);
+double speed_mpn_add_err1_n (struct speed_params *);
+double speed_mpn_add_err2_n (struct speed_params *);
+double speed_mpn_add_err3_n (struct speed_params *);
+double speed_mpn_addlsh_n (struct speed_params *);
+double speed_mpn_addlsh1_n (struct speed_params *);
+double speed_mpn_addlsh2_n (struct speed_params *);
+double speed_mpn_addlsh_n_ip1 (struct speed_params *);
+double speed_mpn_addlsh1_n_ip1 (struct speed_params *);
+double speed_mpn_addlsh2_n_ip1 (struct speed_params *);
+double speed_mpn_addlsh_n_ip2 (struct speed_params *);
+double speed_mpn_addlsh1_n_ip2 (struct speed_params *);
+double speed_mpn_addlsh2_n_ip2 (struct speed_params *);
+double speed_mpn_add_n_sub_n (struct speed_params *);
+double speed_mpn_and_n (struct speed_params *);
+double speed_mpn_andn_n (struct speed_params *);
+double speed_mpn_addmul_1 (struct speed_params *);
+double speed_mpn_addmul_2 (struct speed_params *);
+double speed_mpn_addmul_3 (struct speed_params *);
+double speed_mpn_addmul_4 (struct speed_params *);
+double speed_mpn_addmul_5 (struct speed_params *);
+double speed_mpn_addmul_6 (struct speed_params *);
+double speed_mpn_addmul_7 (struct speed_params *);
+double speed_mpn_addmul_8 (struct speed_params *);
+double speed_mpn_cnd_add_n (struct speed_params *);
+double speed_mpn_cnd_sub_n (struct speed_params *);
+double speed_mpn_com (struct speed_params *);
+double speed_mpn_neg (struct speed_params *);
+double speed_mpn_copyd (struct speed_params *);
+double speed_mpn_copyi (struct speed_params *);
+double speed_MPN_COPY (struct speed_params *);
+double speed_MPN_COPY_DECR (struct speed_params *);
+double speed_MPN_COPY_INCR (struct speed_params *);
+double speed_mpn_sec_tabselect (struct speed_params *);
+double speed_mpn_divexact_1 (struct speed_params *);
+double speed_mpn_divexact_by3 (struct speed_params *);
+double speed_mpn_bdiv_q_1 (struct speed_params *);
+double speed_mpn_pi1_bdiv_q_1 (struct speed_params *);
+double speed_mpn_bdiv_dbm1c (struct speed_params *);
+double speed_mpn_divrem_1 (struct speed_params *);
+double speed_mpn_divrem_1f (struct speed_params *);
+double speed_mpn_divrem_1c (struct speed_params *);
+double speed_mpn_divrem_1cf (struct speed_params *);
+double speed_mpn_divrem_1_div (struct speed_params *);
+double speed_mpn_divrem_1f_div (struct speed_params *);
+double speed_mpn_divrem_1_inv (struct speed_params *);
+double speed_mpn_divrem_1f_inv (struct speed_params *);
+double speed_mpn_divrem_2 (struct speed_params *);
+double speed_mpn_divrem_2_div (struct speed_params *);
+double speed_mpn_divrem_2_inv (struct speed_params *);
+double speed_mpn_div_qr_1n_pi1 (struct speed_params *);
+double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *);
+double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *);
+double speed_mpn_div_qr_1 (struct speed_params *);
+double speed_mpn_div_qr_2n (struct speed_params *);
+double speed_mpn_div_qr_2u (struct speed_params *);
+double speed_mpn_fib2_ui (struct speed_params *);
+double speed_mpn_matrix22_mul (struct speed_params *);
+double speed_mpn_hgcd2 (struct speed_params *);
+double speed_mpn_hgcd2_1 (struct speed_params *);
+double speed_mpn_hgcd2_2 (struct speed_params *);
+double speed_mpn_hgcd2_3 (struct speed_params *);
+double speed_mpn_hgcd2_4 (struct speed_params *);
+double speed_mpn_hgcd2_5 (struct speed_params *);
+double speed_mpn_hgcd (struct speed_params *);
+double speed_mpn_hgcd_lehmer (struct speed_params *);
+double speed_mpn_hgcd_appr (struct speed_params *);
+double speed_mpn_hgcd_appr_lehmer (struct speed_params *);
+double speed_mpn_hgcd_reduce (struct speed_params *);
+double speed_mpn_hgcd_reduce_1 (struct speed_params *);
+double speed_mpn_hgcd_reduce_2 (struct speed_params *);
+double speed_mpn_gcd (struct speed_params *);
+double speed_mpn_gcd_1 (struct speed_params *);
+double speed_mpn_gcd_11 (struct speed_params *);
+double speed_mpn_gcd_1N (struct speed_params *);
+double speed_mpn_gcd_22 (struct speed_params *);
+double speed_mpn_gcdext (struct speed_params *);
+double speed_mpn_gcdext_double (struct speed_params *);
+double speed_mpn_gcdext_one_double (struct speed_params *);
+double speed_mpn_gcdext_one_single (struct speed_params *);
+double speed_mpn_gcdext_single (struct speed_params *);
+double speed_mpn_get_str (struct speed_params *);
+double speed_mpn_hamdist (struct speed_params *);
+double speed_mpn_ior_n (struct speed_params *);
+double speed_mpn_iorn_n (struct speed_params *);
+double speed_mpn_jacobi_base (struct speed_params *);
+double speed_mpn_jacobi_base_1 (struct speed_params *);
+double speed_mpn_jacobi_base_2 (struct speed_params *);
+double speed_mpn_jacobi_base_3 (struct speed_params *);
+double speed_mpn_jacobi_base_4 (struct speed_params *);
+double speed_mpn_lshift (struct speed_params *);
+double speed_mpn_lshiftc (struct speed_params *);
+double speed_mpn_mod_1 (struct speed_params *);
+double speed_mpn_mod_1c (struct speed_params *);
+double speed_mpn_mod_1_div (struct speed_params *);
+double speed_mpn_mod_1_inv (struct speed_params *);
+double speed_mpn_mod_1_1 (struct speed_params *);
+double speed_mpn_mod_1_1_1 (struct speed_params *);
+double speed_mpn_mod_1_1_2 (struct speed_params *);
+double speed_mpn_mod_1_2 (struct speed_params *);
+double speed_mpn_mod_1_3 (struct speed_params *);
+double speed_mpn_mod_1_4 (struct speed_params *);
+double speed_mpn_mod_34lsub1 (struct speed_params *);
+double speed_mpn_modexact_1_odd (struct speed_params *);
+double speed_mpn_modexact_1c_odd (struct speed_params *);
+double speed_mpn_mul_1 (struct speed_params *);
+double speed_mpn_mul_1_inplace (struct speed_params *);
+double speed_mpn_mul_2 (struct speed_params *);
+double speed_mpn_mul_3 (struct speed_params *);
+double speed_mpn_mul_4 (struct speed_params *);
+double speed_mpn_mul_5 (struct speed_params *);
+double speed_mpn_mul_6 (struct speed_params *);
+double speed_mpn_mul (struct speed_params *);
+double speed_mpn_mul_basecase (struct speed_params *);
+double speed_mpn_mulmid (struct speed_params *);
+double speed_mpn_mulmid_basecase (struct speed_params *);
+double speed_mpn_mul_fft (struct speed_params *);
+double speed_mpn_mul_fft_sqr (struct speed_params *);
+double speed_mpn_fft_mul (struct speed_params *);
+double speed_mpn_fft_sqr (struct speed_params *);
+#if WANT_OLD_FFT_FULL
+double speed_mpn_mul_fft_full (struct speed_params *);
+double speed_mpn_mul_fft_full_sqr (struct speed_params *);
+#endif
+double speed_mpn_nussbaumer_mul (struct speed_params *);
+double speed_mpn_nussbaumer_mul_sqr (struct speed_params *);
+double speed_mpn_mul_n (struct speed_params *);
+double speed_mpn_mul_n_sqr (struct speed_params *);
+double speed_mpn_mulmid_n (struct speed_params *);
+double speed_mpn_sqrlo (struct speed_params *);
+double speed_mpn_sqrlo_basecase (struct speed_params *);
+double speed_mpn_mullo_n (struct speed_params *);
+double speed_mpn_mullo_basecase (struct speed_params *);
+double speed_mpn_nand_n (struct speed_params *);
+double speed_mpn_nior_n (struct speed_params *);
+double speed_mpn_popcount (struct speed_params *);
+double speed_mpn_preinv_divrem_1 (struct speed_params *);
+double speed_mpn_preinv_divrem_1f (struct speed_params *);
+double speed_mpn_preinv_mod_1 (struct speed_params *);
+double speed_mpn_sbpi1_div_qr (struct speed_params *);
+double speed_mpn_dcpi1_div_qr (struct speed_params *);
+double speed_mpn_sbpi1_divappr_q (struct speed_params *);
+double speed_mpn_dcpi1_divappr_q (struct speed_params *);
+double speed_mpn_mu_div_qr (struct speed_params *);
+double speed_mpn_mu_divappr_q (struct speed_params *);
+double speed_mpn_mupi_div_qr (struct speed_params *);
+double speed_mpn_mu_div_q (struct speed_params *);
+double speed_mpn_sbpi1_bdiv_qr (struct speed_params *);
+double speed_mpn_dcpi1_bdiv_qr (struct speed_params *);
+double speed_mpn_sbpi1_bdiv_q (struct speed_params *);
+double speed_mpn_dcpi1_bdiv_q (struct speed_params *);
+double speed_mpn_sbpi1_bdiv_r (struct speed_params *);
+double speed_mpn_mu_bdiv_q (struct speed_params *);
+double speed_mpn_mu_bdiv_qr (struct speed_params *);
+double speed_mpn_broot (struct speed_params *);
+double speed_mpn_broot_invm1 (struct speed_params *);
+double speed_mpn_brootinv (struct speed_params *);
+double speed_mpn_invert (struct speed_params *);
+double speed_mpn_invertappr (struct speed_params *);
+double speed_mpn_ni_invertappr (struct speed_params *);
+double speed_mpn_sec_invert (struct speed_params *s);
+double speed_mpn_binvert (struct speed_params *);
+double speed_mpn_redc_1 (struct speed_params *);
+double speed_mpn_redc_2 (struct speed_params *);
+double speed_mpn_redc_n (struct speed_params *);
+double speed_mpn_rsblsh_n (struct speed_params *);
+double speed_mpn_rsblsh1_n (struct speed_params *);
+double speed_mpn_rsblsh2_n (struct speed_params *);
+double speed_mpn_rsh1add_n (struct speed_params *);
+double speed_mpn_rsh1sub_n (struct speed_params *);
+double speed_mpn_rshift (struct speed_params *);
+double speed_mpn_sb_divrem_m3 (struct speed_params *);
+double speed_mpn_sb_divrem_m3_div (struct speed_params *);
+double speed_mpn_sb_divrem_m3_inv (struct speed_params *);
+double speed_mpn_set_str (struct speed_params *);
+double speed_mpn_bc_set_str (struct speed_params *);
+double speed_mpn_dc_set_str (struct speed_params *);
+double speed_mpn_set_str_pre (struct speed_params *);
+double speed_mpn_sqr_basecase (struct speed_params *);
+double speed_mpn_sqr_diag_addlsh1 (struct speed_params *);
+double speed_mpn_sqr_diagonal (struct speed_params *);
+double speed_mpn_sqr (struct speed_params *);
+double speed_mpn_sqrtrem (struct speed_params *);
+double speed_mpn_rootrem (struct speed_params *);
+double speed_mpn_sqrt (struct speed_params *);
+double speed_mpn_root (struct speed_params *);
+double speed_mpn_perfect_power_p (struct speed_params *);
+double speed_mpn_perfect_square_p (struct speed_params *);
+double speed_mpn_sub_n (struct speed_params *);
+double speed_mpn_sub_1 (struct speed_params *);
+double speed_mpn_sub_1_inplace (struct speed_params *);
+double speed_mpn_sub_err1_n (struct speed_params *);
+double speed_mpn_sub_err2_n (struct speed_params *);
+double speed_mpn_sub_err3_n (struct speed_params *);
+double speed_mpn_sublsh_n (struct speed_params *);
+double speed_mpn_sublsh1_n (struct speed_params *);
+double speed_mpn_sublsh2_n (struct speed_params *);
+double speed_mpn_sublsh_n_ip1 (struct speed_params *);
+double speed_mpn_sublsh1_n_ip1 (struct speed_params *);
+double speed_mpn_sublsh2_n_ip1 (struct speed_params *);
+double speed_mpn_submul_1 (struct speed_params *);
+double speed_mpn_toom2_sqr (struct speed_params *);
+double speed_mpn_toom3_sqr (struct speed_params *);
+double speed_mpn_toom4_sqr (struct speed_params *);
+double speed_mpn_toom6_sqr (struct speed_params *);
+double speed_mpn_toom8_sqr (struct speed_params *);
+double speed_mpn_toom22_mul (struct speed_params *);
+double speed_mpn_toom33_mul (struct speed_params *);
+double speed_mpn_toom44_mul (struct speed_params *);
+double speed_mpn_toom6h_mul (struct speed_params *);
+double speed_mpn_toom8h_mul (struct speed_params *);
+double speed_mpn_toom32_mul (struct speed_params *);
+double speed_mpn_toom42_mul (struct speed_params *);
+double speed_mpn_toom43_mul (struct speed_params *);
+double speed_mpn_toom63_mul (struct speed_params *);
+double speed_mpn_toom32_for_toom43_mul (struct speed_params *);
+double speed_mpn_toom43_for_toom32_mul (struct speed_params *);
+double speed_mpn_toom32_for_toom53_mul (struct speed_params *);
+double speed_mpn_toom53_for_toom32_mul (struct speed_params *);
+double speed_mpn_toom42_for_toom53_mul (struct speed_params *);
+double speed_mpn_toom53_for_toom42_mul (struct speed_params *);
+double speed_mpn_toom43_for_toom54_mul (struct speed_params *);
+double speed_mpn_toom54_for_toom43_mul (struct speed_params *);
+double speed_mpn_toom42_mulmid (struct speed_params *);
+double speed_mpn_mulmod_bnm1 (struct speed_params *);
+double speed_mpn_bc_mulmod_bnm1 (struct speed_params *);
+double speed_mpn_mulmod_bnm1_rounded (struct speed_params *);
+double speed_mpn_sqrmod_bnm1 (struct speed_params *);
+double speed_mpn_udiv_qrnnd (struct speed_params *);
+double speed_mpn_udiv_qrnnd_r (struct speed_params *);
+double speed_mpn_umul_ppmm (struct speed_params *);
+double speed_mpn_umul_ppmm_r (struct speed_params *);
+double speed_mpn_xnor_n (struct speed_params *);
+double speed_mpn_xor_n (struct speed_params *);
+double speed_MPN_ZERO (struct speed_params *);
+
+double speed_mpq_init_clear (struct speed_params *);
+
+double speed_mpz_add (struct speed_params *);
+double speed_mpz_invert (struct speed_params *);
+double speed_mpz_bin_uiui (struct speed_params *);
+double speed_mpz_bin_ui (struct speed_params *);
+double speed_mpz_fac_ui (struct speed_params *);
+double speed_mpz_2fac_ui (struct speed_params *);
+double speed_mpz_mfac_uiui (struct speed_params *);
+double speed_mpz_primorial_ui (struct speed_params *);
+double speed_mpz_fib_ui (struct speed_params *);
+double speed_mpz_fib2_ui (struct speed_params *);
+double speed_mpz_init_clear (struct speed_params *);
+double speed_mpz_init_realloc_clear (struct speed_params *);
+double speed_mpz_nextprime (struct speed_params *);
+double speed_mpz_jacobi (struct speed_params *);
+double speed_mpz_lucnum_ui (struct speed_params *);
+double speed_mpz_lucnum2_ui (struct speed_params *);
+double speed_mpz_mod (struct speed_params *);
+double speed_mpz_powm (struct speed_params *);
+double speed_mpz_powm_mod (struct speed_params *);
+double speed_mpz_powm_redc (struct speed_params *);
+double speed_mpz_powm_sec (struct speed_params *);
+double speed_mpz_powm_ui (struct speed_params *);
+double speed_mpz_urandomb (struct speed_params *);
+
+double speed_gmp_randseed (struct speed_params *);
+double speed_gmp_randseed_ui (struct speed_params *);
+
+double speed_noop (struct speed_params *);
+double speed_noop_wxs (struct speed_params *);
+double speed_noop_wxys (struct speed_params *);
+
+double speed_operator_div (struct speed_params *);
+double speed_operator_mod (struct speed_params *);
+
+double speed_udiv_qrnnd (struct speed_params *);
+double speed_udiv_qrnnd_preinv1 (struct speed_params *);
+double speed_udiv_qrnnd_preinv2 (struct speed_params *);
+double speed_udiv_qrnnd_preinv3 (struct speed_params *);
+double speed_udiv_qrnnd_c (struct speed_params *);
+double speed_umul_ppmm (struct speed_params *);
+
+/* Prototypes for other routines */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* low 32-bits in p[0], high 32-bits in p[1] */
+void speed_cyclecounter (unsigned p[2]);
+
+#if defined (__cplusplus)
+}
+#endif
+
+void mftb_function (unsigned p[2]);
+
+double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]);
+int gettimeofday_microseconds_p (void);
+int getrusage_microseconds_p (void);
+int cycles_works_p (void);
+long clk_tck (void);
+double freq_measure (const char *, double (*)(void));
+
+int double_cmp_ptr (const double *, const double *);
+void pentium_wbinvd (void);
+typedef int (*qsort_function_t) (const void *, const void *);
+
+void noop (void);
+void noop_1 (mp_limb_t);
+void noop_wxs (mp_ptr, mp_srcptr, mp_size_t);
+void noop_wxys (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
+void mpn_cache_fill (mp_srcptr, mp_size_t);
+void mpn_cache_fill_dummy (mp_limb_t);
+void speed_cache_fill (struct speed_params *);
+void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t);
+void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t);
+
+extern int  speed_option_addrs;
+extern int  speed_option_verbose;
+extern int  speed_option_cycles_broken;
+void speed_option_set (const char *);
+
+mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+
+mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
+mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
+mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
+mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
+
+int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int);
+int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int);
+int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int);
+int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int);
+
+int mpn_hgcd2_1 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
+int mpn_hgcd2_2 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
+int mpn_hgcd2_3 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
+int mpn_hgcd2_4 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
+int mpn_hgcd2_5 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
+
+mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t);
+mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t);
+
+mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
+mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
+
+void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t);
+void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t);
+
+mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
+mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
+mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
+mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
+mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
+mp_size_t mpn_hgcd_lehmer_itch (mp_size_t);
+
+mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
+mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t);
+
+mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
+mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t);
+
+mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
+mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t);
+
+mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
+mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
+
+mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int);
+void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr);
+
+void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
+void mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
+
+int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int);
+
+
+/* "get" is called repeatedly until it ticks over, just in case on a fast
+   processor it takes less than a microsecond, though this is probably
+   unlikely if it's a system call.
+
+   speed_cyclecounter is called on the same side of the "get" for the start
+   and end measurements.  It doesn't matter how long it takes from the "get"
+   sample to the cycles sample, since that period will cancel out in the
+   difference calculation (assuming it's the same each time).
+
+   Letting the test run for more than a process time slice is probably only
+   going to reduce accuracy, especially for getrusage when the cycle counter
+   is real time, or for gettimeofday if the cycle counter is in fact process
+   time.  Use CLK_TCK/2 as a reasonable stop.
+
+   It'd be desirable to be quite accurate here.  The default speed_precision
+   for a cycle counter is 10000 cycles, so to mix that with getrusage or
+   gettimeofday the frequency should be at least that accurate.  But running
+   measurements for 10000 microseconds (or more) is too long.  Be satisfied
+   with just a half clock tick (5000 microseconds usually).  */
+
+#define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec)		\
+  do {									\
+    type      st1, st, et1, et;						\
+    unsigned  sc[2], ec[2];						\
+    long      dt, half_tick;						\
+    double    dc, cyc;							\
+									\
+    half_tick = (1000000L / clk_tck()) / 2;				\
+									\
+    get (st1);								\
+    do {								\
+      get (st);								\
+    } while (usec(st) == usec(st1) && sec(st) == sec(st1));		\
+									\
+    getc (sc);								\
+									\
+    for (;;)								\
+      {									\
+	get (et1);							\
+	do {								\
+	  get (et);							\
+	} while (usec(et) == usec(et1) && sec(et) == sec(et1));		\
+									\
+	getc (ec);							\
+									\
+	dc = speed_cyclecounter_diff (ec, sc);				\
+									\
+	/* allow secs to cancel before multiplying */			\
+	dt = sec(et) - sec(st);						\
+	dt = dt * 1000000L + (usec(et) - usec(st));			\
+									\
+	if (dt >= half_tick)						\
+	  break;							\
+      }									\
+									\
+    cyc = dt * 1e-6 / dc;						\
+									\
+    if (speed_option_verbose >= 2)					\
+      printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n",	\
+	      name, dc, dt, cyc);					\
+									\
+    return dt * 1e-6 / dc;						\
+									\
+  } while (0)
+
+
+
+
+/* The measuring routines use these big macros to save duplication for
+   similar forms.  They also get used for some automatically generated
+   measuring of new implementations of functions.
+
+   Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
+   function pointer is considered undesirable since it's not the way a
+   normal application will be calling, and some processors might do
+   different things with an indirect call, like not branch predicting, or
+   doing a full pipe flush.  At least some of the "functions" measured are
+   actually macros too.
+
+   The net effect is to bloat the object code, possibly in a big way, but
+   only what's being measured is being run, so that doesn't matter.
+
+   The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
+   ATTRIBUTE_CONST on the called functions.  Adding a cast to a non-pure
+   function pointer doesn't work in gcc 3.2.  Using an actual non-pure
+   function pointer variable works, but stands a real risk of a
+   non-optimizing compiler generating unnecessary overheads in the call.
+   Currently the best idea is not to use those attributes for a timing
+   program build.  __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and
+   gmp-impl.h to omit them from routines there.  */
+
+#define SPEED_RESTRICT_COND(cond)   if (!(cond)) return -1.0;
+
+/* For mpn_copy or similar. */
+#define SPEED_ROUTINE_MPN_COPY_CALL(call)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_COPY(function)				\
+  SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size))
+
+#define SPEED_ROUTINE_MPN_TABSELECT(function)				\
+  {									\
+    mp_ptr    xp, wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    if (s->r == 0)							\
+      s->r = s->size;	/* default to a quadratic shape */		\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (xp, s->size * s->r, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, xp, s->size * s->r);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, xp, s->size, s->r, (s->r) / 2);			\
+    while (--i != 0);							\
+    t = speed_endtime () / s->r;					\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+#define SPEED_ROUTINE_MPN_COPYC(function)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, s->xp, s->size, 0);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+/* s->size is still in limbs, and it's limbs which are copied, but
+   "function" takes a size in bytes not limbs.  */
+#define SPEED_ROUTINE_MPN_COPY_BYTES(function)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, s->xp, s->size * GMP_LIMB_BYTES);		\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+/* For mpn_add_n, mpn_sub_n, or similar. */
+#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call)				\
+  {									\
+    mp_ptr     wp;							\
+    mp_ptr     xp, yp;							\
+    unsigned   i;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    xp = s->xp;								\
+    yp = s->yp;								\
+									\
+    if (s->r == 0)	;						\
+    else if (s->r == 1) { xp = wp;	    }				\
+    else if (s->r == 2) {	   yp = wp; }				\
+    else if (s->r == 3) { xp = wp; yp = wp; }				\
+    else if (s->r == 4) {     yp = xp;	    }				\
+    else		{						\
+      TMP_FREE;								\
+      return -1.0;							\
+    }									\
+									\
+    /* initialize wp if operand overlap */				\
+    if (xp == wp || yp == wp)						\
+      MPN_COPY (wp, s->xp, s->size);					\
+									\
+    speed_operand_src (s, xp, s->size);					\
+    speed_operand_src (s, yp, s->size);					\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+/* For mpn_aors_errK_n, where 1 <= K <= 3. */
+#define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K)			\
+  {									\
+    mp_ptr     wp;							\
+    mp_ptr     xp, yp;							\
+    mp_ptr     zp[K];							\
+    mp_limb_t  ep[2*K];							\
+    unsigned   i;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    /* (don't have a mechanism to specify zp alignments) */		\
+    for (i = 0; i < K; i++)						\
+      SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0);			\
+									\
+    xp = s->xp;								\
+    yp = s->yp;								\
+									\
+    if (s->r == 0)	;						\
+    else if (s->r == 1) { xp = wp;	    }				\
+    else if (s->r == 2) {	   yp = wp; }				\
+    else if (s->r == 3) { xp = wp; yp = wp; }				\
+    else if (s->r == 4) {     yp = xp;	    }				\
+    else		{						\
+      TMP_FREE;								\
+      return -1.0;							\
+    }									\
+									\
+    /* initialize wp if operand overlap */				\
+    if (xp == wp || yp == wp)						\
+      MPN_COPY (wp, s->xp, s->size);					\
+									\
+    speed_operand_src (s, xp, s->size);					\
+    speed_operand_src (s, yp, s->size);					\
+    for (i = 0; i < K; i++)						\
+      speed_operand_src (s, zp[i], s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function)			\
+  SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1)
+
+#define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function)			\
+  SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2)
+
+#define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function)			\
+  SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3)
+
+
+/* For mpn_add_n, mpn_sub_n, or similar. */
+#define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call)				\
+  {									\
+    mp_ptr     ap, sp;							\
+    mp_ptr     xp, yp;							\
+    unsigned   i;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp);			\
+									\
+    xp = s->xp;								\
+    yp = s->yp;								\
+									\
+    if ((s->r & 1) != 0) { xp = ap; }					\
+    if ((s->r & 2) != 0) { yp = ap; }					\
+    if ((s->r & 4) != 0) { xp = sp; }					\
+    if ((s->r & 8) != 0) { yp = sp; }					\
+    if ((s->r & 3) == 3  ||  (s->r & 12) == 12)				\
+      {									\
+	TMP_FREE;							\
+	return -1.0;							\
+      }									\
+									\
+    /* initialize ap if operand overlap */				\
+    if (xp == ap || yp == ap)						\
+      MPN_COPY (ap, s->xp, s->size);					\
+    /* initialize sp if operand overlap */				\
+    if (xp == sp || yp == sp)						\
+      MPN_COPY (sp, s->xp, s->size);					\
+									\
+    speed_operand_src (s, xp, s->size);					\
+    speed_operand_src (s, yp, s->size);					\
+    speed_operand_dst (s, ap, s->size);					\
+    speed_operand_dst (s, sp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_BINARY_N(function)				\
+   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size))
+
+#define SPEED_ROUTINE_MPN_BINARY_NC(function)				\
+   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0))
+
+
+/* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. */
+#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_UNARY_1(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
+
+#define SPEED_ROUTINE_MPN_UNARY_1C(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
+
+/* FIXME: wp is uninitialized here, should start it off from xp */
+#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function)			\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, wp, s->size, s->r))
+
+#define SPEED_ROUTINE_MPN_DIVEXACT_1(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
+
+#define SPEED_ROUTINE_MPN_BDIV_Q_1(function)				\
+    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
+
+#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call)			\
+  {									\
+    unsigned   shift;							\
+    mp_limb_t  dinv;							\
+									\
+    SPEED_RESTRICT_COND (s->size > 0);					\
+    SPEED_RESTRICT_COND (s->r != 0);					\
+									\
+    count_trailing_zeros (shift, s->r);					\
+    binvert_limb (dinv, s->r >> shift);					\
+									\
+    SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
+  }
+#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function)			\
+  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL					\
+  ((*function) (wp, s->xp, s->size, s->r, dinv, shift))
+
+#define SPEED_ROUTINE_MPN_BDIV_DBM1C(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
+
+#define SPEED_ROUTINE_MPN_DIVREM_1(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
+
+#define SPEED_ROUTINE_MPN_DIVREM_1C(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
+
+#define SPEED_ROUTINE_MPN_DIVREM_1F(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
+
+#define SPEED_ROUTINE_MPN_DIVREM_1CF(function)				\
+  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
+
+
+#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call)			\
+  {									\
+    unsigned   shift;							\
+    mp_limb_t  dinv;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+    SPEED_RESTRICT_COND (s->r != 0);					\
+									\
+    count_leading_zeros (shift, s->r);					\
+    invert_limb (dinv, s->r << shift);					\
+									\
+    SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
+  }									\
+
+#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function)			\
+  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
+  ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))
+
+/* s->size limbs worth of fraction part */
+#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function)			\
+  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
+  ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
+
+
+/* s->r is duplicated to form the multiplier, defaulting to
+   MP_BASES_BIG_BASE_10.  Not sure if that's particularly useful, but at
+   least it provides some control.  */
+#define SPEED_ROUTINE_MPN_UNARY_N(function,N)				\
+  {									\
+    mp_ptr     wp;							\
+    mp_size_t  wn;							\
+    unsigned   i;							\
+    double     t;							\
+    mp_limb_t  yp[N];							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= N);					\
+									\
+    TMP_MARK;								\
+    wn = s->size + N-1;							\
+    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);			\
+    for (i = 0; i < N; i++)						\
+      yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10);		\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, yp, (mp_size_t) N);				\
+    speed_operand_dst (s, wp, wn);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, s->xp, s->size, yp);				\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_UNARY_2(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 2)
+#define SPEED_ROUTINE_MPN_UNARY_3(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 3)
+#define SPEED_ROUTINE_MPN_UNARY_4(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 4)
+#define SPEED_ROUTINE_MPN_UNARY_5(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 5)
+#define SPEED_ROUTINE_MPN_UNARY_6(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 6)
+#define SPEED_ROUTINE_MPN_UNARY_7(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 7)
+#define SPEED_ROUTINE_MPN_UNARY_8(function)				\
+  SPEED_ROUTINE_MPN_UNARY_N (function, 8)
+
+
+/* For mpn_mul, mpn_mul_basecase, xsize=r, ysize=s->size. */
+#define SPEED_ROUTINE_MPN_MUL(function)					\
+  {									\
+    mp_ptr    wp;							\
+    mp_size_t size1;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    size1 = (s->r == 0 ? s->size : s->r);				\
+    if (size1 < 0) size1 = -size1 - s->size;				\
+									\
+    SPEED_RESTRICT_COND (size1 >= 1);					\
+    SPEED_RESTRICT_COND (s->size >= size1);				\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp);		\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, size1);				\
+    speed_operand_dst (s, wp, size1 + s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, s->xp, s->size, s->yp, size1);			\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+#define SPEED_ROUTINE_MPN_MUL_N_CALL(call)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, 2*s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_MUL_N(function)				\
+  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size));
+
+#define SPEED_ROUTINE_MPN_MULLO_N_CALL(call)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_MULLO_N(function)				\
+  SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size));
+
+#define SPEED_ROUTINE_MPN_MULLO_BASECASE(function)			\
+  SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size));
+
+#define SPEED_ROUTINE_MPN_SQRLO(function)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, s->xp, s->size);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+/* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size. */
+#define SPEED_ROUTINE_MPN_MULMID(function)				\
+  {									\
+    mp_ptr    wp, xp;							\
+    mp_size_t size1;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    size1 = (s->r == 0 ? (2 * s->size - 1) : s->r);			\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+    SPEED_RESTRICT_COND (size1 >= s->size);				\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
+    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
+									\
+    speed_operand_src (s, xp, size1);					\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, size1 - s->size + 3);			\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, xp, size1, s->yp, s->size);				\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_MULMID_N(function)				\
+  {									\
+    mp_ptr    wp, xp;							\
+    mp_size_t size1;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    size1 = 2 * s->size - 1;						\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
+    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
+									\
+    speed_operand_src (s, xp, size1);					\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, size1 - s->size + 3);			\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, xp, s->yp, s->size);				\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_TOOM42_MULMID(function)			\
+  {									\
+    mp_ptr    wp, xp, scratch;						\
+    mp_size_t size1, scratch_size;					\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    size1 = 2 * s->size - 1;						\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
+    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
+    scratch_size = mpn_toom42_mulmid_itch (s->size);			\
+    SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0);			\
+									\
+    speed_operand_src (s, xp, size1);					\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, size1 - s->size + 3);			\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, xp, s->yp, s->size, scratch);			\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call)			\
+  {									\
+    mp_ptr    wp, tp;							\
+    unsigned  i;							\
+    double    t;							\
+    mp_size_t itch;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size);		\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, 2 * s->size);				\
+    speed_operand_dst (s, tp, itch);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function)			\
+  {									\
+    mp_ptr    wp, tp;							\
+    unsigned  i;							\
+    double    t;							\
+    mp_size_t size, itch;						\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    size = mpn_mulmod_bnm1_next_size (s->size);				\
+    itch = mpn_mulmod_bnm1_itch (size, size, size);			\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, size);					\
+    speed_operand_dst (s, tp, itch);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, size, s->xp, s->size, s->yp, s->size, tp);		\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize)		\
+  {									\
+    mp_ptr    wp, tspace;						\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= minsize);				\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, wp, 2*s->size);				\
+    speed_operand_dst (s, tspace, tsize);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function)			\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
+     mpn_toom22_mul_itch (s->size, s->size),				\
+     MPN_TOOM22_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function)			\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
+     mpn_toom33_mul_itch (s->size, s->size),				\
+     MPN_TOOM33_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function)			\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
+     mpn_toom44_mul_itch (s->size, s->size),				\
+     MPN_TOOM44_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function)			\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
+     mpn_toom6h_mul_itch (s->size, s->size),				\
+     MPN_TOOM6H_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function)			\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
+     mpn_toom8h_mul_itch (s->size, s->size),				\
+     MPN_TOOM8H_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM32_MUL(function)				\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace),		\
+     mpn_toom32_mul_itch (s->size, 2*s->size/3),			\
+     MPN_TOOM32_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM42_MUL(function)				\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
+     mpn_toom42_mul_itch (s->size, s->size/2),				\
+     MPN_TOOM42_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM43_MUL(function)				\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace),		\
+     mpn_toom43_mul_itch (s->size, s->size*3/4),			\
+     MPN_TOOM43_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM63_MUL(function)				\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
+     mpn_toom63_mul_itch (s->size, s->size/2),				\
+     MPN_TOOM63_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
+     mpn_toom32_mul_itch (s->size, 17*s->size/24),			\
+     MPN_TOOM32_MUL_MINSIZE)
+#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
+     mpn_toom43_mul_itch (s->size, 17*s->size/24),			\
+     MPN_TOOM43_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
+     mpn_toom32_mul_itch (s->size, 19*s->size/30),			\
+     MPN_TOOM32_MUL_MINSIZE)
+#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
+     mpn_toom53_mul_itch (s->size, 19*s->size/30),			\
+     MPN_TOOM53_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
+     mpn_toom42_mul_itch (s->size, 11*s->size/20),			\
+     MPN_TOOM42_MUL_MINSIZE)
+#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
+     mpn_toom53_mul_itch (s->size, 11*s->size/20),			\
+     MPN_TOOM53_MUL_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
+     mpn_toom42_mul_itch (s->size, 5*s->size/6),			\
+     MPN_TOOM54_MUL_MINSIZE)
+#define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function)		\
+  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
+    (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
+     mpn_toom54_mul_itch (s->size, 5*s->size/6),			\
+     MPN_TOOM54_MUL_MINSIZE)
+
+
+
+#define SPEED_ROUTINE_MPN_SQR_CALL(call)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, 2*s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_SQR(function)					\
+  SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
+
+#define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call)			\
+  {									\
+    mp_ptr    wp, tp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, tp, 2 * s->size);				\
+    speed_operand_dst (s, wp, 2 * s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime () / 2;						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize)		\
+  {									\
+    mp_ptr    wp, tspace;						\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= minsize);				\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, 2*s->size);				\
+    speed_operand_dst (s, tspace, tsize);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_TOOM2_SQR(function)				\
+  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
+				mpn_toom2_sqr_itch (s->size),		\
+				MPN_TOOM2_SQR_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM3_SQR(function)				\
+  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
+				mpn_toom3_sqr_itch (s->size),		\
+				MPN_TOOM3_SQR_MINSIZE)
+
+
+#define SPEED_ROUTINE_MPN_TOOM4_SQR(function)				\
+  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
+				mpn_toom4_sqr_itch (s->size),		\
+				MPN_TOOM4_SQR_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM6_SQR(function)				\
+  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
+				mpn_toom6_sqr_itch (s->size),		\
+				MPN_TOOM6_SQR_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_TOOM8_SQR(function)				\
+  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
+				mpn_toom8_sqr_itch (s->size),		\
+				MPN_TOOM8_SQR_MINSIZE)
+
+#define SPEED_ROUTINE_MPN_MOD_CALL(call)				\
+  {									\
+    unsigned   i;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+									\
+    return speed_endtime ();						\
+  }
+
+#define SPEED_ROUTINE_MPN_MOD_1(function)				\
+   SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))
+
+#define SPEED_ROUTINE_MPN_MOD_1C(function)				\
+   SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))
+
+#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function)			\
+  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r));
+
+#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function)			\
+  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)));
+
+#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function)				\
+   SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))
+
+#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function)			\
+  {									\
+    unsigned   i;							\
+    mp_limb_t  inv;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+    SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT);			\
+									\
+    invert_limb (inv, s->r);						\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      (*function) (s->xp, s->size, s->r, inv);				\
+    while (--i != 0);							\
+									\
+    return speed_endtime ();						\
+  }
+
+#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc)			\
+  {									\
+    unsigned   i;							\
+    mp_limb_t  inv[4];							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    mpn_mod_1_1p_cps (inv, s->r);					\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      pfunc (inv, s->r);						\
+      function (s->xp, s->size, s->r << inv[1], inv);				\
+    } while (--i != 0);							\
+									\
+    return speed_endtime ();						\
+  }
+#define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N)			\
+  {									\
+    unsigned   i;							\
+    mp_limb_t  inv[N+3];						\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+    SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      pfunc (inv, s->r);						\
+      function (s->xp, s->size, s->r, inv);				\
+    } while (--i != 0);							\
+									\
+    return speed_endtime ();						\
+  }
+
+
+/* A division of 2*s->size by s->size limbs */
+
+#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call)				\
+  {									\
+    unsigned  i;							\
+    mp_ptr    a, d, q, r;						\
+    double    t;							\
+    gmp_pi1_t dinv;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (r, s->size,   s->align_wp2);			\
+									\
+    MPN_COPY (a, s->xp, s->size);					\
+    MPN_COPY (a+s->size, s->xp, s->size);				\
+									\
+    MPN_COPY (d, s->yp, s->size);					\
+									\
+    /* normalize the data */						\
+    d[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    a[2*s->size-1] = d[s->size-1] - 1;					\
+									\
+    invert_pi1 (dinv, d[s->size-1], d[s->size-2]);			\
+									\
+    speed_operand_src (s, a, 2*s->size);				\
+    speed_operand_src (s, d, s->size);					\
+    speed_operand_dst (s, q, s->size+1);				\
+    speed_operand_dst (s, r, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+/* A remainder 2*s->size by s->size limbs */
+
+#define SPEED_ROUTINE_MPZ_MOD(function)					\
+  {									\
+    unsigned   i;							\
+    mpz_t      a, d, r;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    mpz_init_set_n (d, s->yp, s->size);					\
+									\
+    /* high part less than d, low part a duplicate copied in */		\
+    mpz_init_set_n (a, s->xp, s->size);					\
+    mpz_mod (a, a, d);							\
+    mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size);			\
+    MPN_COPY (PTR(a), s->xp, s->size);					\
+									\
+    mpz_init (r);							\
+									\
+    speed_operand_src (s, PTR(a), SIZ(a));				\
+    speed_operand_src (s, PTR(d), SIZ(d));				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (r, a, d);						\
+    while (--i != 0);							\
+    return speed_endtime ();						\
+  }
+
+#define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN)		\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, ap, qp;						\
+    gmp_pi1_t  inv;							\
+    double     t;							\
+    mp_size_t size1;							\
+    TMP_DECL;								\
+									\
+    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
+									\
+    SPEED_RESTRICT_COND (s->size >= DMIN);				\
+    SPEED_RESTRICT_COND (size1 - s->size >= QMIN);			\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2);			\
+									\
+    /* we don't fill in dividend completely when size1 > s->size */	\
+    MPN_COPY (ap,         s->xp, s->size);				\
+    MPN_COPY (ap + size1 - s->size, s->xp, s->size);			\
+									\
+    MPN_COPY (dp,         s->yp, s->size);				\
+									\
+    /* normalize the data */						\
+    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    ap[size1 - 1] = dp[s->size - 1] - 1;				\
+									\
+    invert_pi1 (inv, dp[s->size-1], dp[s->size-2]);			\
+									\
+    speed_operand_src (s, ap, size1);					\
+    speed_operand_dst (s, tp, size1);					\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, qp, size1 - s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, ap, size1);						\
+      function (qp, tp, size1, dp, s->size, INV);			\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn)			\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, qp, scratch;					\
+    double     t;							\
+    mp_size_t itch;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    itch = itchfn (2 * s->size, s->size, 0);				\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
+									\
+    MPN_COPY (tp,         s->xp, s->size);				\
+    MPN_COPY (tp+s->size, s->xp, s->size);				\
+									\
+    /* normalize the data */						\
+    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    tp[2*s->size-1] = dp[s->size-1] - 1;				\
+									\
+    speed_operand_dst (s, qp, s->size);					\
+    speed_operand_src (s, tp, 2 * s->size);				\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, scratch, itch);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      function (qp, tp, 2 * s->size, dp, s->size, scratch);		\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn)			\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, qp, rp, scratch;					\
+    double     t;							\
+    mp_size_t size1, itch;						\
+    TMP_DECL;								\
+									\
+    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+    SPEED_RESTRICT_COND (size1 >= s->size);				\
+									\
+    itch = itchfn (size1, s->size, 0);					\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
+    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
+									\
+    /* we don't fill in dividend completely when size1 > s->size */	\
+    MPN_COPY (tp,         s->xp, s->size);				\
+    MPN_COPY (tp + size1 - s->size, s->xp, s->size);			\
+									\
+    MPN_COPY (dp,         s->yp, s->size);				\
+									\
+    /* normalize the data */						\
+    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    tp[size1 - 1] = dp[s->size - 1] - 1;				\
+									\
+    speed_operand_dst (s, qp, size1 - s->size);				\
+    speed_operand_dst (s, rp, s->size);					\
+    speed_operand_src (s, tp, size1);					\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, scratch, itch);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      function (qp, rp, tp, size1, dp, s->size, scratch);		\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn)			\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, qp, rp, ip, scratch, tmp;			\
+    double     t;							\
+    mp_size_t  size1, itch;						\
+    TMP_DECL;								\
+									\
+    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+    SPEED_RESTRICT_COND (size1 >= s->size);				\
+									\
+    itch = itchfn (size1, s->size, s->size);				\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
+    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
+    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? */	\
+									\
+    /* we don't fill in dividend completely when size1 > s->size */	\
+    MPN_COPY (tp,         s->xp, s->size);				\
+    MPN_COPY (tp + size1 - s->size, s->xp, s->size);			\
+									\
+    MPN_COPY (dp,         s->yp, s->size);				\
+									\
+    /* normalize the data */						\
+    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    tp[size1 - 1] = dp[s->size-1] - 1;					\
+									\
+    tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size));			\
+    mpn_invert (ip, dp, s->size, tmp);					\
+									\
+    speed_operand_dst (s, qp, size1 - s->size);				\
+    speed_operand_dst (s, rp, s->size);					\
+    speed_operand_src (s, tp, size1);					\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_src (s, ip, s->size);					\
+    speed_operand_dst (s, scratch, itch);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch);	\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function)				\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, ap, qp;						\
+    mp_limb_t  inv;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2);		\
+									\
+    MPN_COPY (ap,         s->xp, s->size);				\
+    MPN_COPY (ap+s->size, s->xp, s->size);				\
+									\
+    /* divisor must be odd */						\
+    MPN_COPY (dp, s->yp, s->size);					\
+    dp[0] |= 1;								\
+    binvert_limb (inv, dp[0]);						\
+    inv = -inv;								\
+									\
+    speed_operand_src (s, ap, 2*s->size);				\
+    speed_operand_dst (s, tp, 2*s->size);				\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, qp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, ap, 2*s->size);					\
+      function (qp, tp, 2*s->size, dp, s->size, inv);			\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function)				\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, qp;						\
+    mp_limb_t  inv;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2);			\
+									\
+    /* divisor must be odd */						\
+    MPN_COPY (dp, s->yp, s->size);					\
+    dp[0] |= 1;								\
+    binvert_limb (inv, dp[0]);						\
+    inv = -inv;								\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, tp, s->size);					\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, qp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, s->xp, s->size);					\
+      function (qp, tp, s->size, dp, s->size, inv);			\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_PI1_BDIV_R(function)				\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, ap;						\
+    mp_limb_t  inv;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2);		\
+									\
+    MPN_COPY (ap,         s->xp, s->size);				\
+    MPN_COPY (ap+s->size, s->xp, s->size);				\
+									\
+    /* divisor must be odd */						\
+    MPN_COPY (dp, s->yp, s->size);					\
+    dp[0] |= 1;								\
+    binvert_limb (inv, dp[0]);						\
+    inv = -inv;								\
+									\
+    speed_operand_src (s, ap, 2*s->size);				\
+    speed_operand_dst (s, tp, 2*s->size);				\
+    speed_operand_src (s, dp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, ap, 2*s->size);					\
+      function (tp, 2*s->size, dp, s->size, inv);			\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn)			\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, qp, scratch;						\
+    double     t;							\
+    mp_size_t itch;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    itch = itchfn (s->size, s->size);					\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
+									\
+    /* divisor must be odd */						\
+    MPN_COPY (dp, s->yp, s->size);					\
+    dp[0] |= 1;								\
+									\
+    speed_operand_dst (s, qp, s->size);					\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, scratch, itch);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      function (qp, s->xp, s->size, dp, s->size, scratch);		\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn)			\
+  {									\
+    unsigned   i;							\
+    mp_ptr     dp, tp, qp, rp, scratch;					\
+    double     t;							\
+    mp_size_t itch;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    itch = itchfn (2 * s->size, s->size);				\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
+    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
+									\
+    MPN_COPY (tp,         s->xp, s->size);				\
+    MPN_COPY (tp+s->size, s->xp, s->size);				\
+									\
+    /* divisor must be odd */						\
+    MPN_COPY (dp, s->yp, s->size);					\
+    dp[0] |= 1;								\
+									\
+    speed_operand_dst (s, qp, s->size);					\
+    speed_operand_dst (s, rp, s->size);					\
+    speed_operand_src (s, tp, 2 * s->size);				\
+    speed_operand_src (s, dp, s->size);					\
+    speed_operand_dst (s, scratch, itch);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      function (qp, rp, tp, 2 * s->size, dp, s->size, scratch);		\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_BROOT(function)	\
+  {						\
+    SPEED_RESTRICT_COND (s->r & 1);		\
+    s->xp[0] |= 1;				\
+    SPEED_ROUTINE_MPN_UNARY_1_CALL		\
+      ((*function) (wp, s->xp, s->size, s->r));	\
+  }
+
+#define SPEED_ROUTINE_MPN_BROOTINV(function, itch)	\
+  {							\
+    mp_ptr    wp, tp;					\
+    unsigned  i;					\
+    double    t;					\
+    TMP_DECL;						\
+    TMP_MARK;						\
+    SPEED_RESTRICT_COND (s->size >= 1);			\
+    SPEED_RESTRICT_COND (s->r & 1);			\
+    wp = TMP_ALLOC_LIMBS (s->size);			\
+    tp = TMP_ALLOC_LIMBS ( (itch));			\
+    s->xp[0] |= 1;					\
+							\
+    speed_operand_src (s, s->xp, s->size);		\
+    speed_operand_dst (s, wp, s->size);			\
+    speed_cache_fill (s);				\
+							\
+    speed_starttime ();					\
+    i = s->reps;					\
+    do							\
+      (*function) (wp, s->xp, s->size, s->r, tp);	\
+    while (--i != 0);					\
+    t = speed_endtime ();				\
+							\
+    TMP_FREE;						\
+    return t;						\
+  }
+
+#define SPEED_ROUTINE_MPN_INVERT(function,itchfn)			\
+  {									\
+    long  i;								\
+    mp_ptr    up, tp, ip;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
+									\
+    MPN_COPY (up, s->xp, s->size);					\
+									\
+    /* normalize the data */						\
+    up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+									\
+    speed_operand_src (s, up, s->size);					\
+    speed_operand_dst (s, tp, s->size);					\
+    speed_operand_dst (s, ip, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (ip, up, s->size, tp);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn)			\
+  {									\
+    long  i;								\
+    mp_ptr    up, tp, ip;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
+									\
+    MPN_COPY (up, s->xp, s->size);					\
+									\
+    /* normalize the data */						\
+    up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+									\
+    speed_operand_src (s, up, s->size);					\
+    speed_operand_dst (s, tp, s->size);					\
+    speed_operand_dst (s, ip, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (ip, up, s->size, tp);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn)		\
+  {									\
+    long  i;								\
+    mp_ptr    up, tp, ip;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 3);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
+									\
+    MPN_COPY (up, s->xp, s->size);					\
+									\
+    /* normalize the data */						\
+    up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+									\
+    speed_operand_src (s, up, s->size);					\
+    speed_operand_dst (s, tp, s->size);					\
+    speed_operand_dst (s, ip, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (ip, up, s->size, tp);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_BINVERT(function,itchfn)			\
+  {									\
+    long  i;								\
+    mp_ptr    up, tp, ip;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
+									\
+    MPN_COPY (up, s->xp, s->size);					\
+									\
+    /* normalize the data */						\
+    up[0] |= 1;								\
+									\
+    speed_operand_src (s, up, s->size);					\
+    speed_operand_dst (s, tp, s->size);					\
+    speed_operand_dst (s, ip, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (ip, up, s->size, tp);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_SEC_INVERT(function,itchfn)			\
+  {									\
+    long  i;								\
+    mp_ptr    up, mp, tp, ip;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
+									\
+    speed_operand_src (s, up, s->size);					\
+    speed_operand_dst (s, tp, s->size);					\
+    speed_operand_dst (s, ip, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    MPN_COPY (mp, s->yp, s->size);					\
+    /* Must be odd */							\
+    mp[0] |= 1;								\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	MPN_COPY (up, s->xp, s->size);					\
+	function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp);	\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_REDC_1(function)					\
+  {									\
+    unsigned   i;							\
+    mp_ptr     cp, mp, tp, ap;						\
+    mp_limb_t  inv;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
+    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
+									\
+    MPN_COPY (ap,         s->xp, s->size);				\
+    MPN_COPY (ap+s->size, s->xp, s->size);				\
+									\
+    /* modulus must be odd */						\
+    MPN_COPY (mp, s->yp, s->size);					\
+    mp[0] |= 1;								\
+    binvert_limb (inv, mp[0]);						\
+    inv = -inv;								\
+									\
+    speed_operand_src (s, ap, 2*s->size+1);				\
+    speed_operand_dst (s, tp, 2*s->size+1);				\
+    speed_operand_src (s, mp, s->size);					\
+    speed_operand_dst (s, cp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, ap, 2*s->size);					\
+      function (cp, tp, mp, s->size, inv);				\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_REDC_2(function)					\
+  {									\
+    unsigned   i;							\
+    mp_ptr     cp, mp, tp, ap;						\
+    mp_limb_t  invp[2];							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
+    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
+									\
+    MPN_COPY (ap,         s->xp, s->size);				\
+    MPN_COPY (ap+s->size, s->xp, s->size);				\
+									\
+    /* modulus must be odd */						\
+    MPN_COPY (mp, s->yp, s->size);					\
+    mp[0] |= 1;								\
+    mpn_binvert (invp, mp, 2, tp);					\
+    invp[0] = -invp[0]; invp[1] = ~invp[1];				\
+									\
+    speed_operand_src (s, ap, 2*s->size+1);				\
+    speed_operand_dst (s, tp, 2*s->size+1);				\
+    speed_operand_src (s, mp, s->size);					\
+    speed_operand_dst (s, cp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, ap, 2*s->size);					\
+      function (cp, tp, mp, s->size, invp);				\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+#define SPEED_ROUTINE_REDC_N(function)					\
+  {									\
+    unsigned   i;							\
+    mp_ptr     cp, mp, tp, ap, invp;					\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size > 8);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
+    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
+    SPEED_TMP_ALLOC_LIMBS (invp, s->size,   s->align_wp2); /* align? */	\
+									\
+    MPN_COPY (ap,         s->xp, s->size);				\
+    MPN_COPY (ap+s->size, s->xp, s->size);				\
+									\
+    /* modulus must be odd */						\
+    MPN_COPY (mp, s->yp, s->size);					\
+    mp[0] |= 1;								\
+    mpn_binvert (invp, mp, s->size, tp);				\
+									\
+    speed_operand_src (s, ap, 2*s->size+1);				\
+    speed_operand_dst (s, tp, 2*s->size+1);				\
+    speed_operand_src (s, mp, s->size);					\
+    speed_operand_dst (s, cp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do {								\
+      MPN_COPY (tp, ap, 2*s->size);					\
+      function (cp, tp, mp, s->size, invp);				\
+    } while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+#define SPEED_ROUTINE_MPN_POPCOUNT(function)				\
+  {									\
+    unsigned i;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (s->xp, s->size);					\
+    while (--i != 0);							\
+									\
+    return speed_endtime ();						\
+  }
+
+#define SPEED_ROUTINE_MPN_HAMDIST(function)				\
+  {									\
+    unsigned i;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (s->xp, s->yp, s->size);					\
+    while (--i != 0);							\
+									\
+    return speed_endtime ();						\
+  }
+
+
+#define SPEED_ROUTINE_MPZ_UI(function)					\
+  {									\
+    mpz_t     z;							\
+    unsigned  i;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    mpz_init (z);							\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (z, s->size);						\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (z);							\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPZ_FAC_UI(function)    SPEED_ROUTINE_MPZ_UI(function)
+#define SPEED_ROUTINE_MPZ_FIB_UI(function)    SPEED_ROUTINE_MPZ_UI(function)
+#define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function)
+
+
+#define SPEED_ROUTINE_MPZ_2_UI(function)				\
+  {									\
+    mpz_t     z, z2;							\
+    unsigned  i;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    mpz_init (z);							\
+    mpz_init (z2);							\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (z, z2, s->size);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (z);							\
+    mpz_clear (z2);							\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPZ_FIB2_UI(function)    SPEED_ROUTINE_MPZ_2_UI(function)
+#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function)
+
+
+#define SPEED_ROUTINE_MPN_FIB2_UI(function)				\
+  {									\
+    mp_ptr     fp, f1p;							\
+    mp_size_t  alloc;							\
+    unsigned   i;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    TMP_MARK;								\
+    alloc = MPN_FIB2_SIZE (s->size);					\
+    SPEED_TMP_ALLOC_LIMBS (fp,	alloc, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp);			\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (fp, f1p, s->size);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+
+/* Calculate b^e mod m for random b and m of s->size limbs and random e of 6
+   limbs.  m is forced to odd so that redc can be used.  e is limited in
+   size so the calculation doesn't take too long. */
+#define SPEED_ROUTINE_MPZ_POWM(function)				\
+  {									\
+    mpz_t     r, b, e, m;						\
+    unsigned  i;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    mpz_init (r);							\
+    if (s->r < 2)							\
+      mpz_init_set_n (b, s->xp, s->size);				\
+    else								\
+      mpz_init_set_ui (b, s->r);					\
+    mpz_init_set_n (m, s->yp, s->size);					\
+    mpz_setbit (m, 0);	/* force m to odd */				\
+    mpz_init_set_n (e, s->xp_block, 6);					\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (r, b, e, m);						\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (r);							\
+    mpz_clear (b);							\
+    mpz_clear (e);							\
+    mpz_clear (m);							\
+    return t;								\
+  }
+
+/* (m-2)^0xAAAAAAAA mod m */
+#define SPEED_ROUTINE_MPZ_POWM_UI(function)				\
+  {									\
+    mpz_t     r, b, m;							\
+    unsigned  long  e;							\
+    unsigned  i;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    mpz_init (r);							\
+									\
+    /* force m to odd */						\
+    mpz_init (m);							\
+    mpz_set_n (m, s->xp, s->size);					\
+    PTR(m)[0] |= 1;							\
+									\
+    e = (~ (unsigned long) 0) / 3;					\
+    if (s->r != 0)							\
+      e = s->r;								\
+									\
+    mpz_init_set (b, m);						\
+    mpz_sub_ui (b, b, 2);						\
+/* printf ("%X\n", mpz_get_ui(m)); */					\
+    i = s->reps;							\
+    speed_starttime ();							\
+    do									\
+      function (r, b, e, m);						\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (r);							\
+    mpz_clear (b);							\
+    mpz_clear (m);							\
+    return t;								\
+  }
+
+
+#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call)				\
+  {									\
+    mp_ptr    wp, wp2, xp, yp;						\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp,	s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);			\
+    xp = s->xp;								\
+    yp = s->yp;								\
+									\
+    if (s->r == 0)	;						\
+    else if (s->r == 1) { xp = wp;	      }				\
+    else if (s->r == 2) {	    yp = wp2; }				\
+    else if (s->r == 3) { xp = wp;  yp = wp2; }				\
+    else if (s->r == 4) { xp = wp2; yp = wp;  }				\
+    else {								\
+      TMP_FREE;								\
+      return -1.0;							\
+    }									\
+    if (xp != s->xp) MPN_COPY (xp, s->xp, s->size);			\
+    if (yp != s->yp) MPN_COPY (yp, s->yp, s->size);			\
+									\
+    speed_operand_src (s, xp, s->size);					\
+    speed_operand_src (s, yp, s->size);					\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_operand_dst (s, wp2, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_ADDSUB_N(function)				\
+  SPEED_ROUTINE_MPN_ADDSUB_CALL						\
+    (function (wp, wp2, xp, yp, s->size));
+
+#define SPEED_ROUTINE_MPN_ADDSUB_NC(function)				\
+  SPEED_ROUTINE_MPN_ADDSUB_CALL						\
+    (function (wp, wp2, xp, yp, s->size, 0));
+
+
+/* Doing an Nx1 gcd with the given r. */
+#define SPEED_ROUTINE_MPN_GCD_1N(function)				\
+  {									\
+    mp_ptr    xp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+    SPEED_RESTRICT_COND (s->r != 0);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);			\
+    MPN_COPY (xp, s->xp, s->size);					\
+    xp[0] |= refmpn_zero_p (xp, s->size);				\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (xp, s->size, s->r);					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+/* SPEED_BLOCK_SIZE many one GCDs of s->size bits each. */
+
+#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call)			\
+  {									\
+    unsigned  i, j;							\
+    mp_ptr    px, py;							\
+    mp_limb_t x_mask, y_mask;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+    SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb);			\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp);		\
+    MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE);			\
+    MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE);			\
+									\
+    x_mask = MP_LIMB_T_LOWBITMASK (s->size);				\
+    y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size);		\
+    for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
+      {									\
+	px[i] &= x_mask; px[i] += (px[i] == 0);				\
+	py[i] &= y_mask; py[i] += (py[i] == 0);				\
+	setup;								\
+      }									\
+									\
+    speed_operand_src (s, px, SPEED_BLOCK_SIZE);			\
+    speed_operand_src (s, py, SPEED_BLOCK_SIZE);			\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	j = SPEED_BLOCK_SIZE;						\
+	do								\
+	  {								\
+	    call;							\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE;					\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_GCD_1(function)				\
+  SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1]))
+
+#define SPEED_ROUTINE_MPN_GCD_11(function)				\
+  SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1),		\
+			       function (px[j-1], py[j-1]))
+
+/* Multiply limbs by (B+1). Then we get a gcd exceeding one limb, so
+   we can measure gcd_22 loop only, without gcd_11. */
+#define SPEED_ROUTINE_MPN_GCD_22(function)				\
+  SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1),		\
+			       function (px[j-1], px[j-1], py[j-1], py[j-1]))
+
+#define SPEED_ROUTINE_MPN_JACBASE(function)				\
+  SPEED_ROUTINE_MPN_GCD_1_CALL						\
+    ({									\
+       /* require x<y, y odd, y!=1 */					\
+       px[i] %= py[i];							\
+       px[i] |= 1;							\
+       py[i] |= 1;							\
+       if (py[i]==1) py[i]=3;						\
+     },									\
+     function (px[j-1], py[j-1], 0))
+
+#define SPEED_ROUTINE_MPN_HGCD2(function)				\
+  {									\
+    unsigned   i, j;							\
+    struct hgcd_matrix1 m = {{{0,0},{0,0}}};				\
+    double     t;							\
+									\
+    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);		\
+    speed_operand_src (s, s->yp_block, SPEED_BLOCK_SIZE);		\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    mp_limb_t chain = 0;						\
+    do									\
+      {									\
+	for (j = 0; j < SPEED_BLOCK_SIZE; j+= 2)			\
+	  {								\
+	    /* randomized but successively dependent */			\
+	    function (s->xp_block[j] | GMP_NUMB_HIGHBIT,		\
+		      s->xp_block[j+1] + chain,				\
+		      s->yp_block[j] | GMP_NUMB_HIGHBIT,		\
+		      s->yp_block[j+1], &m);				\
+	    chain += m.u[0][0];						\
+	  }								\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    /* make sure the compiler won't optimize away chain */		\
+    noop_1 (chain);							\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE / 2;				\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc)			\
+  {									\
+    mp_size_t hgcd_init_itch, hgcd_itch;				\
+    mp_ptr ap, bp, wp, tmp1;						\
+    struct hgcd_matrix hgcd;						\
+    int res;								\
+    unsigned i;								\
+    double t;								\
+    TMP_DECL;								\
+									\
+    if (s->size < 2)							\
+      return -1;							\
+									\
+    TMP_MARK;								\
+									\
+    SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
+									\
+    s->xp[s->size - 1] |= 1;						\
+    s->yp[s->size - 1] |= 1;						\
+									\
+    hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
+    hgcd_itch = itchfunc (s->size);					\
+									\
+    SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp, hgcd_itch, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, ap, s->size + 1);				\
+    speed_operand_dst (s, bp, s->size + 1);				\
+    speed_operand_dst (s, wp, hgcd_itch);				\
+    speed_operand_dst (s, tmp1, hgcd_init_itch);			\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	MPN_COPY (ap, s->xp, s->size);					\
+	MPN_COPY (bp, s->yp, s->size);					\
+	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
+	res = func (ap, bp, s->size, &hgcd, wp);			\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc)		\
+  {									\
+    mp_size_t hgcd_init_itch, hgcd_step_itch;				\
+    mp_ptr ap, bp, wp, tmp1;						\
+    struct hgcd_matrix hgcd;						\
+    mp_size_t p = s->size/2;						\
+    int res;								\
+    unsigned i;								\
+    double t;								\
+    TMP_DECL;								\
+									\
+    if (s->size < 2)							\
+      return -1;							\
+									\
+    TMP_MARK;								\
+									\
+    SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
+									\
+    s->xp[s->size - 1] |= 1;						\
+    s->yp[s->size - 1] |= 1;						\
+									\
+    hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
+    hgcd_step_itch = itchfunc (s->size, p);				\
+									\
+    SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, s->yp, s->size);				\
+    speed_operand_dst (s, ap, s->size + 1);				\
+    speed_operand_dst (s, bp, s->size + 1);				\
+    speed_operand_dst (s, wp, hgcd_step_itch);				\
+    speed_operand_dst (s, tmp1, hgcd_init_itch);			\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	MPN_COPY (ap, s->xp, s->size);					\
+	MPN_COPY (bp, s->yp, s->size);					\
+	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
+	res = func (&hgcd, ap, bp, s->size, p, wp);			\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+/* Run some GCDs of s->size limbs each.  The number of different data values
+   is decreased as s->size**2, since GCD is a quadratic algorithm.
+   SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
+   though, because the plain gcd is about twice as fast as gcdext.  */
+
+#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call)			\
+  {									\
+    unsigned  i;							\
+    mp_size_t j, pieces, psize;						\
+    mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp,   s->size+1, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp2,  s->size+1, s->align_wp2);		\
+									\
+    pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size;		\
+    pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size);			\
+    pieces = MAX (pieces, 1);						\
+									\
+    psize = pieces * s->size;						\
+    px = TMP_ALLOC_LIMBS (psize);					\
+    py = TMP_ALLOC_LIMBS (psize);					\
+    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
+    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
+									\
+    /* Requirements: x >= y, y must be odd, high limbs != 0.		\
+       No need to ensure random numbers are really great.  */		\
+    for (j = 0; j < pieces; j++)					\
+      {									\
+	mp_ptr	x = px + j * s->size;					\
+	mp_ptr	y = py + j * s->size;					\
+	if (x[s->size - 1] == 0) x[s->size - 1] = 1;			\
+	if (y[s->size - 1] == 0) y[s->size - 1] = 1;			\
+									\
+	if (x[s->size - 1] < y[s->size - 1])				\
+	  MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]);		\
+	else if (x[s->size - 1] == y[s->size - 1])			\
+	  {								\
+	    x[s->size - 1] = 2;						\
+	    y[s->size - 1] = 1;						\
+	  }								\
+	y[0] |= 1;							\
+      }									\
+									\
+    speed_operand_src (s, px, psize);					\
+    speed_operand_src (s, py, psize);					\
+    speed_operand_dst (s, xtmp, s->size);				\
+    speed_operand_dst (s, ytmp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	j = pieces;							\
+	do								\
+	  {								\
+	    MPN_COPY (xtmp, px+(j - 1)*s->size, s->size);		\
+	    MPN_COPY (ytmp, py+(j - 1)*s->size, s->size);		\
+	    call;							\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+									\
+    s->time_divisor = pieces;						\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_GCD(function)	\
+  SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))
+
+#define SPEED_ROUTINE_MPN_GCDEXT(function)				\
+  SPEED_ROUTINE_MPN_GCD_CALL						\
+    (4, { mp_size_t  wp2size;						\
+	  function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
+
+
+#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function)				\
+  {									\
+    unsigned  i;							\
+    mp_size_t j, pieces, psize, wp2size;				\
+    mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+									\
+    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
+    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
+    MPN_COPY (xtmp, s->xp, s->size);					\
+    MPN_COPY (ytmp, s->yp, s->size);					\
+									\
+    SPEED_TMP_ALLOC_LIMBS (wp,	s->size+1, s->align_wp);		\
+    SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2);		\
+									\
+    pieces = SPEED_BLOCK_SIZE / 3;					\
+    psize = 3 * pieces;							\
+    px = TMP_ALLOC_LIMBS (psize);					\
+    py = TMP_ALLOC_LIMBS (psize);					\
+    MPN_COPY (px, s->xp_block, psize);					\
+    MPN_COPY (py, s->yp_block, psize);					\
+									\
+    /* x must have at least as many bits as y,				\
+       high limbs must be non-zero */					\
+    for (j = 0; j < pieces; j++)					\
+      {									\
+	mp_ptr	x = px+3*j;						\
+	mp_ptr	y = py+3*j;						\
+	x[2] += (x[2] == 0);						\
+	y[2] += (y[2] == 0);						\
+	if (x[2] < y[2])						\
+	  MP_LIMB_T_SWAP (x[2], y[2]);					\
+      }									\
+									\
+    speed_operand_src (s, px, psize);					\
+    speed_operand_src (s, py, psize);					\
+    speed_operand_dst (s, xtmp, s->size);				\
+    speed_operand_dst (s, ytmp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	mp_ptr	x = px;							\
+	mp_ptr	y = py;							\
+	mp_ptr	xth = &xtmp[s->size-3];					\
+	mp_ptr	yth = &ytmp[s->size-3];					\
+	j = pieces;							\
+	do								\
+	  {								\
+	    xth[0] = x[0], xth[1] = x[1], xth[2] = x[2];		\
+	    yth[0] = y[0], yth[1] = y[1], yth[2] = y[2];		\
+									\
+	    ytmp[0] |= 1; /* y must be odd, */				\
+									\
+	    function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size);	\
+									\
+	    x += 3;							\
+	    y += 3;							\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+									\
+    s->time_divisor = pieces;						\
+    return t;								\
+  }
+
+/* Calculate nextprime(n) for random n of s->size bits (not limbs). */
+#define SPEED_ROUTINE_MPZ_NEXTPRIME(function)				\
+  {									\
+    unsigned  i, j;							\
+    mpz_t     wp, n;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 10);				\
+									\
+    mpz_init (wp);							\
+    mpz_init_set_n (n, s->xp, s->size);					\
+    /* limit to s->size bits, as this function is very slow */		\
+    mpz_tdiv_r_2exp (n, n, s->size);					\
+    /* set high bits so operand and result are genaral s->size bits */	\
+    mpz_setbit (n, s->size - 1);					\
+    mpz_clrbit (n, s->size - 2);					\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+        /* nextprime timing is variable, so average over many calls */	\
+        j = SPEED_BLOCK_SIZE - 1;					\
+        /* starts on random, after measures prime to next prime */	\
+        function (wp, n);						\
+        do								\
+          {								\
+            function (wp, wp);						\
+          }								\
+        while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (wp);							\
+    mpz_clear (n);							\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE;					\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPZ_JACOBI(function)				\
+  {									\
+    mpz_t     a, b;							\
+    unsigned  i;							\
+    mp_size_t j, pieces, psize;						\
+    mp_ptr    px, py;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    TMP_MARK;								\
+    pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1);			\
+    pieces = MAX (pieces, 1);						\
+    s->time_divisor = pieces;						\
+									\
+    psize = pieces * s->size;						\
+    px = TMP_ALLOC_LIMBS (psize);					\
+    py = TMP_ALLOC_LIMBS (psize);					\
+    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
+    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
+									\
+    for (j = 0; j < pieces; j++)					\
+      {									\
+	mp_ptr	x = px+j*s->size;					\
+	mp_ptr	y = py+j*s->size;					\
+									\
+	/* y odd */							\
+	y[0] |= 1;							\
+									\
+	/* high limbs non-zero */					\
+	if (x[s->size-1] == 0) x[s->size-1] = 1;			\
+	if (y[s->size-1] == 0) y[s->size-1] = 1;			\
+      }									\
+									\
+    SIZ(a) = s->size;							\
+    SIZ(b) = s->size;							\
+									\
+    speed_operand_src (s, px, psize);					\
+    speed_operand_src (s, py, psize);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	j = pieces;							\
+	do								\
+	  {								\
+	    PTR(a) = px+(j-1)*s->size;					\
+	    PTR(b) = py+(j-1)*s->size;					\
+	    function (a, b);						\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_DIVREM_2(function)				\
+  {									\
+    mp_ptr    wp, xp;							\
+    mp_limb_t yp[2];							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    /* source is destroyed */						\
+    MPN_COPY (xp, s->xp, s->size);					\
+									\
+    /* divisor must be normalized */					\
+    MPN_COPY (yp, s->yp_block, 2);					\
+    yp[1] |= GMP_NUMB_HIGHBIT;						\
+									\
+    speed_operand_src (s, xp, s->size);					\
+    speed_operand_src (s, yp, 2);					\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, 0, xp, s->size, yp);				\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_DIV_QR_1(function)				\
+  {									\
+    mp_ptr    wp, xp;							\
+    mp_limb_t d;							\
+    mp_limb_t r;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    d = s->r;								\
+    if (d == 0)								\
+      d = 1;								\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      r = function (wp, wp+s->size-1, s->xp, s->size, d);		\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_DIV_QR_1N_PI1(function)			\
+  {									\
+    mp_ptr    wp, xp;							\
+    mp_limb_t d, dinv;							\
+    mp_limb_t r;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    d = s->r;								\
+    /* divisor must be normalized */					\
+    SPEED_RESTRICT_COND (d & GMP_NUMB_HIGHBIT);				\
+    invert_limb (dinv, d);						\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      r = function (wp, s->xp, s->size, 0, d, dinv);			\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm)			\
+  {									\
+    mp_ptr    wp, xp;							\
+    mp_limb_t yp[2];							\
+    mp_limb_t rp[2];							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+									\
+    /* divisor must be normalized */					\
+    MPN_COPY (yp, s->yp_block, 2);					\
+    if (norm)								\
+      yp[1] |= GMP_NUMB_HIGHBIT;					\
+    else								\
+      {									\
+	yp[1] &= ~GMP_NUMB_HIGHBIT;					\
+	if (yp[1] == 0)							\
+	  yp[1] = 1;							\
+      }									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_src (s, yp, 2);					\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_operand_dst (s, rp, 2);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (wp, rp, s->xp, s->size, yp);				\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MODLIMB_INVERT(function)				\
+  {									\
+    unsigned   i, j;							\
+    mp_ptr     xp;							\
+    mp_limb_t  n = 1;							\
+    double     t;							\
+									\
+    xp = s->xp_block-1;							\
+									\
+    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);		\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	j = SPEED_BLOCK_SIZE;						\
+	do								\
+	  {								\
+	    /* randomized but successively dependent */			\
+	    n += (xp[j] << 1);						\
+									\
+	    function (n, n);						\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    /* make sure the compiler won't optimize away n */			\
+    noop_1 (n);								\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE;					\
+    return t;								\
+  }
+
+
+#define SPEED_ROUTINE_MPN_SQRTROOT_CALL(call)				\
+  {									\
+    mp_ptr    wp, wp2;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp,	s->size, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);			\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_operand_dst (s, wp2, s->size);				\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+/* Calculate worst case for perfect_power
+   Worst case is multiple prime factors larger than trial div limit. */
+#define SPEED_ROUTINE_MPN_PERFECT_POWER(function)		 	\
+  {									\
+    mpz_t     r;							\
+    unsigned  i, power;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 10);				\
+									\
+    mpz_init (r);							\
+    power = s->size * GMP_NUMB_BITS / 17;				\
+    mpz_ui_pow_ui(r, (1 << 17) - 1, power - 1);				\
+    mpz_mul_ui(r, r, (1 << 16) + 1);	/* larger than 1000th prime */	\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (PTR(r), SIZ(r));					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (r);							\
+    return t;								\
+  }
+
+/* Calculate worst case (larger prime) for perfect_square */
+#define SPEED_ROUTINE_MPN_PERFECT_SQUARE(function)			\
+  {									\
+    mpz_t     r;							\
+    unsigned  i;							\
+    double    t;							\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+    mpz_init_set_n (r, s->xp, s->size / 2);				\
+    mpz_setbit (r, s->size * GMP_NUMB_BITS / 2 - 1);			\
+    mpz_mul (r, r, r);							\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function (PTR(r), SIZ(r));					\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    mpz_clear (r);							\
+    return t;								\
+  }
+
+
+/* s->size controls the number of limbs in the input, s->r is the base, or
+   decimal by default. */
+#define SPEED_ROUTINE_MPN_GET_STR(function)				\
+  {									\
+    unsigned char *wp;							\
+    mp_size_t wn;							\
+    mp_ptr xp;								\
+    int base;								\
+    unsigned i;								\
+    double t;								\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    base = s->r == 0 ? 10 : s->r;					\
+    SPEED_RESTRICT_COND (base >= 2 && base <= 256);			\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (xp, s->size + 1, s->align_xp);		\
+									\
+    MPN_SIZEINBASE (wn, s->xp, s->size, base);				\
+    wp = (unsigned char *) TMP_ALLOC (wn);				\
+									\
+    /* use this during development to guard against overflowing wp */	\
+    /*									\
+    MPN_COPY (xp, s->xp, s->size);					\
+    ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wn);		\
+    */									\
+									\
+    speed_operand_src (s, s->xp, s->size);				\
+    speed_operand_dst (s, xp, s->size);					\
+    speed_operand_dst (s, (mp_ptr) wp, wn/GMP_LIMB_BYTES);		\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	MPN_COPY (xp, s->xp, s->size);					\
+	function (wp, base, xp, s->size);				\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+/* s->size controls the number of digits in the input, s->r is the base, or
+   decimal by default. */
+#define SPEED_ROUTINE_MPN_SET_STR_CALL(call)				\
+  {									\
+    unsigned char *xp;							\
+    mp_ptr     wp;							\
+    mp_size_t  wn;							\
+    unsigned   i;							\
+    int        base;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 1);					\
+									\
+    base = s->r == 0 ? 10 : s->r;					\
+    SPEED_RESTRICT_COND (base >= 2 && base <= 256);			\
+									\
+    TMP_MARK;								\
+									\
+    xp = (unsigned char *) TMP_ALLOC (s->size);				\
+    for (i = 0; i < s->size; i++)					\
+      xp[i] = s->xp[i] % base;						\
+									\
+    LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);			\
+    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);			\
+									\
+    /* use this during development to check wn is big enough */		\
+    /*									\
+    ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn);		\
+    */									\
+									\
+    speed_operand_src (s, (mp_ptr) xp, s->size/GMP_LIMB_BYTES);	\
+    speed_operand_dst (s, wp, wn);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+
+/* Run an accel gcd find_a() function over various data values.  A set of
+   values is used in case some run particularly fast or slow.  The size
+   parameter is ignored, the amount of data tested is fixed.  */
+
+#define SPEED_ROUTINE_MPN_GCD_FINDA(function)				\
+  {									\
+    unsigned  i, j;							\
+    mp_limb_t cp[SPEED_BLOCK_SIZE][2];					\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    TMP_MARK;								\
+									\
+    /* low must be odd, high must be non-zero */			\
+    for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
+      {									\
+	cp[i][0] = s->xp_block[i] | 1;					\
+	cp[i][1] = s->yp_block[i] + (s->yp_block[i] == 0);		\
+      }									\
+									\
+    speed_operand_src (s, &cp[0][0], 2*SPEED_BLOCK_SIZE);		\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	j = SPEED_BLOCK_SIZE;						\
+	do								\
+	  {								\
+	    function (cp[j-1]);						\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE;					\
+    return t;								\
+  }
+
+
+/* "call" should do "count_foo_zeros(c,n)".
+   Give leading=1 if foo is leading zeros, leading=0 for trailing.
+   Give zero=1 if n=0 is allowed in the call, zero=0 if not.  */
+
+#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero)			\
+  {									\
+    mp_ptr     xp;							\
+    int        i, c;							\
+    unsigned   j;							\
+    mp_limb_t  n;							\
+    double     t;							\
+    TMP_DECL;								\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp);		\
+									\
+    if (! speed_routine_count_zeros_setup (s, xp, leading, zero))	\
+      return -1.0;							\
+    speed_operand_src (s, xp, SPEED_BLOCK_SIZE);			\
+    speed_cache_fill (s);						\
+									\
+    c = 0;								\
+    speed_starttime ();							\
+    j = s->reps;							\
+    do {								\
+      for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
+	{								\
+	  n = xp[i];							\
+	  n ^= c;							\
+
+#define SPEED_ROUTINE_COUNT_ZEROS_B()					\
+	}								\
+    } while (--j != 0);							\
+    t = speed_endtime ();						\
+									\
+    /* don't let c go dead */						\
+    noop_1 (c);								\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE;					\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }									\
+
+#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero)		\
+  do {									\
+    SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero);			\
+    call;								\
+    SPEED_ROUTINE_COUNT_ZEROS_B ();					\
+  } while (0)								\
+
+#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero)			\
+  SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero)
+#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun)				\
+  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0)
+
+#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero)			\
+  SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)
+#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(call)			\
+  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
+
+
+#define SPEED_ROUTINE_INVERT_LIMB_CALL(call)				\
+  {									\
+    unsigned   i, j;							\
+    mp_limb_t  d, dinv=0;						\
+    mp_ptr     xp = s->xp_block - 1;					\
+									\
+    s->time_divisor = SPEED_BLOCK_SIZE;					\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      {									\
+	j = SPEED_BLOCK_SIZE;						\
+	do								\
+	  {								\
+	    d = dinv ^ xp[j];						\
+	    d |= GMP_LIMB_HIGHBIT;					\
+	    do { call; } while (0);					\
+	  }								\
+	while (--j != 0);						\
+      }									\
+    while (--i != 0);							\
+									\
+    /* don't let the compiler optimize everything away */		\
+    noop_1 (dinv);							\
+									\
+    return speed_endtime();						\
+  }
+
+
+#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function)			\
+  {									\
+    unsigned  i;							\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function ();							\
+    while (--i != 0);							\
+    return speed_endtime ();						\
+  }
+
+
+#define SPEED_ROUTINE_MPN_ZERO_CALL(call)				\
+  {									\
+    mp_ptr    wp;							\
+    unsigned  i;							\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 0);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
+    speed_operand_dst (s, wp, s->size);					\
+    speed_cache_fill (s);						\
+									\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      call;								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_ZERO(function)				\
+  SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size))
+
+
+#endif

diff --git a/third_party/gmp/tune/time.c b/third_party/gmp/tune/time.c
new file mode 100644
index 0000000..9060c3d
--- /dev/null
+++ b/third_party/gmp/tune/time.c

@@ -0,0 +1,1596 @@
+/* Time routines for speed measurements.
+
+Copyright 1999-2004, 2010-2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/* Usage:
+
+   The code in this file implements the lowest level of time measuring,
+   simple one-time measuring of time between two points.
+
+   void speed_starttime (void)
+   double speed_endtime (void)
+       Call speed_starttime to start measuring, and then call speed_endtime
+       when done.
+
+       speed_endtime returns the time taken, in seconds.  Or if the timebase
+       is in CPU cycles and the CPU frequency is unknown then speed_endtime
+       returns cycles.  Applications can identify the cycles return by
+       checking for speed_cycletime (described below) equal to 1.0.
+
+       If some sort of temporary glitch occurs then speed_endtime returns
+       0.0.  Currently this is for various cases where a negative time has
+       occurred.  This unfortunately occurs with getrusage on some systems,
+       and with the hppa cycle counter on hpux.
+
+   double speed_cycletime
+       The time in seconds for each CPU cycle.  For example on a 100 MHz CPU
+       this would be 1.0e-8.
+
+       If the CPU frequency is unknown, then speed_cycletime is either 0.0
+       or 1.0.  It's 0.0 when speed_endtime is returning seconds, or it's
+       1.0 when speed_endtime is returning cycles.
+
+       It may be noted that "speed_endtime() / speed_cycletime" gives a
+       measured time in cycles, irrespective of whether speed_endtime is
+       returning cycles or seconds.  (Assuming cycles can be had, ie. it's
+       either cycles already or the cpu frequency is known.  See also
+       speed_cycletime_need_cycles below.)
+
+   double speed_unittime
+       The unit of time measurement accuracy for the timing method in use.
+       This is in seconds or cycles, as per speed_endtime.
+
+   char speed_time_string[]
+       A null-terminated string describing the time method in use.
+
+   void speed_time_init (void)
+       Initialize time measuring.  speed_starttime() does this
+       automatically, so it's only needed if an application wants to inspect
+       the above global variables before making a measurement.
+
+   int speed_precision
+       The intended accuracy of time measurements.  speed_measure() in
+       common.c for instance runs target routines with enough repetitions so
+       it takes at least "speed_unittime * speed_precision" (this expression
+       works for both cycles or seconds from speed_endtime).
+
+       A program can provide an option so the user to set speed_precision.
+       If speed_precision is zero when speed_time_init or speed_starttime
+       first run then it gets a default based on the measuring method
+       chosen.  (More precision for higher accuracy methods.)
+
+   void speed_cycletime_need_seconds (void)
+       Call this to demand that speed_endtime will return seconds, and not
+       cycles.  If only cycles are available then an error is printed and
+       the program exits.
+
+   void speed_cycletime_need_cycles (void)
+       Call this to demand that speed_cycletime is non-zero, so that
+       "speed_endtime() / speed_cycletime" will give times in cycles.
+
+
+
+   Notes:
+
+   Various combinations of cycle counter, read_real_time(), getrusage(),
+   gettimeofday() and times() can arise, according to which are available
+   and their precision.
+
+
+   Allowing speed_endtime() to return either seconds or cycles is only a
+   slight complication and makes it possible for the speed program to do
+   some sensible things without demanding the CPU frequency.  If seconds are
+   being measured then it can always print seconds, and if cycles are being
+   measured then it can always print them without needing to know how long
+   they are.  Also the tune program doesn't care at all what the units are.
+
+   GMP_CPU_FREQUENCY can always be set when the automated methods in freq.c
+   fail.  This will be needed if times in seconds are wanted but a cycle
+   counter is being used, or if times in cycles are wanted but getrusage or
+   another seconds based timer is in use.
+
+   If the measuring method uses a cycle counter but supplements it with
+   getrusage or the like, then knowing the CPU frequency is mandatory since
+   the code compares values from the two.
+
+
+   Not done:
+
+   Solaris gethrtime() seems no more than a slow way to access the Sparc V9
+   cycle counter.  gethrvtime() seems to be relevant only to light weight
+   processes, it doesn't for instance give nanosecond virtual time.  So
+   neither of these are used.
+
+
+   Bugs:
+
+   getrusage_microseconds_p is fundamentally flawed, getrusage and
+   gettimeofday can have resolutions other than clock ticks or microseconds,
+   for instance IRIX 5 has a tick of 10 ms but a getrusage of 1 ms.
+
+
+   Enhancements:
+
+   The SGI hardware counter has 64 bits on some machines, which could be
+   used when available.  But perhaps 32 bits is enough range, and then rely
+   on the getrusage supplement.
+
+   Maybe getrusage (or times) should be used as a supplement for any
+   wall-clock measuring method.  Currently a wall clock with a good range
+   (eg. a 64-bit cycle counter) is used without a supplement.
+
+   On PowerPC the timebase registers could be used, but would have to do
+   something to find out the speed.  On 6xx chips it's normally 1/4 bus
+   speed, on 4xx chips it's either that or an external clock.  Measuring
+   against gettimeofday might be ok.  */
+
+
+#include "config.h"
+
+#include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h> /* for getenv() */
+
+#if HAVE_FCNTL_H
+#include <fcntl.h>  /* for open() */
+#endif
+
+#if HAVE_STDINT_H
+#include <stdint.h> /* for uint64_t */
+#endif
+
+#if HAVE_UNISTD_H
+#include <unistd.h> /* for sysconf() */
+#endif
+
+#include <sys/types.h>
+
+#if TIME_WITH_SYS_TIME
+# include <sys/time.h>  /* for struct timeval */
+# include <time.h>
+#else
+# if HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+
+#if HAVE_SYS_MMAN_H
+#include <sys/mman.h>      /* for mmap() */
+#endif
+
+#if HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>  /* for struct rusage */
+#endif
+
+#if HAVE_SYS_SYSSGI_H
+#include <sys/syssgi.h>    /* for syssgi() */
+#endif
+
+#if HAVE_SYS_SYSTEMCFG_H
+#include <sys/systemcfg.h> /* for RTC_POWER on AIX */
+#endif
+
+#if HAVE_SYS_TIMES_H
+#include <sys/times.h>  /* for times() and struct tms */
+#endif
+
+#include "gmp-impl.h"
+
+#include "speed.h"
+
+
+/* strerror is only used for some stuff on newish systems, no need to have a
+   proper replacement */
+#if ! HAVE_STRERROR
+#define strerror(n)  "<strerror not available>"
+#endif
+
+
+char    speed_time_string[256];
+int     speed_precision = 0;
+double  speed_unittime;
+double  speed_cycletime = 0.0;
+
+
+/* don't rely on "unsigned" to "double" conversion, it's broken in SunOS 4
+   native cc */
+#define M_2POWU   (((double) INT_MAX + 1.0) * 2.0)
+
+#define M_2POW32  4294967296.0
+#define M_2POW64  (M_2POW32 * M_2POW32)
+
+
+/* Conditionals for the time functions available are done with normal C
+   code, which is a lot easier than wildly nested preprocessor directives.
+
+   The choice of what to use is partly made at run-time, according to
+   whether the cycle counter works and the measured accuracy of getrusage
+   and gettimeofday.
+
+   A routine that's not available won't be getting called, but is an abort()
+   to be sure it isn't called mistakenly.
+
+   It can be assumed that if a function exists then its data type will, but
+   if the function doesn't then the data type might or might not exist, so
+   the type can't be used unconditionally.  The "struct_rusage" etc macros
+   provide dummies when the respective function doesn't exist. */
+
+
+#if HAVE_SPEED_CYCLECOUNTER
+static const int have_cycles = HAVE_SPEED_CYCLECOUNTER;
+#else
+static const int have_cycles = 0;
+#define speed_cyclecounter(p)  ASSERT_FAIL (speed_cyclecounter not available)
+#endif
+
+/* "stck" returns ticks since 1 Jan 1900 00:00 GMT, where each tick is 2^-12
+   microseconds.  Same #ifdefs here as in longlong.h.  */
+#if defined (__GNUC__) && ! defined (NO_ASM)                            \
+  && (defined (__i370__) || defined (__s390__) || defined (__mvs__))
+static const int  have_stck = 1;
+static const int  use_stck = 1;  /* always use when available */
+typedef uint64_t  stck_t; /* gcc for s390 is quite new, always has uint64_t */
+#define STCK(timestamp)                 \
+  do {                                  \
+    asm ("stck %0" : "=Q" (timestamp)); \
+  } while (0)
+#else
+static const int  have_stck = 0;
+static const int  use_stck = 0;
+typedef unsigned long  stck_t;   /* dummy */
+#define STCK(timestamp)  ASSERT_FAIL (stck instruction not available)
+#endif
+#define STCK_PERIOD      (1.0 / 4096e6)   /* 2^-12 microseconds */
+
+/* mftb
+   Enhancement: On 64-bit chips mftb gives a 64-bit value, no need for mftbu
+   and a loop (see powerpc64.asm).  */
+#if HAVE_HOST_CPU_FAMILY_powerpc
+static const int  have_mftb = 1;
+#if defined (__GNUC__) && ! defined (NO_ASM)
+#define MFTB(a)                         \
+  do {                                  \
+    unsigned  __h1, __l, __h2;          \
+    do {                                \
+      asm volatile ("mftbu %0\n"        \
+		    "mftb  %1\n"        \
+		    "mftbu %2"          \
+		    : "=r" (__h1),      \
+		      "=r" (__l),       \
+		      "=r" (__h2));     \
+    } while (__h1 != __h2);             \
+    a[0] = __l;                         \
+    a[1] = __h1;                        \
+  } while (0)
+#else
+#define MFTB(a)   mftb_function (a)
+#endif
+#else /* ! powerpc */
+static const int  have_mftb = 0;
+#define MFTB(a)                         \
+  do {                                  \
+    a[0] = 0;                           \
+    a[1] = 0;                           \
+    ASSERT_FAIL (mftb not available);   \
+  } while (0)
+#endif
+
+/* Unicos 10.X has syssgi(), but not mmap(). */
+#if HAVE_SYSSGI && HAVE_MMAP
+static const int  have_sgi = 1;
+#else
+static const int  have_sgi = 0;
+#endif
+
+#if HAVE_READ_REAL_TIME
+static const int have_rrt = 1;
+#else
+static const int have_rrt = 0;
+#define read_real_time(t,s)     ASSERT_FAIL (read_real_time not available)
+#define time_base_to_time(t,s)  ASSERT_FAIL (time_base_to_time not available)
+#define RTC_POWER     1
+#define RTC_POWER_PC  2
+#define timebasestruct_t   struct timebasestruct_dummy
+struct timebasestruct_dummy {
+  int             flag;
+  unsigned int    tb_high;
+  unsigned int    tb_low;
+};
+#endif
+
+#if HAVE_CLOCK_GETTIME
+static const int have_cgt = 1;
+#define struct_timespec  struct timespec
+#else
+static const int have_cgt = 0;
+#define struct_timespec       struct timespec_dummy
+#define clock_gettime(id,ts)  (ASSERT_FAIL (clock_gettime not available), -1)
+#define clock_getres(id,ts)   (ASSERT_FAIL (clock_getres not available), -1)
+#endif
+
+#if HAVE_GETRUSAGE
+static const int have_grus = 1;
+#define struct_rusage   struct rusage
+#else
+static const int have_grus = 0;
+#define getrusage(n,ru)  ASSERT_FAIL (getrusage not available)
+#define struct_rusage    struct rusage_dummy
+#endif
+
+#if HAVE_GETTIMEOFDAY
+static const int have_gtod = 1;
+#define struct_timeval   struct timeval
+#else
+static const int have_gtod = 0;
+#define gettimeofday(tv,tz)  ASSERT_FAIL (gettimeofday not available)
+#define struct_timeval   struct timeval_dummy
+#endif
+
+#if HAVE_TIMES
+static const int have_times = 1;
+#define struct_tms   struct tms
+#else
+static const int have_times = 0;
+#define times(tms)   ASSERT_FAIL (times not available)
+#define struct_tms   struct tms_dummy
+#endif
+
+struct tms_dummy {
+  long  tms_utime;
+};
+struct timeval_dummy {
+  long  tv_sec;
+  long  tv_usec;
+};
+struct rusage_dummy {
+  struct_timeval ru_utime;
+};
+struct timespec_dummy {
+  long  tv_sec;
+  long  tv_nsec;
+};
+
+static int  use_cycles;
+static int  use_mftb;
+static int  use_sgi;
+static int  use_rrt;
+static int  use_cgt;
+static int  use_gtod;
+static int  use_grus;
+static int  use_times;
+static int  use_tick_boundary;
+
+static unsigned         start_cycles[2];
+static stck_t           start_stck;
+static unsigned         start_mftb[2];
+static unsigned         start_sgi;
+static timebasestruct_t start_rrt;
+static struct_timespec  start_cgt;
+static struct_rusage    start_grus;
+static struct_timeval   start_gtod;
+static struct_tms       start_times;
+
+static double  cycles_limit = 1e100;
+static double  mftb_unittime;
+static double  sgi_unittime;
+static double  cgt_unittime;
+static double  grus_unittime;
+static double  gtod_unittime;
+static double  times_unittime;
+
+/* for RTC_POWER format, ie. seconds and nanoseconds */
+#define TIMEBASESTRUCT_SECS(t)  ((t)->tb_high + (t)->tb_low * 1e-9)
+
+
+/* Return a string representing a time in seconds, nicely formatted.
+   Eg. "10.25ms".  */
+char *
+unittime_string (double t)
+{
+  static char  buf[128];
+
+  const char  *unit;
+  int         prec;
+
+  /* choose units and scale */
+  if (t < 1e-6)
+    t *= 1e9, unit = "ns";
+  else if (t < 1e-3)
+    t *= 1e6, unit = "us";
+  else if (t < 1.0)
+    t *= 1e3, unit = "ms";
+  else
+    unit = "s";
+
+  /* want 4 significant figures */
+  if (t < 1.0)
+    prec = 4;
+  else if (t < 10.0)
+    prec = 3;
+  else if (t < 100.0)
+    prec = 2;
+  else
+    prec = 1;
+
+  sprintf (buf, "%.*f%s", prec, t, unit);
+  return buf;
+}
+
+
+static jmp_buf  cycles_works_buf;
+
+static RETSIGTYPE
+cycles_works_handler (int sig)
+{
+  longjmp (cycles_works_buf, 1);
+}
+
+int
+cycles_works_p (void)
+{
+  static int  result = -1;
+
+  if (result != -1)
+    goto done;
+
+  /* FIXME: On linux, the cycle counter is not saved and restored over
+   * context switches, making it almost useless for precise cputime
+   * measurements. When available, it's better to use clock_gettime,
+   * which seems to have reasonable accuracy (tested on x86_32,
+   * linux-2.6.26, glibc-2.7). However, there are also some linux
+   * systems where clock_gettime is broken in one way or the other,
+   * like CLOCK_PROCESS_CPUTIME_ID not implemented (easy case) or
+   * kind-of implemented but broken (needs code to detect that), and
+   * on those systems a wall-clock cycle counter is the least bad
+   * fallback.
+   *
+   * So we need some code to disable the cycle counter on some but not
+   * all linux systems. */
+#ifdef SIGILL
+  {
+    RETSIGTYPE (*old_handler) (int);
+    unsigned  cycles[2];
+
+    old_handler = signal (SIGILL, cycles_works_handler);
+    if (old_handler == SIG_ERR)
+      {
+	if (speed_option_verbose)
+	  printf ("cycles_works_p(): SIGILL not supported, assuming speed_cyclecounter() works\n");
+	goto yes;
+      }
+    if (setjmp (cycles_works_buf))
+      {
+	if (speed_option_verbose)
+	  printf ("cycles_works_p(): SIGILL during speed_cyclecounter(), so doesn't work\n");
+	result = 0;
+	goto done;
+      }
+    speed_cyclecounter (cycles);
+    signal (SIGILL, old_handler);
+    if (speed_option_verbose)
+      printf ("cycles_works_p(): speed_cyclecounter() works\n");
+  }
+#else
+
+  if (speed_option_verbose)
+    printf ("cycles_works_p(): SIGILL not defined, assuming speed_cyclecounter() works\n");
+  goto yes;
+#endif
+
+ yes:
+  result = 1;
+
+ done:
+  return result;
+}
+
+
+/* The number of clock ticks per second, but looking at sysconf rather than
+   just CLK_TCK, where possible.  */
+long
+clk_tck (void)
+{
+  static long  result = -1L;
+  if (result != -1L)
+    return result;
+
+#if HAVE_SYSCONF
+  result = sysconf (_SC_CLK_TCK);
+  if (result != -1L)
+    {
+      if (speed_option_verbose)
+	printf ("sysconf(_SC_CLK_TCK) is %ld per second\n", result);
+      return result;
+    }
+
+  fprintf (stderr,
+	   "sysconf(_SC_CLK_TCK) not working, using CLK_TCK instead\n");
+#endif
+
+#ifdef CLK_TCK
+  result = CLK_TCK;
+  if (speed_option_verbose)
+    printf ("CLK_TCK is %ld per second\n", result);
+  return result;
+#else
+  fprintf (stderr, "CLK_TCK not defined, cannot continue\n");
+  abort ();
+#endif
+}
+
+
+/* If two times can be observed less than half a clock tick apart, then
+   assume "get" is microsecond accurate.
+
+   Two times only 1 microsecond apart are not believed, since some kernels
+   take it upon themselves to ensure gettimeofday doesn't return the same
+   value twice, for the benefit of applications using it for a timestamp.
+   This is obviously very stupid given the speed of CPUs these days.
+
+   Making "reps" many calls to noop_1() is designed to waste some CPU, with
+   a view to getting measurements 2 microseconds (or more) apart.  "reps" is
+   increased progressively until such a period is seen.
+
+   The outer loop "attempts" are just to allow for any random nonsense or
+   system load upsetting the measurements (ie. making two successive calls
+   to "get" come out as a longer interval than normal).
+
+   Bugs:
+
+   The assumption that any interval less than a half tick implies
+   microsecond resolution is obviously fairly rash, the true resolution
+   could be anything between a microsecond and that half tick.  Perhaps
+   something special would have to be done on a system where this is the
+   case, since there's no obvious reliable way to detect it
+   automatically.  */
+
+#define MICROSECONDS_P(name, type, get, sec, usec)                      \
+  {                                                                     \
+    static int  result = -1;                                            \
+    type      st, et;                                                   \
+    long      dt, half_tick;                                            \
+    unsigned  attempt, reps, i, j;                                      \
+									\
+    if (result != -1)                                                   \
+      return result;                                                    \
+									\
+    result = 0;                                                         \
+    half_tick = (1000000L / clk_tck ()) / 2;                            \
+									\
+    for (attempt = 0; attempt < 5; attempt++)                           \
+      {                                                                 \
+	reps = 0;                                                       \
+	for (;;)                                                        \
+	  {                                                             \
+	    get (st);                                                   \
+	    for (i = 0; i < reps; i++)                                  \
+	      for (j = 0; j < 100; j++)                                 \
+		noop_1 (CNST_LIMB(0));                                  \
+	    get (et);                                                   \
+									\
+	    dt = (sec(et)-sec(st))*1000000L + usec(et)-usec(st);        \
+									\
+	    if (speed_option_verbose >= 2)                              \
+	      printf ("%s attempt=%u, reps=%u, dt=%ld\n",               \
+		      name, attempt, reps, dt);                         \
+									\
+	    if (dt >= 2)                                                \
+	      break;                                                    \
+									\
+	    reps = (reps == 0 ? 1 : 2*reps);                            \
+	    if (reps == 0)                                              \
+	      break;  /* uint overflow, not normal */                   \
+	  }                                                             \
+									\
+	if (dt < half_tick)                                             \
+	  {                                                             \
+	    result = 1;                                                 \
+	    break;                                                      \
+	  }                                                             \
+      }                                                                 \
+									\
+    if (speed_option_verbose)                                           \
+      {                                                                 \
+	if (result)                                                     \
+	  printf ("%s is microsecond accurate\n", name);                \
+	else                                                            \
+	  printf ("%s is only %s clock tick accurate\n",                \
+		  name, unittime_string (1.0/clk_tck()));               \
+      }                                                                 \
+    return result;                                                      \
+  }
+
+
+int
+gettimeofday_microseconds_p (void)
+{
+#define call_gettimeofday(t)   gettimeofday (&(t), NULL)
+#define timeval_tv_sec(t)      ((t).tv_sec)
+#define timeval_tv_usec(t)     ((t).tv_usec)
+  MICROSECONDS_P ("gettimeofday", struct_timeval,
+		  call_gettimeofday, timeval_tv_sec, timeval_tv_usec);
+}
+
+int
+getrusage_microseconds_p (void)
+{
+#define call_getrusage(t)   getrusage (0, &(t))
+#define rusage_tv_sec(t)    ((t).ru_utime.tv_sec)
+#define rusage_tv_usec(t)   ((t).ru_utime.tv_usec)
+  MICROSECONDS_P ("getrusage", struct_rusage,
+		  call_getrusage, rusage_tv_sec, rusage_tv_usec);
+}
+
+/* Test whether getrusage goes backwards, return non-zero if it does
+   (suggesting it's flawed).
+
+   On a macintosh m68040-unknown-netbsd1.4.1 getrusage looks like it's
+   microsecond accurate, but has been seen remaining unchanged after many
+   microseconds have elapsed.  It also regularly goes backwards by 1000 to
+   5000 usecs, this has been seen after between 500 and 4000 attempts taking
+   perhaps 0.03 seconds.  We consider this too broken for good measuring.
+   We used to have configure pretend getrusage didn't exist on this system,
+   but a runtime test should be more reliable, since we imagine the problem
+   is not confined to just this exact system tuple.  */
+
+int
+getrusage_backwards_p (void)
+{
+  static int result = -1;
+  struct rusage  start, prev, next;
+  long  d;
+  int   i;
+
+  if (result != -1)
+    return result;
+
+  getrusage (0, &start);
+  memcpy (&next, &start, sizeof (next));
+
+  result = 0;
+  i = 0;
+  for (;;)
+    {
+      memcpy (&prev, &next, sizeof (prev));
+      getrusage (0, &next);
+
+      if (next.ru_utime.tv_sec < prev.ru_utime.tv_sec
+	  || (next.ru_utime.tv_sec == prev.ru_utime.tv_sec
+	      && next.ru_utime.tv_usec < prev.ru_utime.tv_usec))
+	{
+	  if (speed_option_verbose)
+	    printf ("getrusage went backwards (attempt %d: %ld.%06ld -> %ld.%06ld)\n",
+		    i,
+		    (long) prev.ru_utime.tv_sec, (long) prev.ru_utime.tv_usec,
+		    (long) next.ru_utime.tv_sec, (long) next.ru_utime.tv_usec);
+	  result = 1;
+	  break;
+	}
+
+      /* minimum 1000 attempts, then stop after either 0.1 seconds or 50000
+	 attempts, whichever comes first */
+      d = 1000000 * (next.ru_utime.tv_sec - start.ru_utime.tv_sec)
+	+ (next.ru_utime.tv_usec - start.ru_utime.tv_usec);
+      i++;
+      if (i > 50000 || (i > 1000 && d > 100000))
+	break;
+    }
+
+  return result;
+}
+
+/* CLOCK_PROCESS_CPUTIME_ID looks like it's going to be in a future version
+   of glibc (some time post 2.2).
+
+   CLOCK_VIRTUAL is process time, available in BSD systems (though sometimes
+   defined, but returning -1 for an error).  */
+
+#ifdef CLOCK_PROCESS_CPUTIME_ID
+# define CGT_ID        CLOCK_PROCESS_CPUTIME_ID
+#else
+# ifdef CLOCK_VIRTUAL
+#  define CGT_ID       CLOCK_VIRTUAL
+# endif
+#endif
+#ifdef CGT_ID
+const int  have_cgt_id = 1;
+#else
+const int  have_cgt_id = 0;
+# define CGT_ID       (ASSERT_FAIL (CGT_ID not determined), -1)
+#endif
+
+#define CGT_DELAY_COUNT 1000
+
+int
+cgt_works_p (void)
+{
+  static int  result = -1;
+  struct_timespec  unit;
+
+  if (! have_cgt)
+    return 0;
+
+  if (! have_cgt_id)
+    {
+      if (speed_option_verbose)
+	printf ("clock_gettime don't know what ID to use\n");
+      result = 0;
+      return result;
+    }
+
+  if (result != -1)
+    return result;
+
+  /* trial run to see if it works */
+  if (clock_gettime (CGT_ID, &unit) != 0)
+    {
+      if (speed_option_verbose)
+	printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno));
+      result = 0;
+      return result;
+    }
+
+  /* get the resolution */
+  if (clock_getres (CGT_ID, &unit) != 0)
+    {
+      if (speed_option_verbose)
+	printf ("clock_getres id=%d error: %s\n", CGT_ID, strerror (errno));
+      result = 0;
+      return result;
+    }
+
+  cgt_unittime = unit.tv_sec + unit.tv_nsec * 1e-9;
+  if (speed_option_verbose)
+    printf ("clock_gettime is %s accurate\n", unittime_string (cgt_unittime));
+
+  if (cgt_unittime < 10e-9)
+    {
+      /* Do we believe this? */
+      struct timespec start, end;
+      static volatile int counter;
+      double duration;
+      if (clock_gettime (CGT_ID, &start))
+	{
+	  if (speed_option_verbose)
+	    printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno));
+	  result = 0;
+	  return result;
+	}
+      /* Loop of at least 1000 memory accesses, ought to take at
+	 least 100 ns*/
+      for (counter = 0; counter < CGT_DELAY_COUNT; counter++)
+	;
+      if (clock_gettime (CGT_ID, &end))
+	{
+	  if (speed_option_verbose)
+	    printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno));
+	  result = 0;
+	  return result;
+	}
+      duration = (end.tv_sec + end.tv_nsec * 1e-9
+		  - start.tv_sec - start.tv_nsec * 1e-9);
+      if (speed_option_verbose)
+	printf ("delay loop of %d rounds took %s (according to clock_gettime)\n",
+		CGT_DELAY_COUNT, unittime_string (duration));
+      if (duration < 100e-9)
+	{
+	  if (speed_option_verbose)
+	    printf ("clock_gettime id=%d not believable\n", CGT_ID);
+	  result = 0;
+	  return result;
+	}
+    }
+  result = 1;
+  return result;
+}
+
+
+static double
+freq_measure_mftb_one (void)
+{
+#define call_gettimeofday(t)   gettimeofday (&(t), NULL)
+#define timeval_tv_sec(t)      ((t).tv_sec)
+#define timeval_tv_usec(t)     ((t).tv_usec)
+  FREQ_MEASURE_ONE ("mftb", struct_timeval,
+		    call_gettimeofday, MFTB,
+		    timeval_tv_sec, timeval_tv_usec);
+}
+
+
+static jmp_buf  mftb_works_buf;
+
+static RETSIGTYPE
+mftb_works_handler (int sig)
+{
+  longjmp (mftb_works_buf, 1);
+}
+
+int
+mftb_works_p (void)
+{
+  unsigned   a[2];
+  RETSIGTYPE (*old_handler) (int);
+  double     cycletime;
+
+  /* suppress a warning about a[] unused */
+  a[0] = 0;
+
+  if (! have_mftb)
+    return 0;
+
+#ifdef SIGILL
+  old_handler = signal (SIGILL, mftb_works_handler);
+  if (old_handler == SIG_ERR)
+    {
+      if (speed_option_verbose)
+	printf ("mftb_works_p(): SIGILL not supported, assuming mftb works\n");
+      return 1;
+    }
+  if (setjmp (mftb_works_buf))
+    {
+      if (speed_option_verbose)
+	printf ("mftb_works_p(): SIGILL during mftb, so doesn't work\n");
+      return 0;
+    }
+  MFTB (a);
+  signal (SIGILL, old_handler);
+  if (speed_option_verbose)
+    printf ("mftb_works_p(): mftb works\n");
+#else
+
+  if (speed_option_verbose)
+    printf ("mftb_works_p(): SIGILL not defined, assuming mftb works\n");
+#endif
+
+#if ! HAVE_GETTIMEOFDAY
+  if (speed_option_verbose)
+    printf ("mftb_works_p(): no gettimeofday available to measure mftb\n");
+  return 0;
+#endif
+
+  /* The time base is normally 1/4 of the bus speed on 6xx and 7xx chips, on
+     other chips it can be driven from an external clock. */
+  cycletime = freq_measure ("mftb", freq_measure_mftb_one);
+  if (cycletime == -1.0)
+    {
+      if (speed_option_verbose)
+	printf ("mftb_works_p(): cannot measure mftb period\n");
+      return 0;
+    }
+
+  mftb_unittime = cycletime;
+  return 1;
+}
+
+
+volatile unsigned  *sgi_addr;
+
+int
+sgi_works_p (void)
+{
+#if HAVE_SYSSGI && HAVE_MMAP
+  static int  result = -1;
+
+  size_t          pagesize, offset;
+  __psunsigned_t  phys, physpage;
+  void            *virtpage;
+  unsigned        period_picoseconds;
+  int             size, fd;
+
+  if (result != -1)
+    return result;
+
+  phys = syssgi (SGI_QUERY_CYCLECNTR, &period_picoseconds);
+  if (phys == (__psunsigned_t) -1)
+    {
+      /* ENODEV is the error when a counter is not available */
+      if (speed_option_verbose)
+	printf ("syssgi SGI_QUERY_CYCLECNTR error: %s\n", strerror (errno));
+      result = 0;
+      return result;
+    }
+  sgi_unittime = period_picoseconds * 1e-12;
+
+  /* IRIX 5 doesn't have SGI_CYCLECNTR_SIZE, assume 32 bits in that case.
+     Challenge/ONYX hardware has a 64 bit byte counter, but there seems no
+     obvious way to identify that without SGI_CYCLECNTR_SIZE.  */
+#ifdef SGI_CYCLECNTR_SIZE
+  size = syssgi (SGI_CYCLECNTR_SIZE);
+  if (size == -1)
+    {
+      if (speed_option_verbose)
+	{
+	  printf ("syssgi SGI_CYCLECNTR_SIZE error: %s\n", strerror (errno));
+	  printf ("    will assume size==4\n");
+	}
+      size = 32;
+    }
+#else
+  size = 32;
+#endif
+
+  if (size < 32)
+    {
+      printf ("syssgi SGI_CYCLECNTR_SIZE gives %d, expected 32 or 64\n", size);
+      result = 0;
+      return result;
+    }
+
+  pagesize = getpagesize();
+  offset = (size_t) phys & (pagesize-1);
+  physpage = phys - offset;
+
+  /* shouldn't cross over a page boundary */
+  ASSERT_ALWAYS (offset + size/8 <= pagesize);
+
+  fd = open("/dev/mmem", O_RDONLY);
+  if (fd == -1)
+    {
+      if (speed_option_verbose)
+	printf ("open /dev/mmem: %s\n", strerror (errno));
+      result = 0;
+      return result;
+    }
+
+  virtpage = mmap (0, pagesize, PROT_READ, MAP_PRIVATE, fd, (off_t) physpage);
+  if (virtpage == (void *) -1)
+    {
+      if (speed_option_verbose)
+	printf ("mmap /dev/mmem: %s\n", strerror (errno));
+      result = 0;
+      return result;
+    }
+
+  /* address of least significant 4 bytes, knowing mips is big endian */
+  sgi_addr = (unsigned *) ((char *) virtpage + offset
+			   + size/8 - sizeof(unsigned));
+  result = 1;
+  return result;
+
+#else /* ! (HAVE_SYSSGI && HAVE_MMAP) */
+  return 0;
+#endif
+}
+
+
+#define DEFAULT(var,n)  \
+  do {                  \
+    if (! (var))        \
+      (var) = (n);      \
+  } while (0)
+
+void
+speed_time_init (void)
+{
+  double supplement_unittime = 0.0;
+
+  static int  speed_time_initialized = 0;
+  if (speed_time_initialized)
+    return;
+  speed_time_initialized = 1;
+
+  speed_cycletime_init ();
+
+  if (!speed_option_cycles_broken && have_cycles && cycles_works_p ())
+    {
+      use_cycles = 1;
+      DEFAULT (speed_cycletime, 1.0);
+      speed_unittime = speed_cycletime;
+      DEFAULT (speed_precision, 10000);
+      strcpy (speed_time_string, "CPU cycle counter");
+
+      /* only used if a supplementary method is chosen below */
+      cycles_limit = (have_cycles == 1 ? M_2POW32 : M_2POW64) / 2.0
+	* speed_cycletime;
+
+      if (have_grus && getrusage_microseconds_p() && ! getrusage_backwards_p())
+	{
+	  /* this is a good combination */
+	  use_grus = 1;
+	  supplement_unittime = grus_unittime = 1.0e-6;
+	  strcpy (speed_time_string, "CPU cycle counter, supplemented by microsecond getrusage()");
+	}
+      else if (have_cycles == 1)
+	{
+	  /* When speed_cyclecounter has a limited range, look for something
+	     to supplement it. */
+	  if (have_gtod && gettimeofday_microseconds_p())
+	    {
+	      use_gtod = 1;
+	      supplement_unittime = gtod_unittime = 1.0e-6;
+	      strcpy (speed_time_string, "CPU cycle counter, supplemented by microsecond gettimeofday()");
+	    }
+	  else if (have_grus)
+	    {
+	      use_grus = 1;
+	      supplement_unittime = grus_unittime = 1.0 / (double) clk_tck ();
+	      sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick getrusage()", unittime_string (supplement_unittime));
+	    }
+	  else if (have_times)
+	    {
+	      use_times = 1;
+	      supplement_unittime = times_unittime = 1.0 / (double) clk_tck ();
+	      sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick times()", unittime_string (supplement_unittime));
+	    }
+	  else if (have_gtod)
+	    {
+	      use_gtod = 1;
+	      supplement_unittime = gtod_unittime = 1.0 / (double) clk_tck ();
+	      sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick gettimeofday()", unittime_string (supplement_unittime));
+	    }
+	  else
+	    {
+	      fprintf (stderr, "WARNING: cycle counter is 32 bits and there's no other functions.\n");
+	      fprintf (stderr, "    Wraparounds may produce bad results on long measurements.\n");
+	    }
+	}
+
+      if (use_grus || use_times || use_gtod)
+	{
+	  /* must know cycle period to compare cycles to other measuring
+	     (via cycles_limit) */
+	  speed_cycletime_need_seconds ();
+
+	  if (speed_precision * supplement_unittime > cycles_limit)
+	    {
+	      fprintf (stderr, "WARNING: requested precision can't always be achieved due to limited range\n");
+	      fprintf (stderr, "    cycle counter and limited precision supplemental method\n");
+	      fprintf (stderr, "    (%s)\n", speed_time_string);
+	    }
+	}
+    }
+  else if (have_stck)
+    {
+      strcpy (speed_time_string, "STCK timestamp");
+      /* stck is in units of 2^-12 microseconds, which is very likely higher
+	 resolution than a cpu cycle */
+      if (speed_cycletime == 0.0)
+	speed_cycletime_fail
+	  ("Need to know CPU frequency for effective stck unit");
+      speed_unittime = MAX (speed_cycletime, STCK_PERIOD);
+      DEFAULT (speed_precision, 10000);
+    }
+  else if (have_mftb && mftb_works_p ())
+    {
+      use_mftb = 1;
+      DEFAULT (speed_precision, 10000);
+      speed_unittime = mftb_unittime;
+      sprintf (speed_time_string, "mftb counter (%s)",
+	       unittime_string (speed_unittime));
+    }
+  else if (have_sgi && sgi_works_p ())
+    {
+      use_sgi = 1;
+      DEFAULT (speed_precision, 10000);
+      speed_unittime = sgi_unittime;
+      sprintf (speed_time_string, "syssgi() mmap counter (%s), supplemented by millisecond getrusage()",
+	       unittime_string (speed_unittime));
+      /* supplemented with getrusage, which we assume to have 1ms resolution */
+      use_grus = 1;
+      supplement_unittime = 1e-3;
+    }
+  else if (have_rrt)
+    {
+      timebasestruct_t  t;
+      use_rrt = 1;
+      DEFAULT (speed_precision, 10000);
+      read_real_time (&t, sizeof(t));
+      switch (t.flag) {
+      case RTC_POWER:
+	/* FIXME: What's the actual RTC resolution? */
+	speed_unittime = 1e-7;
+	strcpy (speed_time_string, "read_real_time() power nanoseconds");
+	break;
+      case RTC_POWER_PC:
+	t.tb_high = 1;
+	t.tb_low = 0;
+	time_base_to_time (&t, sizeof(t));
+	speed_unittime = TIMEBASESTRUCT_SECS(&t) / M_2POW32;
+	sprintf (speed_time_string, "%s read_real_time() powerpc ticks",
+		 unittime_string (speed_unittime));
+	break;
+      default:
+	fprintf (stderr, "ERROR: Unrecognised timebasestruct_t flag=%d\n",
+		 t.flag);
+	abort ();
+      }
+    }
+  else if (have_cgt && cgt_works_p() && cgt_unittime < 1.5e-6)
+    {
+      /* use clock_gettime if microsecond or better resolution */
+    choose_cgt:
+      use_cgt = 1;
+      speed_unittime = cgt_unittime;
+      DEFAULT (speed_precision, (cgt_unittime <= 0.1e-6 ? 10000 : 1000));
+      strcpy (speed_time_string, "microsecond accurate clock_gettime()");
+    }
+  else if (have_times && clk_tck() > 1000000)
+    {
+      /* Cray vector systems have times() which is clock cycle resolution
+	 (eg. 450 MHz).  */
+      DEFAULT (speed_precision, 10000);
+      goto choose_times;
+    }
+  else if (have_grus && getrusage_microseconds_p() && ! getrusage_backwards_p())
+    {
+      use_grus = 1;
+      speed_unittime = grus_unittime = 1.0e-6;
+      DEFAULT (speed_precision, 1000);
+      strcpy (speed_time_string, "microsecond accurate getrusage()");
+    }
+  else if (have_gtod && gettimeofday_microseconds_p())
+    {
+      use_gtod = 1;
+      speed_unittime = gtod_unittime = 1.0e-6;
+      DEFAULT (speed_precision, 1000);
+      strcpy (speed_time_string, "microsecond accurate gettimeofday()");
+    }
+  else if (have_cgt && cgt_works_p() && cgt_unittime < 1.5/clk_tck())
+    {
+      /* use clock_gettime if 1 tick or better resolution */
+      goto choose_cgt;
+    }
+  else if (have_times)
+    {
+      use_tick_boundary = 1;
+      DEFAULT (speed_precision, 200);
+    choose_times:
+      use_times = 1;
+      speed_unittime = times_unittime = 1.0 / (double) clk_tck ();
+      sprintf (speed_time_string, "%s clock tick times()",
+	       unittime_string (speed_unittime));
+    }
+  else if (have_grus)
+    {
+      use_grus = 1;
+      use_tick_boundary = 1;
+      speed_unittime = grus_unittime = 1.0 / (double) clk_tck ();
+      DEFAULT (speed_precision, 200);
+      sprintf (speed_time_string, "%s clock tick getrusage()\n",
+	       unittime_string (speed_unittime));
+    }
+  else if (have_gtod)
+    {
+      use_gtod = 1;
+      use_tick_boundary = 1;
+      speed_unittime = gtod_unittime = 1.0 / (double) clk_tck ();
+      DEFAULT (speed_precision, 200);
+      sprintf (speed_time_string, "%s clock tick gettimeofday()",
+	       unittime_string (speed_unittime));
+    }
+  else
+    {
+      fprintf (stderr, "No time measuring method available\n");
+      fprintf (stderr, "None of: speed_cyclecounter(), STCK(), getrusage(), gettimeofday(), times()\n");
+      abort ();
+    }
+
+  if (speed_option_verbose)
+    {
+      printf ("speed_time_init: %s\n", speed_time_string);
+      printf ("    speed_precision     %d\n", speed_precision);
+      printf ("    speed_unittime      %.2g\n", speed_unittime);
+      if (supplement_unittime)
+	printf ("    supplement_unittime %.2g\n", supplement_unittime);
+      printf ("    use_tick_boundary   %d\n", use_tick_boundary);
+      if (have_cycles)
+	printf ("    cycles_limit        %.2g seconds\n", cycles_limit);
+    }
+}
+
+
+
+/* Burn up CPU until a clock tick boundary, for greater accuracy.  Set the
+   corresponding "start_foo" appropriately too. */
+
+void
+grus_tick_boundary (void)
+{
+  struct_rusage  prev;
+  getrusage (0, &prev);
+  do {
+    getrusage (0, &start_grus);
+  } while (start_grus.ru_utime.tv_usec == prev.ru_utime.tv_usec);
+}
+
+void
+gtod_tick_boundary (void)
+{
+  struct_timeval  prev;
+  gettimeofday (&prev, NULL);
+  do {
+    gettimeofday (&start_gtod, NULL);
+  } while (start_gtod.tv_usec == prev.tv_usec);
+}
+
+void
+times_tick_boundary (void)
+{
+  struct_tms  prev;
+  times (&prev);
+  do
+    times (&start_times);
+  while (start_times.tms_utime == prev.tms_utime);
+}
+
+
+/* "have_" values are tested to let unused code go dead.  */
+
+void
+speed_starttime (void)
+{
+  speed_time_init ();
+
+  if (have_grus && use_grus)
+    {
+      if (use_tick_boundary)
+	grus_tick_boundary ();
+      else
+	getrusage (0, &start_grus);
+    }
+
+  if (have_gtod && use_gtod)
+    {
+      if (use_tick_boundary)
+	gtod_tick_boundary ();
+      else
+	gettimeofday (&start_gtod, NULL);
+    }
+
+  if (have_times && use_times)
+    {
+      if (use_tick_boundary)
+	times_tick_boundary ();
+      else
+	times (&start_times);
+    }
+
+  if (have_cgt && use_cgt)
+    clock_gettime (CGT_ID, &start_cgt);
+
+  if (have_rrt && use_rrt)
+    read_real_time (&start_rrt, sizeof(start_rrt));
+
+  if (have_sgi && use_sgi)
+    start_sgi = *sgi_addr;
+
+  if (have_mftb && use_mftb)
+    MFTB (start_mftb);
+
+  if (have_stck && use_stck)
+    STCK (start_stck);
+
+  /* Cycles sampled last for maximum accuracy. */
+  if (have_cycles && use_cycles)
+    speed_cyclecounter (start_cycles);
+}
+
+
+/* Calculate the difference between two cycle counter samples, as a "double"
+   counter of cycles.
+
+   The start and end values are allowed to cancel in integers in case the
+   counter values are bigger than the 53 bits that normally fit in a double.
+
+   This works even if speed_cyclecounter() puts a value bigger than 32-bits
+   in the low word (the high word always gets a 2**32 multiplier though). */
+
+double
+speed_cyclecounter_diff (const unsigned end[2], const unsigned start[2])
+{
+  unsigned  d;
+  double    t;
+
+  if (have_cycles == 1)
+    {
+      t = (end[0] - start[0]);
+    }
+  else
+    {
+      d = end[0] - start[0];
+      t = d - (d > end[0] ? M_2POWU : 0.0);
+      t += (end[1] - start[1]) * M_2POW32;
+    }
+  return t;
+}
+
+
+double
+speed_mftb_diff (const unsigned end[2], const unsigned start[2])
+{
+  unsigned  d;
+  double    t;
+
+  d = end[0] - start[0];
+  t = (double) d - (d > end[0] ? M_2POW32 : 0.0);
+  t += (end[1] - start[1]) * M_2POW32;
+  return t;
+}
+
+
+/* Calculate the difference between "start" and "end" using fields "sec" and
+   "psec", where each "psec" is a "punit" of a second.
+
+   The seconds parts are allowed to cancel before being combined with the
+   psec parts, in case a simple "sec+psec*punit" exceeds the precision of a
+   double.
+
+   Total time is only calculated in a "double" since an integer count of
+   psecs might overflow.  2^32 microseconds is only a bit over an hour, or
+   2^32 nanoseconds only about 4 seconds.
+
+   The casts to "long" are for the benefit of timebasestruct_t, where the
+   fields are only "unsigned int", but we want a signed difference.  */
+
+#define DIFF_SECS_ROUTINE(sec, psec, punit)                     \
+  {                                                             \
+    long  sec_diff, psec_diff;                                  \
+    sec_diff = (long) end->sec - (long) start->sec;             \
+    psec_diff = (long) end->psec - (long) start->psec;          \
+    return (double) sec_diff + punit * (double) psec_diff;      \
+  }
+
+double
+timeval_diff_secs (const struct_timeval *end, const struct_timeval *start)
+{
+  DIFF_SECS_ROUTINE (tv_sec, tv_usec, 1e-6);
+}
+
+double
+rusage_diff_secs (const struct_rusage *end, const struct_rusage *start)
+{
+  DIFF_SECS_ROUTINE (ru_utime.tv_sec, ru_utime.tv_usec, 1e-6);
+}
+
+double
+timespec_diff_secs (const struct_timespec *end, const struct_timespec *start)
+{
+  DIFF_SECS_ROUTINE (tv_sec, tv_nsec, 1e-9);
+}
+
+/* This is for use after time_base_to_time, ie. for seconds and nanoseconds. */
+double
+timebasestruct_diff_secs (const timebasestruct_t *end,
+			  const timebasestruct_t *start)
+{
+  DIFF_SECS_ROUTINE (tb_high, tb_low, 1e-9);
+}
+
+
+double
+speed_endtime (void)
+{
+#define END_USE(name,value)                             \
+  do {                                                  \
+    if (speed_option_verbose >= 3)                      \
+      printf ("speed_endtime(): used %s\n", name);      \
+    result = value;                                     \
+    goto done;                                          \
+  } while (0)
+
+#define END_ENOUGH(name,value)                                          \
+  do {                                                                  \
+    if (speed_option_verbose >= 3)                                      \
+      printf ("speed_endtime(): %s gives enough precision\n", name);    \
+    result = value;                                                     \
+    goto done;                                                          \
+  } while (0)
+
+#define END_EXCEED(name,value)                                            \
+  do {                                                                    \
+    if (speed_option_verbose >= 3)                                        \
+      printf ("speed_endtime(): cycle counter limit exceeded, used %s\n", \
+	      name);                                                      \
+    result = value;                                                       \
+    goto done;                                                            \
+  } while (0)
+
+  unsigned          end_cycles[2];
+  stck_t            end_stck;
+  unsigned          end_mftb[2];
+  unsigned          end_sgi;
+  timebasestruct_t  end_rrt;
+  struct_timespec   end_cgt;
+  struct_timeval    end_gtod;
+  struct_rusage     end_grus;
+  struct_tms        end_times;
+  double            t_gtod, t_grus, t_times, t_cgt;
+  double            t_rrt, t_sgi, t_mftb, t_stck, t_cycles;
+  double            result;
+
+  /* Cycles sampled first for maximum accuracy.
+     "have_" values tested to let unused code go dead.  */
+
+  if (have_cycles && use_cycles)  speed_cyclecounter (end_cycles);
+  if (have_stck   && use_stck)    STCK (end_stck);
+  if (have_mftb   && use_mftb)    MFTB (end_mftb);
+  if (have_sgi    && use_sgi)     end_sgi = *sgi_addr;
+  if (have_rrt    && use_rrt)     read_real_time (&end_rrt, sizeof(end_rrt));
+  if (have_cgt    && use_cgt)     clock_gettime (CGT_ID, &end_cgt);
+  if (have_gtod   && use_gtod)    gettimeofday (&end_gtod, NULL);
+  if (have_grus   && use_grus)    getrusage (0, &end_grus);
+  if (have_times  && use_times)   times (&end_times);
+
+  result = -1.0;
+
+  if (speed_option_verbose >= 4)
+    {
+      printf ("speed_endtime():\n");
+      if (use_cycles)
+	printf ("   cycles  0x%X,0x%X -> 0x%X,0x%X\n",
+		start_cycles[1], start_cycles[0],
+		end_cycles[1], end_cycles[0]);
+
+      if (use_stck)
+	printf ("   stck  0x%lX -> 0x%lX\n", start_stck, end_stck);
+
+      if (use_mftb)
+	printf ("   mftb  0x%X,%08X -> 0x%X,%08X\n",
+		start_mftb[1], start_mftb[0],
+		end_mftb[1], end_mftb[0]);
+
+      if (use_sgi)
+	printf ("   sgi  0x%X -> 0x%X\n", start_sgi, end_sgi);
+
+      if (use_rrt)
+	printf ("   read_real_time  (%d)%u,%u -> (%d)%u,%u\n",
+		start_rrt.flag, start_rrt.tb_high, start_rrt.tb_low,
+		end_rrt.flag, end_rrt.tb_high, end_rrt.tb_low);
+
+      if (use_cgt)
+	printf ("   clock_gettime  %ld.%09ld -> %ld.%09ld\n",
+		start_cgt.tv_sec, start_cgt.tv_nsec,
+		end_cgt.tv_sec, end_cgt.tv_nsec);
+
+      if (use_gtod)
+	printf ("   gettimeofday  %ld.%06ld -> %ld.%06ld\n",
+		start_gtod.tv_sec, start_gtod.tv_usec,
+		end_gtod.tv_sec, end_gtod.tv_usec);
+
+      if (use_grus)
+	printf ("   getrusage  %ld.%06ld -> %ld.%06ld\n",
+		start_grus.ru_utime.tv_sec, start_grus.ru_utime.tv_usec,
+		end_grus.ru_utime.tv_sec, end_grus.ru_utime.tv_usec);
+
+      if (use_times)
+	printf ("   times  %ld -> %ld\n",
+		start_times.tms_utime, end_times.tms_utime);
+    }
+
+  if (use_rrt)
+    {
+      time_base_to_time (&start_rrt, sizeof(start_rrt));
+      time_base_to_time (&end_rrt, sizeof(end_rrt));
+      t_rrt = timebasestruct_diff_secs (&end_rrt, &start_rrt);
+      END_USE ("read_real_time()", t_rrt);
+    }
+
+  if (use_cgt)
+    {
+      t_cgt = timespec_diff_secs (&end_cgt, &start_cgt);
+      END_USE ("clock_gettime()", t_cgt);
+    }
+
+  if (use_grus)
+    {
+      t_grus = rusage_diff_secs (&end_grus, &start_grus);
+
+      /* Use getrusage() if the cycle counter limit would be exceeded, or if
+	 it provides enough accuracy already. */
+      if (use_cycles)
+	{
+	  if (t_grus >= speed_precision*grus_unittime)
+	    END_ENOUGH ("getrusage()", t_grus);
+	  if (t_grus >= cycles_limit)
+	    END_EXCEED ("getrusage()", t_grus);
+	}
+    }
+
+  if (use_times)
+    {
+      t_times = (end_times.tms_utime - start_times.tms_utime) * times_unittime;
+
+      /* Use times() if the cycle counter limit would be exceeded, or if
+	 it provides enough accuracy already. */
+      if (use_cycles)
+	{
+	  if (t_times >= speed_precision*times_unittime)
+	    END_ENOUGH ("times()", t_times);
+	  if (t_times >= cycles_limit)
+	    END_EXCEED ("times()", t_times);
+	}
+    }
+
+  if (use_gtod)
+    {
+      t_gtod = timeval_diff_secs (&end_gtod, &start_gtod);
+
+      /* Use gettimeofday() if it measured a value bigger than the cycle
+	 counter can handle.  */
+      if (use_cycles)
+	{
+	  if (t_gtod >= cycles_limit)
+	    END_EXCEED ("gettimeofday()", t_gtod);
+	}
+    }
+
+  if (use_mftb)
+    {
+      t_mftb = speed_mftb_diff (end_mftb, start_mftb) * mftb_unittime;
+      END_USE ("mftb", t_mftb);
+    }
+
+  if (use_stck)
+    {
+      t_stck = (end_stck - start_stck) * STCK_PERIOD;
+      END_USE ("stck", t_stck);
+    }
+
+  if (use_sgi)
+    {
+      t_sgi = (end_sgi - start_sgi) * sgi_unittime;
+      END_USE ("SGI hardware counter", t_sgi);
+    }
+
+  if (use_cycles)
+    {
+      t_cycles = speed_cyclecounter_diff (end_cycles, start_cycles)
+	* speed_cycletime;
+      END_USE ("cycle counter", t_cycles);
+    }
+
+  if (use_grus && getrusage_microseconds_p())
+    END_USE ("getrusage()", t_grus);
+
+  if (use_gtod && gettimeofday_microseconds_p())
+    END_USE ("gettimeofday()", t_gtod);
+
+  if (use_times)  END_USE ("times()",        t_times);
+  if (use_grus)   END_USE ("getrusage()",    t_grus);
+  if (use_gtod)   END_USE ("gettimeofday()", t_gtod);
+
+  fprintf (stderr, "speed_endtime(): oops, no time method available\n");
+  abort ();
+
+ done:
+  if (result < 0.0)
+    {
+      if (speed_option_verbose >= 2)
+	fprintf (stderr, "speed_endtime(): warning, treating negative time as zero: %.9f\n", result);
+      result = 0.0;
+    }
+  return result;
+}

diff --git a/third_party/gmp/tune/tune-gcd-p.c b/third_party/gmp/tune/tune-gcd-p.c
new file mode 100644
index 0000000..3b5a4a8
--- /dev/null
+++ b/third_party/gmp/tune/tune-gcd-p.c

@@ -0,0 +1,225 @@
+/* tune-gcd-p
+
+   Tune the choice for splitting p in divide-and-conquer gcd.
+
+Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define TUNE_GCD_P 1
+
+#include "../mpn/gcd.c"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "speed.h"
+
+/* Search for minimum over a range. FIXME: Implement golden-section /
+   fibonacci search*/
+static int
+search (double *minp, double (*f)(void *, int), void *ctx, int start, int end)
+{
+  int x[4];
+  double y[4];
+
+  int best_i;
+
+  x[0] = start;
+  x[3] = end;
+
+  y[0] = f(ctx, x[0]);
+  y[3] = f(ctx, x[3]);
+
+  for (;;)
+    {
+      int i;
+      int length = x[3] - x[0];
+
+      x[1] = x[0] + length/3;
+      x[2] = x[0] + 2*length/3;
+
+      y[1] = f(ctx, x[1]);
+      y[2] = f(ctx, x[2]);
+
+#if 0
+      printf("%d: %f, %d: %f, %d:, %f %d: %f\n",
+	     x[0], y[0], x[1], y[1], x[2], y[2], x[3], y[3]);
+#endif
+      for (best_i = 0, i = 1; i < 4; i++)
+	if (y[i] < y[best_i])
+	  best_i = i;
+
+      if (length <= 4)
+	break;
+
+      if (best_i >= 2)
+	{
+	  x[0] = x[1];
+	  y[0] = y[1];
+	}
+      else
+	{
+	  x[3] = x[2];
+	  y[3] = y[2];
+	}
+    }
+  *minp = y[best_i];
+  return x[best_i];
+}
+
+static int
+compare_double(const void *ap, const void *bp)
+{
+  double a = * (const double *) ap;
+  double b = * (const double *) bp;
+
+  if (a < b)
+    return -1;
+  else if (a > b)
+    return 1;
+  else
+    return 0;
+}
+
+static double
+median (double *v, size_t n)
+{
+  qsort(v, n, sizeof(*v), compare_double);
+
+  return v[n/2];
+}
+
+#define TIME(res, code) do {				\
+  double time_measurement[5];				\
+  unsigned time_i;					\
+							\
+  for (time_i = 0; time_i < 5; time_i++)		\
+    {							\
+      speed_starttime();				\
+      code;						\
+      time_measurement[time_i] = speed_endtime();	\
+    }							\
+  res = median(time_measurement, 5);			\
+} while (0)
+
+struct bench_data
+{
+  mp_size_t n;
+  mp_ptr ap;
+  mp_ptr bp;
+  mp_ptr up;
+  mp_ptr vp;
+  mp_ptr gp;
+};
+
+static double
+bench_gcd (void *ctx, int p)
+{
+  struct bench_data *data = (struct bench_data *) ctx;
+  double t;
+
+  p_table[data->n] = p;
+  TIME(t, {
+      MPN_COPY (data->up, data->ap, data->n);
+      MPN_COPY (data->vp, data->bp, data->n);
+      mpn_gcd (data->gp, data->up, data->n, data->vp, data->n);
+    });
+
+  return t;
+}
+
+int
+main(int argc, char **argv)
+{
+  gmp_randstate_t rands;  struct bench_data data;
+  mp_size_t n;
+
+  TMP_DECL;
+
+  /* Unbuffered so if output is redirected to a file it isn't lost if the
+     program is killed part way through.  */
+  setbuf (stdout, NULL);
+  setbuf (stderr, NULL);
+
+  gmp_randinit_default (rands);
+
+  TMP_MARK;
+
+  data.ap = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+  data.bp = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+  data.up = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+  data.vp = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+  data.gp = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+
+  mpn_random (data.ap, P_TABLE_SIZE);
+  mpn_random (data.bp, P_TABLE_SIZE);
+
+  memset (p_table, 0, sizeof(p_table));
+
+  for (n = 100; n < P_TABLE_SIZE; n++)
+    {
+      mp_size_t p;
+      mp_size_t best_p;
+      double best_time;
+      double lehmer_time;
+
+      if (data.ap[n-1] == 0)
+	data.ap[n-1] = 1;
+
+      if (data.bp[n-1] == 0)
+	data.bp[n-1] = 1;
+
+      data.n = n;
+
+      lehmer_time = bench_gcd (&data, 0);
+
+      best_p = search (&best_time, bench_gcd, &data, n/5, 4*n/5);
+      if (best_time > lehmer_time)
+	best_p = 0;
+
+      printf("%6zu %6zu %5.3g", n, best_p, (double) best_p / n);
+      if (best_p > 0)
+	{
+	  double speedup = 100 * (lehmer_time - best_time) / lehmer_time;
+	  printf(" %5.3g%%", speedup);
+	  if (speedup < 1.0)
+	    {
+	      printf(" (ignored)");
+	      best_p = 0;
+	    }
+	}
+      printf("\n");
+
+      p_table[n] = best_p;
+    }
+  TMP_FREE;
+  gmp_randclear(rands);
+  return 0;
+}

diff --git a/third_party/gmp/tune/tuneup.c b/third_party/gmp/tune/tuneup.c
new file mode 100644
index 0000000..86ba5ed
--- /dev/null
+++ b/third_party/gmp/tune/tuneup.c

@@ -0,0 +1,3070 @@
+/* Create tuned thresholds for various algorithms.
+
+Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/* Usage: tuneup [-t] [-t] [-p precision]
+
+   -t turns on some diagnostic traces, a second -t turns on more traces.
+
+   Notes:
+
+   The code here isn't a vision of loveliness, mainly because it's subject
+   to ongoing changes according to new things wanting to be tuned, and
+   practical requirements of systems tested.
+
+   Sometimes running the program twice produces slightly different results.
+   This is probably because there's so little separating algorithms near
+   their crossover, and on that basis it should make little or no difference
+   to the final speed of the relevant routines, but nothing has been done to
+   check that carefully.
+
+   Algorithm:
+
+   The thresholds are determined as follows.  A crossover may not be a
+   single size but rather a range where it oscillates between method A or
+   method B faster.  If the threshold is set making B used where A is faster
+   (or vice versa) that's bad.  Badness is the percentage time lost and
+   total badness is the sum of this over all sizes measured.  The threshold
+   is set to minimize total badness.
+
+   Suppose, as sizes increase, method B becomes faster than method A.  The
+   effect of the rule is that, as you look at increasing sizes, isolated
+   points where B is faster are ignored, but when it's consistently faster,
+   or faster on balance, then the threshold is set there.  The same result
+   is obtained thinking in the other direction of A becoming faster at
+   smaller sizes.
+
+   In practice the thresholds tend to be chosen to bring on the next
+   algorithm fairly quickly.
+
+   This rule is attractive because it's got a basis in reason and is fairly
+   easy to implement, but no work has been done to actually compare it in
+   absolute terms to other possibilities.
+
+   Implementation:
+
+   In a normal library build the thresholds are constants.  To tune them
+   selected objects are recompiled with the thresholds as global variables
+   instead.  #define TUNE_PROGRAM_BUILD does this, with help from code at
+   the end of gmp-impl.h, and rules in tune/Makefile.am.
+
+   MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n.  The
+   threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
+   level, but recurse into the basecase.
+
+   MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
+   Other routines in turn will make use of both of those.  Naturally the
+   dependants must be tuned first.
+
+   In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
+   just a threshold based on comparing two routines (mpn_divrem_1 and
+   mpn_divexact_1), and no further use of the value determined.
+
+   Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
+   just comparisons between certain routines on representative data.
+
+   Shortcuts are applied when native (assembler) versions of routines exist.
+   For instance a native mpn_sqr_basecase is assumed to be always faster
+   than mpn_mul_basecase, with no measuring.
+
+   No attempt is made to tune within assembler routines, for instance
+   DIVREM_1_NORM_THRESHOLD.  An assembler mpn_divrem_1 is expected to be
+   written and tuned all by hand.  Assembler routines that might have hard
+   limits are recompiled though, to make them accept a bigger range of sizes
+   than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
+
+   Limitations:
+
+   The FFTs aren't subject to the same badness rule as the other thresholds,
+   so each k is probably being brought on a touch early.  This isn't likely
+   to make a difference, and the simpler probing means fewer tests.
+
+*/
+
+#define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
+
+#include "config.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "tests.h"
+#include "speed.h"
+
+#if !HAVE_DECL_OPTARG
+extern char *optarg;
+extern int optind, opterr;
+#endif
+
+
+#define DEFAULT_MAX_SIZE   1000  /* limbs */
+
+#if WANT_FFT
+mp_size_t  option_fft_max_size = 50000;  /* limbs */
+#else
+mp_size_t  option_fft_max_size = 0;
+#endif
+int        option_trace = 0;
+int        option_fft_trace = 0;
+struct speed_params  s;
+
+struct dat_t {
+  mp_size_t  size;
+  double     d;
+} *dat = NULL;
+int  ndat = 0;
+int  allocdat = 0;
+
+/* This is not defined if mpn_sqr_basecase doesn't declare a limit.  In that
+   case use zero here, which for params.max_size means no limit.  */
+#ifndef TUNE_SQR_TOOM2_MAX
+#define TUNE_SQR_TOOM2_MAX  0
+#endif
+
+mp_size_t  mul_toom22_threshold         = MP_SIZE_T_MAX;
+mp_size_t  mul_toom33_threshold         = MUL_TOOM33_THRESHOLD_LIMIT;
+mp_size_t  mul_toom44_threshold         = MUL_TOOM44_THRESHOLD_LIMIT;
+mp_size_t  mul_toom6h_threshold         = MUL_TOOM6H_THRESHOLD_LIMIT;
+mp_size_t  mul_toom8h_threshold         = MUL_TOOM8H_THRESHOLD_LIMIT;
+mp_size_t  mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
+mp_size_t  mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
+mp_size_t  mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
+mp_size_t  mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
+mp_size_t  mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
+mp_size_t  mul_fft_threshold            = MP_SIZE_T_MAX;
+mp_size_t  mul_fft_modf_threshold       = MP_SIZE_T_MAX;
+mp_size_t  sqr_basecase_threshold       = MP_SIZE_T_MAX;
+mp_size_t  sqr_toom2_threshold
+  = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
+mp_size_t  sqr_toom3_threshold          = SQR_TOOM3_THRESHOLD_LIMIT;
+mp_size_t  sqr_toom4_threshold          = SQR_TOOM4_THRESHOLD_LIMIT;
+mp_size_t  sqr_toom6_threshold          = SQR_TOOM6_THRESHOLD_LIMIT;
+mp_size_t  sqr_toom8_threshold          = SQR_TOOM8_THRESHOLD_LIMIT;
+mp_size_t  sqr_fft_threshold            = MP_SIZE_T_MAX;
+mp_size_t  sqr_fft_modf_threshold       = MP_SIZE_T_MAX;
+mp_size_t  mullo_basecase_threshold     = MP_SIZE_T_MAX;
+mp_size_t  mullo_dc_threshold           = MP_SIZE_T_MAX;
+mp_size_t  mullo_mul_n_threshold        = MP_SIZE_T_MAX;
+mp_size_t  sqrlo_basecase_threshold     = MP_SIZE_T_MAX;
+mp_size_t  sqrlo_dc_threshold           = MP_SIZE_T_MAX;
+mp_size_t  sqrlo_sqr_threshold          = MP_SIZE_T_MAX;
+mp_size_t  mulmid_toom42_threshold      = MP_SIZE_T_MAX;
+mp_size_t  mulmod_bnm1_threshold        = MP_SIZE_T_MAX;
+mp_size_t  sqrmod_bnm1_threshold        = MP_SIZE_T_MAX;
+mp_size_t  div_qr_2_pi2_threshold       = MP_SIZE_T_MAX;
+mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
+mp_size_t  dc_divappr_q_threshold       = MP_SIZE_T_MAX;
+mp_size_t  mu_div_qr_threshold          = MP_SIZE_T_MAX;
+mp_size_t  mu_divappr_q_threshold       = MP_SIZE_T_MAX;
+mp_size_t  mupi_div_qr_threshold        = MP_SIZE_T_MAX;
+mp_size_t  mu_div_q_threshold           = MP_SIZE_T_MAX;
+mp_size_t  dc_bdiv_qr_threshold         = MP_SIZE_T_MAX;
+mp_size_t  dc_bdiv_q_threshold          = MP_SIZE_T_MAX;
+mp_size_t  mu_bdiv_qr_threshold         = MP_SIZE_T_MAX;
+mp_size_t  mu_bdiv_q_threshold          = MP_SIZE_T_MAX;
+mp_size_t  inv_mulmod_bnm1_threshold    = MP_SIZE_T_MAX;
+mp_size_t  inv_newton_threshold         = MP_SIZE_T_MAX;
+mp_size_t  inv_appr_threshold           = MP_SIZE_T_MAX;
+mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
+mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
+mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
+mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
+mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
+mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
+mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
+mp_size_t  hgcd_reduce_threshold        = MP_SIZE_T_MAX;
+mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
+mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
+int	   div_qr_1n_pi1_method		= 0;
+mp_size_t  div_qr_1_norm_threshold      = MP_SIZE_T_MAX;
+mp_size_t  div_qr_1_unnorm_threshold    = MP_SIZE_T_MAX;
+mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
+mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
+mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
+mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
+int	   mod_1_1p_method		= 0;
+mp_size_t  mod_1n_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
+mp_size_t  mod_1u_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
+mp_size_t  mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
+mp_size_t  mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
+mp_size_t  preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
+mp_size_t  divrem_2_threshold           = MP_SIZE_T_MAX;
+mp_size_t  get_str_dc_threshold         = MP_SIZE_T_MAX;
+mp_size_t  get_str_precompute_threshold = MP_SIZE_T_MAX;
+mp_size_t  set_str_dc_threshold         = MP_SIZE_T_MAX;
+mp_size_t  set_str_precompute_threshold = MP_SIZE_T_MAX;
+mp_size_t  fac_odd_threshold            = 0;
+mp_size_t  fac_dsc_threshold            = FAC_DSC_THRESHOLD_LIMIT;
+
+mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
+mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
+
+struct param_t {
+  const char        *name;
+  speed_function_t  function;
+  speed_function_t  function2;
+  double            step_factor;    /* how much to step relatively */
+  int               step;           /* how much to step absolutely */
+  double            function_fudge; /* multiplier for "function" speeds */
+  int               stop_since_change;
+  double            stop_factor;
+  mp_size_t         min_size;
+  int               min_is_always;
+  mp_size_t         max_size;
+  mp_size_t         check_size;
+  mp_size_t         size_extra;
+
+#define DATA_HIGH_LT_R  1
+#define DATA_HIGH_GE_R  2
+  int               data_high;
+
+  int               noprint;
+};
+
+
+/* These are normally undefined when false, which suits "#if" fine.
+   But give them zero values so they can be used in plain C "if"s.  */
+#ifndef UDIV_PREINV_ALWAYS
+#define UDIV_PREINV_ALWAYS 0
+#endif
+#ifndef HAVE_NATIVE_mpn_divexact_1
+#define HAVE_NATIVE_mpn_divexact_1 0
+#endif
+#ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1
+#define HAVE_NATIVE_mpn_div_qr_1n_pi1 0
+#endif
+#ifndef HAVE_NATIVE_mpn_divrem_1
+#define HAVE_NATIVE_mpn_divrem_1 0
+#endif
+#ifndef HAVE_NATIVE_mpn_divrem_2
+#define HAVE_NATIVE_mpn_divrem_2 0
+#endif
+#ifndef HAVE_NATIVE_mpn_mod_1
+#define HAVE_NATIVE_mpn_mod_1 0
+#endif
+#ifndef HAVE_NATIVE_mpn_mod_1_1p
+#define HAVE_NATIVE_mpn_mod_1_1p 0
+#endif
+#ifndef HAVE_NATIVE_mpn_modexact_1_odd
+#define HAVE_NATIVE_mpn_modexact_1_odd 0
+#endif
+#ifndef HAVE_NATIVE_mpn_preinv_divrem_1
+#define HAVE_NATIVE_mpn_preinv_divrem_1 0
+#endif
+#ifndef HAVE_NATIVE_mpn_preinv_mod_1
+#define HAVE_NATIVE_mpn_preinv_mod_1 0
+#endif
+#ifndef HAVE_NATIVE_mpn_sqr_basecase
+#define HAVE_NATIVE_mpn_sqr_basecase 0
+#endif
+
+
+#define MAX3(a,b,c)  MAX (MAX (a, b), c)
+
+mp_limb_t
+randlimb_norm (void)
+{
+  mp_limb_t  n;
+  mpn_random (&n, 1);
+  n |= GMP_NUMB_HIGHBIT;
+  return n;
+}
+
+#define GMP_NUMB_HALFMASK  ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
+
+mp_limb_t
+randlimb_half (void)
+{
+  mp_limb_t  n;
+  mpn_random (&n, 1);
+  n &= GMP_NUMB_HALFMASK;
+  n += (n==0);
+  return n;
+}
+
+
+/* Add an entry to the end of the dat[] array, reallocing to make it bigger
+   if necessary.  */
+void
+add_dat (mp_size_t size, double d)
+{
+#define ALLOCDAT_STEP  500
+
+  ASSERT_ALWAYS (ndat <= allocdat);
+
+  if (ndat == allocdat)
+    {
+      dat = (struct dat_t *) __gmp_allocate_or_reallocate
+        (dat, allocdat * sizeof(dat[0]),
+         (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
+      allocdat += ALLOCDAT_STEP;
+    }
+
+  dat[ndat].size = size;
+  dat[ndat].d = d;
+  ndat++;
+}
+
+
+/* Return the threshold size based on the data accumulated. */
+mp_size_t
+analyze_dat (int final)
+{
+  double  x, min_x;
+  int     j, min_j;
+
+  /* If the threshold is set at dat[0].size, any positive values are bad. */
+  x = 0.0;
+  for (j = 0; j < ndat; j++)
+    if (dat[j].d > 0.0)
+      x += dat[j].d;
+
+  if (option_trace >= 2 && final)
+    {
+      printf ("\n");
+      printf ("x is the sum of the badness from setting thresh at given size\n");
+      printf ("  (minimum x is sought)\n");
+      printf ("size=%ld  first x=%.4f\n", (long) dat[j].size, x);
+    }
+
+  min_x = x;
+  min_j = 0;
+
+
+  /* When stepping to the next dat[j].size, positive values are no longer
+     bad (so subtracted), negative values become bad (so add the absolute
+     value, meaning subtract). */
+  for (j = 0; j < ndat; x -= dat[j].d, j++)
+    {
+      if (option_trace >= 2 && final)
+        printf ("size=%ld  x=%.4f\n", (long) dat[j].size, x);
+
+      if (x < min_x)
+        {
+          min_x = x;
+          min_j = j;
+        }
+    }
+
+  return min_j;
+}
+
+
+/* Measuring for recompiled mpn/generic/div_qr_1.c,
+ * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */
+
+mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t);
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
+mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
+void mpz_fac_ui_tune (mpz_ptr, unsigned long);
+
+#if defined (__cplusplus)
+}
+#endif
+
+double
+speed_mpn_mod_1_tune (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
+}
+double
+speed_mpn_divrem_1_tune (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
+}
+double
+speed_mpz_fac_ui_tune (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
+}
+double
+speed_mpn_div_qr_1_tune (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune);
+}
+
+double
+tuneup_measure (speed_function_t fun,
+                const struct param_t *param,
+                struct speed_params *s)
+{
+  static struct param_t  dummy;
+  double   t;
+  TMP_DECL;
+
+  if (! param)
+    param = &dummy;
+
+  s->size += param->size_extra;
+
+  TMP_MARK;
+  SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
+  SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
+
+  mpn_random (s->xp, s->size);
+  mpn_random (s->yp, s->size);
+
+  switch (param->data_high) {
+  case DATA_HIGH_LT_R:
+    s->xp[s->size-1] %= s->r;
+    s->yp[s->size-1] %= s->r;
+    break;
+  case DATA_HIGH_GE_R:
+    s->xp[s->size-1] |= s->r;
+    s->yp[s->size-1] |= s->r;
+    break;
+  }
+
+  t = speed_measure (fun, s);
+
+  s->size -= param->size_extra;
+
+  TMP_FREE;
+  return t;
+}
+
+
+#define PRINT_WIDTH  31
+
+void
+print_define_start (const char *name)
+{
+  printf ("#define %-*s  ", PRINT_WIDTH, name);
+  if (option_trace)
+    printf ("...\n");
+}
+
+void
+print_define_end_remark (const char *name, mp_size_t value, const char *remark)
+{
+  if (option_trace)
+    printf ("#define %-*s  ", PRINT_WIDTH, name);
+
+  if (value == MP_SIZE_T_MAX)
+    printf ("MP_SIZE_T_MAX");
+  else
+    printf ("%5ld", (long) value);
+
+  if (remark != NULL)
+    printf ("  /* %s */", remark);
+  printf ("\n");
+  fflush (stdout);
+}
+
+void
+print_define_end (const char *name, mp_size_t value)
+{
+  const char  *remark;
+  if (value == MP_SIZE_T_MAX)
+    remark = "never";
+  else if (value == 0)
+    remark = "always";
+  else
+    remark = NULL;
+  print_define_end_remark (name, value, remark);
+}
+
+void
+print_define (const char *name, mp_size_t value)
+{
+  print_define_start (name);
+  print_define_end (name, value);
+}
+
+void
+print_define_remark (const char *name, mp_size_t value, const char *remark)
+{
+  print_define_start (name);
+  print_define_end_remark (name, value, remark);
+}
+
+void
+print_define_with_speedup (const char *name, mp_size_t value,
+			   mp_size_t runner_up, double speedup)
+{
+  char buf[100];
+  snprintf (buf, sizeof(buf), "%.2f%% faster than %ld",
+	    100.0 * (speedup - 1), runner_up);
+  print_define_remark (name, value, buf);
+}
+
+void
+one (mp_size_t *threshold, struct param_t *param)
+{
+  int  since_positive, since_thresh_change;
+  int  thresh_idx, new_thresh_idx;
+
+#define DEFAULT(x,n)  do { if (! (x))  (x) = (n); } while (0)
+
+  DEFAULT (param->function_fudge, 1.0);
+  DEFAULT (param->function2, param->function);
+  DEFAULT (param->step_factor, 0.01);  /* small steps by default */
+  DEFAULT (param->step, 1);            /* small steps by default */
+  DEFAULT (param->stop_since_change, 80);
+  DEFAULT (param->stop_factor, 1.2);
+  DEFAULT (param->min_size, 10);
+  DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
+
+  if (param->check_size != 0)
+    {
+      double   t1, t2;
+      s.size = param->check_size;
+
+      *threshold = s.size+1;
+      t1 = tuneup_measure (param->function, param, &s);
+
+      *threshold = s.size;
+      t2 = tuneup_measure (param->function2, param, &s);
+      if (t1 == -1.0 || t2 == -1.0)
+        {
+          printf ("Oops, can't run both functions at size %ld\n",
+                  (long) s.size);
+          abort ();
+        }
+      t1 *= param->function_fudge;
+
+      /* ask that t2 is at least 4% below t1 */
+      if (t1 < t2*1.04)
+        {
+          if (option_trace)
+            printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
+          *threshold = MP_SIZE_T_MAX;
+          if (! param->noprint)
+            print_define (param->name, *threshold);
+          return;
+        }
+
+      if (option_trace >= 2)
+        printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
+                (long) s.size, t1, t2);
+    }
+
+  if (! param->noprint || option_trace)
+    print_define_start (param->name);
+
+  ndat = 0;
+  since_positive = 0;
+  since_thresh_change = 0;
+  thresh_idx = 0;
+
+  if (option_trace >= 2)
+    {
+      printf ("             algorithm-A  algorithm-B   ratio  possible\n");
+      printf ("              (seconds)    (seconds)    diff    thresh\n");
+    }
+
+  for (s.size = param->min_size;
+       s.size < param->max_size;
+       s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
+    {
+      double   ti, tiplus1, d;
+
+      /*
+        FIXME: check minimum size requirements are met, possibly by just
+        checking for the -1 returns from the speed functions.
+      */
+
+      /* using method A at this size */
+      *threshold = s.size+1;
+      ti = tuneup_measure (param->function, param, &s);
+      if (ti == -1.0)
+        abort ();
+      ti *= param->function_fudge;
+
+      /* using method B at this size */
+      *threshold = s.size;
+      tiplus1 = tuneup_measure (param->function2, param, &s);
+      if (tiplus1 == -1.0)
+        abort ();
+
+      /* Calculate the fraction by which the one or the other routine is
+         slower.  */
+      if (tiplus1 >= ti)
+        d = (tiplus1 - ti) / tiplus1;  /* negative */
+      else
+        d = (tiplus1 - ti) / ti;       /* positive */
+
+      add_dat (s.size, d);
+
+      new_thresh_idx = analyze_dat (0);
+
+      if (option_trace >= 2)
+        printf ("size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
+                (long) s.size, ti, tiplus1, d,
+                ti > tiplus1 ? '#' : ' ',
+                (long) dat[new_thresh_idx].size);
+
+      /* Stop if the last time method i was faster was more than a
+         certain number of measurements ago.  */
+#define STOP_SINCE_POSITIVE  200
+      if (d >= 0)
+        since_positive = 0;
+      else
+        if (++since_positive > STOP_SINCE_POSITIVE)
+          {
+            if (option_trace >= 1)
+              printf ("stopped due to since_positive (%d)\n",
+                      STOP_SINCE_POSITIVE);
+            break;
+          }
+
+      /* Stop if method A has become slower by a certain factor. */
+      if (ti >= tiplus1 * param->stop_factor)
+        {
+          if (option_trace >= 1)
+            printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
+                    param->stop_factor);
+          break;
+        }
+
+      /* Stop if the threshold implied hasn't changed in a certain
+         number of measurements.  (It's this condition that usually
+         stops the loop.) */
+      if (thresh_idx != new_thresh_idx)
+        since_thresh_change = 0, thresh_idx = new_thresh_idx;
+      else
+        if (++since_thresh_change > param->stop_since_change)
+          {
+            if (option_trace >= 1)
+              printf ("stopped due to since_thresh_change (%d)\n",
+                      param->stop_since_change);
+            break;
+          }
+
+      /* Stop if the threshold implied is more than a certain number of
+         measurements ago.  */
+#define STOP_SINCE_AFTER   500
+      if (ndat - thresh_idx > STOP_SINCE_AFTER)
+        {
+          if (option_trace >= 1)
+            printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
+                    STOP_SINCE_AFTER);
+          break;
+        }
+
+      /* Stop when the size limit is reached before the end of the
+         crossover, but only show this as an error for >= the default max
+         size.  FIXME: Maybe should make it a param choice whether this is
+         an error.  */
+      if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
+        {
+          fprintf (stderr, "%s\n", param->name);
+          fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
+                   (long) dat[0].size, (long) dat[ndat-1].size, ndat);
+          fprintf (stderr, "    max size reached before end of crossover\n");
+          break;
+        }
+    }
+
+  if (option_trace >= 1)
+    printf ("sizes %ld to %ld total %d measurements\n",
+            (long) dat[0].size, (long) dat[ndat-1].size, ndat);
+
+  *threshold = dat[analyze_dat (1)].size;
+
+  if (param->min_is_always)
+    {
+      if (*threshold == param->min_size)
+        *threshold = 0;
+    }
+
+  if (! param->noprint || option_trace)
+    print_define_end (param->name, *threshold);
+}
+
+/* Time N different FUNCTIONS with the same parameters and size, to
+   select the fastest. Since *_METHOD defines start numbering from
+   one, if functions[i] is fastest, the value of the define is i+1.
+   Also output a comment with speedup compared to the next fastest
+   function. The NAME argument is used only for trace output.
+
+   Returns the index of the fastest function.
+*/
+int
+one_method (int n, speed_function_t *functions,
+	    const char *name, const char *define,
+	    const struct param_t *param)
+{
+  double *t;
+  int i;
+  int method;
+  int method_runner_up;
+
+  TMP_DECL;
+  TMP_MARK;
+  t = (double*) TMP_ALLOC (n * sizeof (*t));
+
+  for (i = 0; i < n; i++)
+    {
+      t[i] = tuneup_measure (functions[i], param, &s);
+      if (option_trace >= 1)
+	printf ("size=%ld, %s, method %d %.9f\n",
+		(long) s.size, name, i + 1, t[i]);
+      if (t[i] == -1.0)
+	{
+	  printf ("Oops, can't measure all %s methods\n", name);
+	  abort ();
+	}
+    }
+  method = 0;
+  for (i = 1; i < n; i++)
+    if (t[i] < t[method])
+      method = i;
+
+  method_runner_up = (method == 0);
+  for (i = 0; i < n; i++)
+    if (i != method && t[i] < t[method_runner_up])
+      method_runner_up = i;
+
+  print_define_with_speedup (define, method + 1, method_runner_up + 1,
+			     t[method_runner_up] / t[method]);
+
+  TMP_FREE;
+  return method;
+}
+
+
+/* Special probing for the fft thresholds.  The size restrictions on the
+   FFTs mean the graph of time vs size has a step effect.  See this for
+   example using
+
+       ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
+       gnuplot foo.gnuplot
+
+   The current approach is to compare routines at the midpoint of relevant
+   steps.  Arguably a more sophisticated system of threshold data is wanted
+   if this step effect remains. */
+
+struct fft_param_t {
+  const char        *table_name;
+  const char        *threshold_name;
+  const char        *modf_threshold_name;
+  mp_size_t         *p_threshold;
+  mp_size_t         *p_modf_threshold;
+  mp_size_t         first_size;
+  mp_size_t         max_size;
+  speed_function_t  function;
+  speed_function_t  mul_modf_function;
+  speed_function_t  mul_function;
+  mp_size_t         sqr;
+};
+
+
+/* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
+   N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
+   of 2^(k-1) bits. */
+
+mp_size_t
+fft_step_size (int k)
+{
+  mp_size_t  step;
+
+  step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
+  step *= (mp_size_t) 1 << k;
+
+  if (step <= 0)
+    {
+      printf ("Can't handle k=%d\n", k);
+      abort ();
+    }
+
+  return step;
+}
+
+mp_size_t
+fft_next_size (mp_size_t pl, int k)
+{
+  mp_size_t  m = fft_step_size (k);
+
+/*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
+
+  if (pl == 0 || (pl & (m-1)) != 0)
+    pl = (pl | (m-1)) + 1;
+
+/*    printf (" %ld\n", pl); */
+  return pl;
+}
+
+#define NMAX_DEFAULT 1000000
+#define MAX_REPS 25
+#define MIN_REPS 5
+
+static inline size_t
+mpn_mul_fft_lcm (size_t a, unsigned int k)
+{
+  unsigned int l = k;
+
+  while (a % 2 == 0 && k > 0)
+    {
+      a >>= 1;
+      k--;
+    }
+  return a << l;
+}
+
+mp_size_t
+fftfill (mp_size_t pl, int k, int sqr)
+{
+  mp_size_t maxLK;
+  mp_bitcnt_t N, Nprime, nprime, M;
+
+  N = pl * GMP_NUMB_BITS;
+  M = N >> k;
+
+  maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
+
+  Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
+  nprime = Nprime / GMP_NUMB_BITS;
+  if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
+    {
+      size_t K2;
+      for (;;)
+	{
+	  K2 = 1L << mpn_fft_best_k (nprime, sqr);
+	  if ((nprime & (K2 - 1)) == 0)
+	    break;
+	  nprime = (nprime + K2 - 1) & -K2;
+	  Nprime = nprime * GMP_LIMB_BITS;
+	}
+    }
+  ASSERT_ALWAYS (nprime < pl);
+
+  return Nprime;
+}
+
+static int
+compare_double (const void *ap, const void *bp)
+{
+  double a = * (const double *) ap;
+  double b = * (const double *) bp;
+
+  if (a < b)
+    return -1;
+  else if (a > b)
+    return 1;
+  else
+    return 0;
+}
+
+double
+median (double *times, int n)
+{
+  qsort (times, n, sizeof (double), compare_double);
+  return times[n/2];
+}
+
+#define FFT_CACHE_SIZE 25
+typedef struct fft_cache
+{
+  mp_size_t n;
+  double time;
+} fft_cache_t;
+
+fft_cache_t fft_cache[FFT_CACHE_SIZE];
+
+double
+cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
+		int n_measurements)
+{
+  int i;
+  double t, ttab[MAX_REPS];
+
+  if (fft_cache[k].n == n)
+    return fft_cache[k].time;
+
+  for (i = 0; i < n_measurements; i++)
+    {
+      speed_starttime ();
+      mpn_mul_fft (rp, n, ap, n, bp, n, k);
+      ttab[i] = speed_endtime ();
+    }
+
+  t = median (ttab, n_measurements);
+  fft_cache[k].n = n;
+  fft_cache[k].time = t;
+  return t;
+}
+
+#define INSERT_FFTTAB(idx, nval, kval)					\
+  do {									\
+    fft_tab[idx].n = nval;						\
+    fft_tab[idx].k = kval;						\
+    fft_tab[idx+1].n = (1 << 27) - 1;	/* sentinel, 27b wide field */	\
+    fft_tab[idx+1].k = (1 <<  5) - 1;					\
+  } while (0)
+
+int
+fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
+{
+  mp_size_t n, n1, prev_n1;
+  int k, best_k, last_best_k, kmax;
+  int eff, prev_eff;
+  double t0, t1;
+  int n_measurements;
+  mp_limb_t *ap, *bp, *rp;
+  mp_size_t alloc;
+  struct fft_table_nk *fft_tab;
+
+  fft_tab = mpn_fft_table3[p->sqr];
+
+  for (k = 0; k < FFT_CACHE_SIZE; k++)
+    fft_cache[k].n = 0;
+
+  if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
+    {
+      nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
+    }
+
+  if (print)
+    printf ("#define %s%*s", p->table_name, 38, "");
+
+  if (idx == 0)
+    {
+      INSERT_FFTTAB (0, nmin, initial_k);
+
+      if (print)
+	{
+	  printf ("\\\n  { ");
+	  printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
+	}
+
+      idx = 1;
+    }
+
+  ap = (mp_ptr) malloc (sizeof (mp_limb_t));
+  if (p->sqr)
+    bp = ap;
+  else
+    bp = (mp_ptr) malloc (sizeof (mp_limb_t));
+  rp = (mp_ptr) malloc (sizeof (mp_limb_t));
+  alloc = 1;
+
+  /* Round n to comply to initial k value */
+  n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
+
+  n_measurements = (18 - initial_k) | 1;
+  n_measurements = MAX (n_measurements, MIN_REPS);
+  n_measurements = MIN (n_measurements, MAX_REPS);
+
+  last_best_k = initial_k;
+  best_k = initial_k;
+
+  while (n < nmax)
+    {
+      int start_k, end_k;
+
+      /* Assume the current best k is best until we hit its next FFT step.  */
+      t0 = 99999;
+
+      prev_n1 = n + 1;
+
+      start_k = MAX (4, best_k - 4);
+      end_k = MIN (24, best_k + 4);
+      for (k = start_k; k <= end_k; k++)
+	{
+          n1 = mpn_fft_next_size (prev_n1, k);
+
+	  eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
+
+	  if (eff < 70)		/* avoid measuring too slow fft:s */
+	    continue;
+
+	  if (n1 > alloc)
+	    {
+	      alloc = n1;
+	      if (p->sqr)
+		{
+		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
+		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
+		  ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
+		  mpn_random (ap, alloc);
+		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
+		}
+	      else
+		{
+		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
+		  bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t));
+		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
+		  ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
+		  mpn_random (ap, alloc);
+		  bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t));
+		  mpn_random (bp, alloc);
+		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
+		}
+	    }
+
+	  t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
+
+	  if (t1 * n_measurements > 0.3)
+	    n_measurements -= 2;
+	  n_measurements = MAX (n_measurements, MIN_REPS);
+
+	  if (t1 < t0)
+	    {
+	      best_k = k;
+	      t0 = t1;
+	    }
+	}
+
+      n1 = mpn_fft_next_size (prev_n1, best_k);
+
+      if (last_best_k != best_k)
+	{
+	  ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
+
+	  if (idx >= FFT_TABLE3_SIZE)
+	    {
+	      printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
+	      abort ();
+	    }
+	  INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
+
+	  if (print)
+	    {
+	      printf (", ");
+	      if (idx % 4 == 0)
+		printf ("\\\n    ");
+	      printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
+	    }
+
+	  if (option_trace >= 2)
+	    {
+	      printf ("{%lu,%u}\n", prev_n1, best_k);
+	      fflush (stdout);
+	    }
+
+	  last_best_k = best_k;
+	  idx++;
+	}
+
+      for (;;)
+	{
+	  prev_n1 = n1;
+	  prev_eff = fftfill (prev_n1, best_k, p->sqr);
+	  n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
+	  eff = fftfill (n1, best_k, p->sqr);
+
+	  if (eff != prev_eff)
+	    break;
+	}
+
+      n = prev_n1;
+    }
+
+  kmax = sizeof (mp_size_t) * 4;	/* GMP_MP_SIZE_T_BITS / 2 */
+  kmax = MIN (kmax, 25-1);
+  for (k = last_best_k + 1; k <= kmax; k++)
+    {
+      if (idx >= FFT_TABLE3_SIZE)
+	{
+	  printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
+	  abort ();
+	}
+      INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
+
+      if (print)
+	{
+	  printf (", ");
+	  if (idx % 4 == 0)
+	    printf ("\\\n    ");
+	  printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
+	}
+
+      idx++;
+    }
+
+  if (print)
+    printf (" }\n");
+
+  free (ap);
+  if (! p->sqr)
+    free (bp);
+  free (rp);
+
+  return idx;
+}
+
+void
+fft (struct fft_param_t *p)
+{
+  mp_size_t  size;
+  int        k, idx, initial_k;
+
+  /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
+
+#if 1
+  {
+    /* Use plain one() mechanism, for some reasonable initial values of k.  The
+       advantage is that we don't depend on mpn_fft_table3, which can therefore
+       leave it completely uninitialized.  */
+
+    static struct param_t param;
+    mp_size_t thres, best_thres;
+    int best_k;
+    char buf[20];
+
+    best_thres = MP_SIZE_T_MAX;
+    best_k = -1;
+
+    for (k = 5; k <= 7; k++)
+      {
+	param.name = p->modf_threshold_name;
+	param.min_size = 100;
+	param.max_size = 2000;
+	param.function  = p->mul_function;
+	param.step_factor = 0.0;
+	param.step = 4;
+	param.function2 = p->mul_modf_function;
+	param.noprint = 1;
+	s.r = k;
+	one (&thres, &param);
+	if (thres < best_thres)
+	  {
+	    best_thres = thres;
+	    best_k = k;
+	  }
+      }
+
+    *(p->p_modf_threshold) = best_thres;
+    sprintf (buf, "k = %d", best_k);
+    print_define_remark (p->modf_threshold_name, best_thres, buf);
+    initial_k = best_k;
+  }
+#else
+  size = p->first_size;
+  for (;;)
+    {
+      double  tk, tm;
+
+      size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
+      k = mpn_fft_best_k (size, p->sqr);
+
+      if (size >= p->max_size)
+        break;
+
+      s.size = size + fft_step_size (k) / 2;
+      s.r = k;
+      tk = tuneup_measure (p->mul_modf_function, NULL, &s);
+      if (tk == -1.0)
+        abort ();
+
+      tm = tuneup_measure (p->mul_function, NULL, &s);
+      if (tm == -1.0)
+        abort ();
+
+      if (option_trace >= 2)
+        printf ("at %ld   size=%ld  k=%d  %.9f   size=%ld modf %.9f\n",
+                (long) size,
+                (long) size + fft_step_size (k) / 2, k, tk,
+                (long) s.size, tm);
+
+      if (tk < tm)
+        {
+	  *p->p_modf_threshold = s.size;
+	  print_define (p->modf_threshold_name, *p->p_modf_threshold);
+	  break;
+        }
+    }
+  initial_k = ?;
+#endif
+
+  /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
+
+  idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
+  printf ("#define %s_SIZE %d\n", p->table_name, idx);
+
+  /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
+
+  size = 2 * *p->p_modf_threshold;	/* OK? */
+  for (;;)
+    {
+      double  tk, tm;
+      mp_size_t mulmod_size, mul_size;;
+
+      if (size >= p->max_size)
+        break;
+
+      mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
+      mul_size = (size + mulmod_size) / 2;	/* middle of step */
+
+      s.size = mulmod_size;
+      tk = tuneup_measure (p->function, NULL, &s);
+      if (tk == -1.0)
+        abort ();
+
+      s.size = mul_size;
+      tm = tuneup_measure (p->mul_function, NULL, &s);
+      if (tm == -1.0)
+        abort ();
+
+      if (option_trace >= 2)
+        printf ("at %ld   size=%ld  %.9f   size=%ld mul %.9f\n",
+                (long) size,
+                (long) mulmod_size, tk,
+                (long) mul_size, tm);
+
+      size = mulmod_size;
+
+      if (tk < tm)
+        {
+	  *p->p_threshold = s.size;
+	  print_define (p->threshold_name, *p->p_threshold);
+	  break;
+        }
+    }
+}
+
+/* Compare mpn_mul_1 to whatever fast exact single-limb division we have.  This
+   is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch.
+   This is used in get_str and set_str.  */
+void
+relspeed_div_1_vs_mul_1 (void)
+{
+  const size_t max_opsize = 100;
+  mp_size_t n;
+  long j;
+  mp_limb_t rp[max_opsize];
+  mp_limb_t ap[max_opsize];
+  double multime, divtime;
+
+  mpn_random (ap, max_opsize);
+
+  multime = 0;
+  for (n = max_opsize; n > 1; n--)
+    {
+      mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
+      speed_starttime ();
+      for (j = speed_precision; j != 0 ; j--)
+	mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
+      multime += speed_endtime () / n;
+    }
+
+  divtime = 0;
+  for (n = max_opsize; n > 1; n--)
+    {
+      /* Make input divisible for good measure.  */
+      ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10);
+
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
+	  mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
+			    MP_BASES_BIG_BASE_BINVERTED_10,
+			    MP_BASES_BIG_BASE_CTZ_10);
+#else
+	  mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
+#endif
+      speed_starttime ();
+      for (j = speed_precision; j != 0 ; j--)
+	{
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
+	  mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
+			    MP_BASES_BIG_BASE_BINVERTED_10,
+			    MP_BASES_BIG_BASE_CTZ_10);
+#else
+	  mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
+#endif
+	}
+      divtime += speed_endtime () / n;
+    }
+
+  print_define ("DIV_1_VS_MUL_1_PERCENT", (int) (100 * divtime/multime));
+}
+
+
+/* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
+   giving wrong results.  */
+void
+tune_mul_n (void)
+{
+  static struct param_t  param;
+  mp_size_t next_toom_start;
+  int something_changed;
+
+  param.function = speed_mpn_mul_n;
+
+  param.name = "MUL_TOOM22_THRESHOLD";
+  param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
+  param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
+  one (&mul_toom22_threshold, &param);
+
+  param.noprint = 1;
+
+  /* Threshold sequence loop.  Disable functions that would be used in a very
+     narrow range, re-measuring things when that happens.  */
+  something_changed = 1;
+  while (something_changed)
+    {
+      something_changed = 0;
+
+	next_toom_start = mul_toom22_threshold;
+
+	if (mul_toom33_threshold != 0)
+	  {
+	    param.name = "MUL_TOOM33_THRESHOLD";
+	    param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
+	    param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
+	    one (&mul_toom33_threshold, &param);
+
+	    if (next_toom_start * 1.05 >= mul_toom33_threshold)
+	      {
+		mul_toom33_threshold = 0;
+		something_changed = 1;
+	      }
+	  }
+
+	next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
+
+	if (mul_toom44_threshold != 0)
+	  {
+	    param.name = "MUL_TOOM44_THRESHOLD";
+	    param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
+	    param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
+	    one (&mul_toom44_threshold, &param);
+
+	    if (next_toom_start * 1.05 >= mul_toom44_threshold)
+	      {
+		mul_toom44_threshold = 0;
+		something_changed = 1;
+	      }
+	  }
+
+	next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
+
+	if (mul_toom6h_threshold != 0)
+	  {
+	    param.name = "MUL_TOOM6H_THRESHOLD";
+	    param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
+	    param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
+	    one (&mul_toom6h_threshold, &param);
+
+	    if (next_toom_start * 1.05 >= mul_toom6h_threshold)
+	      {
+		mul_toom6h_threshold = 0;
+		something_changed = 1;
+	      }
+	  }
+
+	next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
+
+	if (mul_toom8h_threshold != 0)
+	  {
+	    param.name = "MUL_TOOM8H_THRESHOLD";
+	    param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
+	    param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
+	    one (&mul_toom8h_threshold, &param);
+
+	    if (next_toom_start * 1.05 >= mul_toom8h_threshold)
+	      {
+		mul_toom8h_threshold = 0;
+		something_changed = 1;
+	      }
+	  }
+    }
+
+    print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
+    print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
+    print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
+    print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
+
+  /* disabled until tuned */
+  MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
+}
+
+void
+tune_mul (void)
+{
+  static struct param_t  param;
+  mp_size_t thres;
+
+  param.noprint = 1;
+
+  param.function = speed_mpn_toom32_for_toom43_mul;
+  param.function2 = speed_mpn_toom43_for_toom32_mul;
+  param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
+  param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
+  one (&thres, &param);
+  mul_toom32_to_toom43_threshold = thres * 17 / 24;
+  print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
+
+  param.function = speed_mpn_toom32_for_toom53_mul;
+  param.function2 = speed_mpn_toom53_for_toom32_mul;
+  param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
+  param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
+  one (&thres, &param);
+  mul_toom32_to_toom53_threshold = thres * 19 / 30;
+  print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
+
+  param.function = speed_mpn_toom42_for_toom53_mul;
+  param.function2 = speed_mpn_toom53_for_toom42_mul;
+  param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
+  param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
+  one (&thres, &param);
+  mul_toom42_to_toom53_threshold = thres * 11 / 20;
+  print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
+
+  param.function = speed_mpn_toom42_mul;
+  param.function2 = speed_mpn_toom63_mul;
+  param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
+  param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
+  one (&thres, &param);
+  mul_toom42_to_toom63_threshold = thres / 2;
+  print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
+
+  /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
+  param.function = speed_mpn_toom43_for_toom54_mul;
+  param.function2 = speed_mpn_toom54_for_toom43_mul;
+  param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
+  param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
+  one (&thres, &param);
+  mul_toom43_to_toom54_threshold = thres * 5 / 6;
+  print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
+}
+
+
+void
+tune_mullo (void)
+{
+  static struct param_t  param;
+
+  param.function = speed_mpn_mullo_n;
+
+  param.name = "MULLO_BASECASE_THRESHOLD";
+  param.min_size = 2;
+  param.min_is_always = 1;
+  param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
+  param.stop_factor = 1.5;
+  param.noprint = 1;
+  one (&mullo_basecase_threshold, &param);
+
+  param.name = "MULLO_DC_THRESHOLD";
+  param.min_size = 8;
+  param.min_is_always = 0;
+  param.max_size = 1000;
+  one (&mullo_dc_threshold, &param);
+
+  if (mullo_basecase_threshold >= mullo_dc_threshold)
+    {
+      print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
+      print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
+    }
+  else
+    {
+      print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
+      print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
+    }
+
+  if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2)
+    {
+      param.name = "MULLO_MUL_N_THRESHOLD";
+      param.min_size = mullo_dc_threshold;
+      param.max_size = 2 * mul_fft_threshold;
+      param.noprint = 0;
+      param.step_factor = 0.03;
+      one (&mullo_mul_n_threshold, &param);
+    }
+  else
+    print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
+			 "without FFT use mullo forever");
+}
+
+void
+tune_sqrlo (void)
+{
+  static struct param_t  param;
+
+  param.function = speed_mpn_sqrlo;
+
+  param.name = "SQRLO_BASECASE_THRESHOLD";
+  param.min_size = 2;
+  param.min_is_always = 1;
+  param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1;
+  param.stop_factor = 1.5;
+  param.noprint = 1;
+  one (&sqrlo_basecase_threshold, &param);
+
+  param.name = "SQRLO_DC_THRESHOLD";
+  param.min_size = 8;
+  param.min_is_always = 0;
+  param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1;
+  one (&sqrlo_dc_threshold, &param);
+
+  if (sqrlo_basecase_threshold >= sqrlo_dc_threshold)
+    {
+      print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold);
+      print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase");
+    }
+  else
+    {
+      print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold);
+      print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold);
+    }
+
+  if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2)
+    {
+      param.name = "SQRLO_SQR_THRESHOLD";
+      param.min_size = sqrlo_dc_threshold;
+      param.max_size = 2 * sqr_fft_threshold;
+      param.noprint = 0;
+      param.step_factor = 0.03;
+      one (&sqrlo_sqr_threshold, &param);
+    }
+  else
+    print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX,
+			 "without FFT use sqrlo forever");
+}
+
+void
+tune_mulmid (void)
+{
+  static struct param_t  param;
+
+  param.name = "MULMID_TOOM42_THRESHOLD";
+  param.function = speed_mpn_mulmid_n;
+  param.min_size = 4;
+  param.max_size = 100;
+  one (&mulmid_toom42_threshold, &param);
+}
+
+void
+tune_mulmod_bnm1 (void)
+{
+  static struct param_t  param;
+
+  param.name = "MULMOD_BNM1_THRESHOLD";
+  param.function = speed_mpn_mulmod_bnm1;
+  param.min_size = 4;
+  param.max_size = 100;
+  one (&mulmod_bnm1_threshold, &param);
+}
+
+void
+tune_sqrmod_bnm1 (void)
+{
+  static struct param_t  param;
+
+  param.name = "SQRMOD_BNM1_THRESHOLD";
+  param.function = speed_mpn_sqrmod_bnm1;
+  param.min_size = 4;
+  param.max_size = 100;
+  one (&sqrmod_bnm1_threshold, &param);
+}
+
+
+/* Start the basecase from 3, since 1 is a special case, and if mul_basecase
+   is faster only at size==2 then we don't want to bother with extra code
+   just for that.  Start karatsuba from 4 same as MUL above.  */
+
+void
+tune_sqr (void)
+{
+  /* disabled until tuned */
+  SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
+
+  if (HAVE_NATIVE_mpn_sqr_basecase)
+    {
+      print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
+      sqr_basecase_threshold = 0;
+    }
+  else
+    {
+      static struct param_t  param;
+      param.name = "SQR_BASECASE_THRESHOLD";
+      param.function = speed_mpn_sqr;
+      param.min_size = 3;
+      param.min_is_always = 1;
+      param.max_size = TUNE_SQR_TOOM2_MAX;
+      param.noprint = 1;
+      one (&sqr_basecase_threshold, &param);
+    }
+
+  {
+    static struct param_t  param;
+    param.name = "SQR_TOOM2_THRESHOLD";
+    param.function = speed_mpn_sqr;
+    param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
+    param.max_size = TUNE_SQR_TOOM2_MAX;
+    param.noprint = 1;
+    one (&sqr_toom2_threshold, &param);
+
+    if (! HAVE_NATIVE_mpn_sqr_basecase
+        && sqr_toom2_threshold < sqr_basecase_threshold)
+      {
+        /* Karatsuba becomes faster than mul_basecase before
+           sqr_basecase does.  Arrange for the expression
+           "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
+           selects mpn_sqr_basecase in mpn_sqr to be false, by setting
+           SQR_TOOM2_THRESHOLD to zero, making
+           SQR_BASECASE_THRESHOLD the toom2 threshold.  */
+
+        sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
+        SQR_TOOM2_THRESHOLD = 0;
+
+        print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
+                             "toom2");
+        print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
+                             "never sqr_basecase");
+      }
+    else
+      {
+        if (! HAVE_NATIVE_mpn_sqr_basecase)
+          print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
+        print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
+      }
+  }
+
+  {
+    static struct param_t  param;
+    mp_size_t next_toom_start;
+    int something_changed;
+
+    param.function = speed_mpn_sqr;
+    param.noprint = 1;
+
+  /* Threshold sequence loop.  Disable functions that would be used in a very
+     narrow range, re-measuring things when that happens.  */
+    something_changed = 1;
+    while (something_changed)
+      {
+	something_changed = 0;
+
+	next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
+
+	sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
+	param.name = "SQR_TOOM3_THRESHOLD";
+	param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
+	param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
+	one (&sqr_toom3_threshold, &param);
+
+	next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
+
+	if (sqr_toom4_threshold != 0)
+	  {
+	    param.name = "SQR_TOOM4_THRESHOLD";
+	    sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
+	    param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
+	    param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
+	    one (&sqr_toom4_threshold, &param);
+
+	    if (next_toom_start * 1.05 >= sqr_toom4_threshold)
+	      {
+		sqr_toom4_threshold = 0;
+		something_changed = 1;
+	      }
+	  }
+
+	next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
+
+	if (sqr_toom6_threshold != 0)
+	  {
+	    param.name = "SQR_TOOM6_THRESHOLD";
+	    sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
+	    param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
+	    param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
+	    one (&sqr_toom6_threshold, &param);
+
+	    if (next_toom_start * 1.05 >= sqr_toom6_threshold)
+	      {
+		sqr_toom6_threshold = 0;
+		something_changed = 1;
+	      }
+	  }
+
+	next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
+
+	if (sqr_toom8_threshold != 0)
+	  {
+	    param.name = "SQR_TOOM8_THRESHOLD";
+	    sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
+	    param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
+	    param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
+	    one (&sqr_toom8_threshold, &param);
+
+	    if (next_toom_start * 1.05 >= sqr_toom8_threshold)
+	      {
+		sqr_toom8_threshold = 0;
+		something_changed = 1;
+	      }
+	  }
+      }
+
+    print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
+    print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
+    print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
+    print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
+  }
+}
+
+
+void
+tune_dc_div (void)
+{
+  s.r = 0;		/* clear to make speed function do 2n/n */
+  {
+    static struct param_t  param;
+    param.name = "DC_DIV_QR_THRESHOLD";
+    param.function = speed_mpn_sbpi1_div_qr;
+    param.function2 = speed_mpn_dcpi1_div_qr;
+    param.min_size = 6;
+    one (&dc_div_qr_threshold, &param);
+  }
+  {
+    static struct param_t  param;
+    param.name = "DC_DIVAPPR_Q_THRESHOLD";
+    param.function = speed_mpn_sbpi1_divappr_q;
+    param.function2 = speed_mpn_dcpi1_divappr_q;
+    param.min_size = 6;
+    one (&dc_divappr_q_threshold, &param);
+  }
+}
+
+static double
+speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
+{
+  if (s->size < DC_DIV_QR_THRESHOLD)
+    return speed_mpn_sbpi1_div_qr (s);
+  else
+    return speed_mpn_dcpi1_div_qr (s);
+}
+
+void
+tune_mu_div (void)
+{
+  s.r = 0;		/* clear to make speed function do 2n/n */
+  {
+    static struct param_t  param;
+    param.name = "MU_DIV_QR_THRESHOLD";
+    param.function = speed_mpn_dcpi1_div_qr;
+    param.function2 = speed_mpn_mu_div_qr;
+    param.min_size = mul_toom22_threshold;
+    param.max_size = 5000;
+    param.step_factor = 0.02;
+    one (&mu_div_qr_threshold, &param);
+  }
+  {
+    static struct param_t  param;
+    param.name = "MU_DIVAPPR_Q_THRESHOLD";
+    param.function = speed_mpn_dcpi1_divappr_q;
+    param.function2 = speed_mpn_mu_divappr_q;
+    param.min_size = mul_toom22_threshold;
+    param.max_size = 5000;
+    param.step_factor = 0.02;
+    one (&mu_divappr_q_threshold, &param);
+  }
+  {
+    static struct param_t  param;
+    param.name = "MUPI_DIV_QR_THRESHOLD";
+    param.function = speed_mpn_sbordcpi1_div_qr;
+    param.function2 = speed_mpn_mupi_div_qr;
+    param.min_size = 6;
+    param.min_is_always = 1;
+    param.max_size = 1000;
+    param.step_factor = 0.02;
+    one (&mupi_div_qr_threshold, &param);
+  }
+}
+
+void
+tune_dc_bdiv (void)
+{
+  s.r = 0;		/* clear to make speed function do 2n/n*/
+  {
+    static struct param_t  param;
+    param.name = "DC_BDIV_QR_THRESHOLD";
+    param.function = speed_mpn_sbpi1_bdiv_qr;
+    param.function2 = speed_mpn_dcpi1_bdiv_qr;
+    param.min_size = 4;
+    one (&dc_bdiv_qr_threshold, &param);
+  }
+  {
+    static struct param_t  param;
+    param.name = "DC_BDIV_Q_THRESHOLD";
+    param.function = speed_mpn_sbpi1_bdiv_q;
+    param.function2 = speed_mpn_dcpi1_bdiv_q;
+    param.min_size = 4;
+    one (&dc_bdiv_q_threshold, &param);
+  }
+}
+
+void
+tune_mu_bdiv (void)
+{
+  s.r = 0;		/* clear to make speed function do 2n/n*/
+  {
+    static struct param_t  param;
+    param.name = "MU_BDIV_QR_THRESHOLD";
+    param.function = speed_mpn_dcpi1_bdiv_qr;
+    param.function2 = speed_mpn_mu_bdiv_qr;
+    param.min_size = dc_bdiv_qr_threshold;
+    param.max_size = 5000;
+    param.step_factor = 0.02;
+    one (&mu_bdiv_qr_threshold, &param);
+  }
+  {
+    static struct param_t  param;
+    param.name = "MU_BDIV_Q_THRESHOLD";
+    param.function = speed_mpn_dcpi1_bdiv_q;
+    param.function2 = speed_mpn_mu_bdiv_q;
+    param.min_size = dc_bdiv_q_threshold;
+    param.max_size = 5000;
+    param.step_factor = 0.02;
+    one (&mu_bdiv_q_threshold, &param);
+  }
+}
+
+void
+tune_invertappr (void)
+{
+  static struct param_t  param;
+
+  param.function = speed_mpn_ni_invertappr;
+  param.name = "INV_MULMOD_BNM1_THRESHOLD";
+  param.min_size = 5;
+  one (&inv_mulmod_bnm1_threshold, &param);
+
+  param.function = speed_mpn_invertappr;
+  param.name = "INV_NEWTON_THRESHOLD";
+  param.min_size = 5;
+  one (&inv_newton_threshold, &param);
+}
+
+void
+tune_invert (void)
+{
+  static struct param_t  param;
+
+  param.function = speed_mpn_invert;
+  param.name = "INV_APPR_THRESHOLD";
+  param.min_size = 5;
+  one (&inv_appr_threshold, &param);
+}
+
+void
+tune_binvert (void)
+{
+  static struct param_t  param;
+
+  param.function = speed_mpn_binvert;
+  param.name = "BINV_NEWTON_THRESHOLD";
+  param.min_size = 8;		/* pointless with smaller operands */
+  one (&binv_newton_threshold, &param);
+}
+
+void
+tune_redc (void)
+{
+#define TUNE_REDC_2_MAX 100
+#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
+#define WANT_REDC_2 1
+#endif
+
+#if WANT_REDC_2
+  {
+    static struct param_t  param;
+    param.name = "REDC_1_TO_REDC_2_THRESHOLD";
+    param.function = speed_mpn_redc_1;
+    param.function2 = speed_mpn_redc_2;
+    param.min_size = 1;
+    param.min_is_always = 1;
+    param.max_size = TUNE_REDC_2_MAX;
+    param.noprint = 1;
+    param.stop_factor = 1.5;
+    one (&redc_1_to_redc_2_threshold, &param);
+  }
+  {
+    static struct param_t  param;
+    param.name = "REDC_2_TO_REDC_N_THRESHOLD";
+    param.function = speed_mpn_redc_2;
+    param.function2 = speed_mpn_redc_n;
+    param.min_size = 16;
+    param.noprint = 1;
+    one (&redc_2_to_redc_n_threshold, &param);
+  }
+  if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
+    {
+      redc_2_to_redc_n_threshold = 0;	/* disable redc_2 */
+
+      /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
+	 REDC_1_TO_REDC_2_THRESHOLD.  */
+      {
+	static struct param_t  param;
+	param.name = "REDC_1_TO_REDC_2_THRESHOLD";
+	param.function = speed_mpn_redc_1;
+	param.function2 = speed_mpn_redc_n;
+	param.min_size = 16;
+	param.noprint = 1;
+	one (&redc_1_to_redc_2_threshold, &param);
+      }
+    }
+  print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
+  print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
+#else
+  {
+    static struct param_t  param;
+    param.name = "REDC_1_TO_REDC_N_THRESHOLD";
+    param.function = speed_mpn_redc_1;
+    param.function2 = speed_mpn_redc_n;
+    param.min_size = 16;
+    one (&redc_1_to_redc_n_threshold, &param);
+  }
+#endif
+}
+
+void
+tune_matrix22_mul (void)
+{
+  static struct param_t  param;
+  param.name = "MATRIX22_STRASSEN_THRESHOLD";
+  param.function = speed_mpn_matrix22_mul;
+  param.min_size = 2;
+  one (&matrix22_strassen_threshold, &param);
+}
+
+void
+tune_hgcd2 (void)
+{
+  static struct param_t  param;
+  hgcd2_func_t *f[5] =
+    { mpn_hgcd2_1,
+      mpn_hgcd2_2,
+      mpn_hgcd2_3,
+      mpn_hgcd2_4,
+      mpn_hgcd2_5 };
+  speed_function_t speed_f[5] =
+    { speed_mpn_hgcd2_1,
+      speed_mpn_hgcd2_2,
+      speed_mpn_hgcd2_3,
+      speed_mpn_hgcd2_4,
+      speed_mpn_hgcd2_5 };
+  int best;
+
+  s.size = 1;
+  best = one_method (5, speed_f, "mpn_hgcd2", "HGCD2_DIV1_METHOD", &param);
+
+  /* Use selected function when tuning hgcd and gcd */
+  hgcd2_func = f[best];
+}
+
+void
+tune_hgcd (void)
+{
+  static struct param_t  param;
+  param.name = "HGCD_THRESHOLD";
+  param.function = speed_mpn_hgcd;
+  /* We seem to get strange results for small sizes */
+  param.min_size = 30;
+  one (&hgcd_threshold, &param);
+}
+
+void
+tune_hgcd_appr (void)
+{
+  static struct param_t  param;
+  param.name = "HGCD_APPR_THRESHOLD";
+  param.function = speed_mpn_hgcd_appr;
+  /* We seem to get strange results for small sizes */
+  param.min_size = 50;
+  param.stop_since_change = 150;
+  one (&hgcd_appr_threshold, &param);
+}
+
+void
+tune_hgcd_reduce (void)
+{
+  static struct param_t  param;
+  param.name = "HGCD_REDUCE_THRESHOLD";
+  param.function = speed_mpn_hgcd_reduce;
+  param.min_size = 30;
+  param.max_size = 7000;
+  param.step_factor = 0.04;
+  one (&hgcd_reduce_threshold, &param);
+}
+
+void
+tune_gcd_dc (void)
+{
+  static struct param_t  param;
+  param.name = "GCD_DC_THRESHOLD";
+  param.function = speed_mpn_gcd;
+  param.min_size = hgcd_threshold;
+  param.max_size = 3000;
+  param.step_factor = 0.02;
+  one (&gcd_dc_threshold, &param);
+}
+
+void
+tune_gcdext_dc (void)
+{
+  static struct param_t  param;
+  param.name = "GCDEXT_DC_THRESHOLD";
+  param.function = speed_mpn_gcdext;
+  param.min_size = hgcd_threshold;
+  param.max_size = 3000;
+  param.step_factor = 0.02;
+  one (&gcdext_dc_threshold, &param);
+}
+
+/* In tune_powm_sec we compute the table used by the win_size function.  The
+   cutoff points are in exponent bits, disregarding other operand sizes.  It is
+   not possible to use the one framework since it currently uses a granularity
+   of full limbs.
+*/
+
+/* This win_size replaces the variant in the powm code, allowing us to
+   control k in the k-ary algorithms.  */
+int winsize;
+int
+win_size (mp_bitcnt_t eb)
+{
+  return winsize;
+}
+
+void
+tune_powm_sec (void)
+{
+  mp_size_t n;
+  int k, i;
+  mp_size_t itch;
+  mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
+  const int n_max = 3000 / GMP_NUMB_BITS;
+  const int n_measurements = 5;
+  mp_ptr rp, bp, ep, mp, tp;
+  double ttab[n_measurements], tk, tkp1;
+  TMP_DECL;
+  TMP_MARK;
+
+  possible_nbits_cutoff = 0;
+
+  k = 1;
+
+  winsize = 10;			/* the itch function needs this */
+  itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max);
+
+  rp = TMP_ALLOC_LIMBS (n_max);
+  bp = TMP_ALLOC_LIMBS (n_max);
+  ep = TMP_ALLOC_LIMBS (n_max);
+  mp = TMP_ALLOC_LIMBS (n_max);
+  tp = TMP_ALLOC_LIMBS (itch);
+
+  mpn_random (bp, n_max);
+  mpn_random (mp, n_max);
+  mp[0] |= 1;
+
+/* How about taking the M operand size into account?
+
+   An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
+   B = O(M)).
+
+   Using k-ary and no sliding window, the precomputation will need time
+   O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
+   O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
+
+   An operation R=powm_sec(B,E,N) will take time like powm.
+
+   Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
+   main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
+   O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
+   table reads, respectively.  */
+
+  printf ("#define POWM_SEC_TABLE  ");
+
+  /* For nbits == 1, we should always use k == 1, so no need to tune
+     that. Starting with nbits == 2 also ensure that nbits always is
+     larger than the windowsize k+1. */
+  for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; )
+    {
+      n = (nbits - 1) / GMP_NUMB_BITS + 1;
+
+      /* Generate E such that sliding-window for k and k+1 works equally
+	 well/poorly (but sliding is not used in powm_sec, of course). */
+      for (i = 0; i < n; i++)
+	ep[i] = ~CNST_LIMB(0);
+
+      winsize = k;
+      for (i = 0; i < n_measurements; i++)
+	{
+	  speed_starttime ();
+	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
+	  ttab[i] = speed_endtime ();
+	}
+      tk = median (ttab, n_measurements);
+
+      winsize = k + 1;
+      speed_starttime ();
+      for (i = 0; i < n_measurements; i++)
+	{
+	  speed_starttime ();
+	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
+	  ttab[i] = speed_endtime ();
+	}
+      tkp1 = median (ttab, n_measurements);
+/*
+      printf ("testing: %ld, %d", nbits, k, ep[n-1]);
+      printf ("   %10.5f  %10.5f\n", tk, tkp1);
+*/
+      if (tkp1 < tk)
+	{
+	  if (possible_nbits_cutoff)
+	    {
+	      /* Two consecutive sizes indicate k increase, obey.  */
+
+	      /* Must always have x[k] >= k */
+	      ASSERT_ALWAYS (possible_nbits_cutoff >= k);
+
+	      if (k > 1)
+		printf (",");
+	      printf ("%ld", (long) possible_nbits_cutoff);
+	      k++;
+	      possible_nbits_cutoff = 0;
+	    }
+	  else
+	    {
+	      /* One measurement indicate k increase, save nbits for further
+		 consideration.  */
+	      /* The new larger k gets used for sizes > the cutoff
+		 value, hence the cutoff should be one less than the
+		 smallest size where it gives a speedup. */
+	      possible_nbits_cutoff = nbits - 1;
+	    }
+	}
+      else
+	possible_nbits_cutoff = 0;
+
+      nbits_next = nbits * 65 / 64;
+      nbits = nbits_next + (nbits_next == nbits);
+    }
+  printf ("\n");
+  TMP_FREE;
+}
+
+
+/* size_extra==1 reflects the fact that with high<divisor one division is
+   always skipped.  Forcing high<divisor while testing ensures consistency
+   while stepping through sizes, ie. that size-1 divides will be done each
+   time.
+
+   min_size==2 and min_is_always are used so that if plain division is only
+   better at size==1 then don't bother including that code just for that
+   case, instead go with preinv always and get a size saving.  */
+
+#define DIV_1_PARAMS                    \
+  param.check_size = 256;               \
+  param.min_size = 2;                   \
+  param.min_is_always = 1;              \
+  param.data_high = DATA_HIGH_LT_R;     \
+  param.size_extra = 1;                 \
+  param.stop_factor = 2.0;
+
+
+double (*tuned_speed_mpn_divrem_1) (struct speed_params *);
+
+void
+tune_divrem_1 (void)
+{
+  /* plain version by default */
+  tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
+
+  /* No support for tuning native assembler code, do that by hand and put
+     the results in the .asm file, there's no need for such thresholds to
+     appear in gmp-mparam.h.  */
+  if (HAVE_NATIVE_mpn_divrem_1)
+    return;
+
+  if (GMP_NAIL_BITS != 0)
+    {
+      print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
+                           "no preinv with nails");
+      print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
+                           "no preinv with nails");
+      return;
+    }
+
+  if (UDIV_PREINV_ALWAYS)
+    {
+      print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
+      print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
+      return;
+    }
+
+  tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
+
+  /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
+     a bit out for the fractional part, but that's too bad, the integer part
+     is more important. */
+  {
+    static struct param_t  param;
+    param.name = "DIVREM_1_NORM_THRESHOLD";
+    DIV_1_PARAMS;
+    s.r = randlimb_norm ();
+    param.function = speed_mpn_divrem_1_tune;
+    one (&divrem_1_norm_threshold, &param);
+  }
+  {
+    static struct param_t  param;
+    param.name = "DIVREM_1_UNNORM_THRESHOLD";
+    DIV_1_PARAMS;
+    s.r = randlimb_half ();
+    param.function = speed_mpn_divrem_1_tune;
+    one (&divrem_1_unnorm_threshold, &param);
+  }
+}
+
+void
+tune_div_qr_1 (void)
+{
+  if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
+    {
+      static struct param_t  param;
+      speed_function_t f[2] =
+	{
+	 speed_mpn_div_qr_1n_pi1_1,
+	 speed_mpn_div_qr_1n_pi1_2,
+	};
+
+      s.size = 10;
+      s.r = randlimb_norm ();
+
+      one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", &param);
+    }
+
+  {
+    static struct param_t  param;
+    param.name = "DIV_QR_1_NORM_THRESHOLD";
+    DIV_1_PARAMS;
+    param.min_size = 1;
+    param.min_is_always = 0;
+    s.r = randlimb_norm ();
+    param.function = speed_mpn_div_qr_1_tune;
+    one (&div_qr_1_norm_threshold, &param);
+  }
+  {
+    static struct param_t  param;
+    param.name = "DIV_QR_1_UNNORM_THRESHOLD";
+    DIV_1_PARAMS;
+    param.min_size = 1;
+    param.min_is_always = 0;
+    s.r = randlimb_half();
+    param.function = speed_mpn_div_qr_1_tune;
+    one (&div_qr_1_unnorm_threshold, &param);
+  }
+}
+
+
+void
+tune_mod_1 (void)
+{
+  /* No support for tuning native assembler code, do that by hand and put
+     the results in the .asm file, there's no need for such thresholds to
+     appear in gmp-mparam.h.  */
+  if (HAVE_NATIVE_mpn_mod_1)
+    return;
+
+  if (GMP_NAIL_BITS != 0)
+    {
+      print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
+                           "no preinv with nails");
+      print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
+                           "no preinv with nails");
+      return;
+    }
+
+  if (!HAVE_NATIVE_mpn_mod_1_1p)
+    {
+      static struct param_t  param;
+      speed_function_t f[2] =
+	{
+	 speed_mpn_mod_1_1_1,
+	 speed_mpn_mod_1_1_2,
+	};
+
+      s.size = 10;
+      s.r = randlimb_half ();
+      one_method (2, f, "mpn_mod_1_1", "MOD_1_1P_METHOD", &param);
+    }
+
+  if (UDIV_PREINV_ALWAYS)
+    {
+      print_define ("MOD_1_NORM_THRESHOLD", 0L);
+      print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
+    }
+  else
+    {
+      {
+	static struct param_t  param;
+	param.name = "MOD_1_NORM_THRESHOLD";
+	DIV_1_PARAMS;
+	s.r = randlimb_norm ();
+	param.function = speed_mpn_mod_1_tune;
+	one (&mod_1_norm_threshold, &param);
+      }
+      {
+	static struct param_t  param;
+	param.name = "MOD_1_UNNORM_THRESHOLD";
+	DIV_1_PARAMS;
+	s.r = randlimb_half ();
+	param.function = speed_mpn_mod_1_tune;
+	one (&mod_1_unnorm_threshold, &param);
+      }
+    }
+  {
+    static struct param_t  param;
+
+    param.check_size = 256;
+
+    s.r = randlimb_norm ();
+    param.function = speed_mpn_mod_1_tune;
+
+    param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
+    param.min_size = 2;
+    one (&mod_1n_to_mod_1_1_threshold, &param);
+  }
+
+  {
+    static struct param_t  param;
+
+    param.check_size = 256;
+    s.r = randlimb_half ();
+    param.noprint = 1;
+
+    param.function = speed_mpn_mod_1_1;
+    param.function2 = speed_mpn_mod_1_2;
+    param.min_is_always = 1;
+    param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
+    param.min_size = 2;
+    one (&mod_1_1_to_mod_1_2_threshold, &param);
+
+    param.function = speed_mpn_mod_1_2;
+    param.function2 = speed_mpn_mod_1_4;
+    param.min_is_always = 1;
+    param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
+    param.min_size = 1;
+    one (&mod_1_2_to_mod_1_4_threshold, &param);
+
+    if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
+      {
+	/* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
+	mod_1_2_to_mod_1_4_threshold = 0;
+
+	param.function = speed_mpn_mod_1_1;
+	param.function2 = speed_mpn_mod_1_4;
+	param.min_is_always = 1;
+	param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
+	param.min_size = 2;
+	one (&mod_1_1_to_mod_1_2_threshold, &param);
+      }
+
+    param.function = speed_mpn_mod_1_tune;
+    param.function2 = NULL;
+    param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
+    param.min_size = 2;
+    param.min_is_always = 0;
+    one (&mod_1u_to_mod_1_1_threshold, &param);
+
+    if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
+      mod_1_1_to_mod_1_2_threshold = 0;
+    if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
+      mod_1_2_to_mod_1_4_threshold = 0;
+
+    print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
+    print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
+			 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
+    print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
+			 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
+  }
+
+  {
+    static struct param_t  param;
+
+    param.check_size = 256;
+
+    param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
+    s.r = randlimb_norm ();
+    param.function = speed_mpn_preinv_mod_1;
+    param.function2 = speed_mpn_mod_1_tune;
+    param.min_size = 1;
+    one (&preinv_mod_1_to_mod_1_threshold, &param);
+  }
+}
+
+
+/* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
+   imply that udiv_qrnnd_preinv is worth using, but it seems most
+   straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
+   directly.  */
+
+void
+tune_preinv_divrem_1 (void)
+{
+  static struct param_t  param;
+  speed_function_t  divrem_1;
+  const char        *divrem_1_name;
+  double            t1, t2;
+
+  if (GMP_NAIL_BITS != 0)
+    {
+      print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
+      return;
+    }
+
+  /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
+     it's faster than mpn_divrem_1.  */
+  if (HAVE_NATIVE_mpn_preinv_divrem_1)
+    {
+      print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
+      return;
+    }
+
+  /* If udiv_qrnnd_preinv is the only division method then of course
+     mpn_preinv_divrem_1 should be used.  */
+  if (UDIV_PREINV_ALWAYS)
+    {
+      print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
+      return;
+    }
+
+  /* If we've got an assembler version of mpn_divrem_1, then compare against
+     that, not the mpn_divrem_1_div generic C.  */
+  if (HAVE_NATIVE_mpn_divrem_1)
+    {
+      divrem_1 = speed_mpn_divrem_1;
+      divrem_1_name = "mpn_divrem_1";
+    }
+  else
+    {
+      divrem_1 = speed_mpn_divrem_1_div;
+      divrem_1_name = "mpn_divrem_1_div";
+    }
+
+  param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
+  s.size = 200;                     /* generous but not too big */
+  /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
+     since in general that's probably most common, though in fact for a
+     64-bit limb mp_bases[10].big_base is normalized.  */
+  s.r = urandom() & (GMP_NUMB_MASK >> 4);
+  if (s.r == 0) s.r = 123;
+
+  t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
+  t2 = tuneup_measure (divrem_1, &param, &s);
+  if (t1 == -1.0 || t2 == -1.0)
+    {
+      printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
+              divrem_1_name, (long) s.size);
+      abort ();
+    }
+  if (option_trace >= 1)
+    printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
+            (long) s.size, t1, divrem_1_name, t2);
+
+  print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
+}
+
+
+
+void
+tune_divrem_2 (void)
+{
+  static struct param_t  param;
+
+  /* No support for tuning native assembler code, do that by hand and put
+     the results in the .asm file, and there's no need for such thresholds
+     to appear in gmp-mparam.h.  */
+  if (HAVE_NATIVE_mpn_divrem_2)
+    return;
+
+  if (GMP_NAIL_BITS != 0)
+    {
+      print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
+                           "no preinv with nails");
+      return;
+    }
+
+  if (UDIV_PREINV_ALWAYS)
+    {
+      print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
+      return;
+    }
+
+  /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
+     a bit out for the fractional part, but that's too bad, the integer part
+     is more important.
+
+     min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
+     code space if plain division is better only at size==2 or size==3. */
+  param.name = "DIVREM_2_THRESHOLD";
+  param.check_size = 256;
+  param.min_size = 4;
+  param.min_is_always = 1;
+  param.size_extra = 2;      /* does qsize==nsize-2 divisions */
+  param.stop_factor = 2.0;
+
+  s.r = randlimb_norm ();
+  param.function = speed_mpn_divrem_2;
+  one (&divrem_2_threshold, &param);
+}
+
+void
+tune_div_qr_2 (void)
+{
+  static struct param_t  param;
+  param.name = "DIV_QR_2_PI2_THRESHOLD";
+  param.function = speed_mpn_div_qr_2n;
+  param.check_size = 500;
+  param.min_size = 4;
+  one (&div_qr_2_pi2_threshold, &param);
+}
+
+/* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
+   tune for that.  Its speed can differ on odd or even divisor, so take an
+   average threshold for the two.
+
+   mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
+   might not vary that way, but don't test this since high<divisor isn't
+   expected to occur often with small divisors.  */
+
+void
+tune_divexact_1 (void)
+{
+  static struct param_t  param;
+  mp_size_t  thresh[2], average;
+  int        low, i;
+
+  /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
+     full mpn_divrem_1.  */
+  if (HAVE_NATIVE_mpn_divexact_1)
+    {
+      print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
+      return;
+    }
+
+  ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
+
+  param.name = "DIVEXACT_1_THRESHOLD";
+  param.data_high = DATA_HIGH_GE_R;
+  param.check_size = 256;
+  param.min_size = 2;
+  param.stop_factor = 1.5;
+  param.function  = tuned_speed_mpn_divrem_1;
+  param.function2 = speed_mpn_divexact_1;
+  param.noprint = 1;
+
+  print_define_start (param.name);
+
+  for (low = 0; low <= 1; low++)
+    {
+      s.r = randlimb_half();
+      if (low == 0)
+        s.r |= 1;
+      else
+        s.r &= ~CNST_LIMB(7);
+
+      one (&thresh[low], &param);
+      if (option_trace)
+        printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
+
+      if (thresh[low] == MP_SIZE_T_MAX)
+        {
+          average = MP_SIZE_T_MAX;
+          goto divexact_1_done;
+        }
+    }
+
+  if (option_trace)
+    {
+      printf ("average of:");
+      for (i = 0; i < numberof(thresh); i++)
+        printf (" %ld", (long) thresh[i]);
+      printf ("\n");
+    }
+
+  average = 0;
+  for (i = 0; i < numberof(thresh); i++)
+    average += thresh[i];
+  average /= numberof(thresh);
+
+  /* If divexact turns out to be better as early as 3 limbs, then use it
+     always, so as to reduce code size and conditional jumps.  */
+  if (average <= 3)
+    average = 0;
+
+ divexact_1_done:
+  print_define_end (param.name, average);
+}
+
+
+/* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
+   same as mpn_mod_1, but this might not be true of an assembler
+   implementation.  The threshold used is an average based on data where a
+   divide can be skipped and where it can't.
+
+   If modexact turns out to be better as early as 3 limbs, then use it
+   always, so as to reduce code size and conditional jumps.  */
+
+void
+tune_modexact_1_odd (void)
+{
+  static struct param_t  param;
+  mp_size_t  thresh_lt, thresh_ge, average;
+
+#if 0
+  /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
+     of a full mpn_mod_1.  */
+  if (HAVE_NATIVE_mpn_modexact_1_odd)
+    {
+      print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
+      return;
+    }
+#endif
+
+  param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
+  param.check_size = 256;
+  param.min_size = 2;
+  param.stop_factor = 1.5;
+  param.function  = speed_mpn_modexact_1c_odd;
+  param.function2 = speed_mpn_mod_1_tune;
+  param.noprint = 1;
+  s.r = randlimb_half () | 1;
+
+  print_define_start (param.name);
+
+  param.data_high = DATA_HIGH_LT_R;
+  one (&thresh_lt, &param);
+  if (option_trace)
+    printf ("lt thresh %ld\n", (long) thresh_lt);
+
+  average = thresh_lt;
+  if (thresh_lt != MP_SIZE_T_MAX)
+    {
+      param.data_high = DATA_HIGH_GE_R;
+      one (&thresh_ge, &param);
+      if (option_trace)
+        printf ("ge thresh %ld\n", (long) thresh_ge);
+
+      if (thresh_ge != MP_SIZE_T_MAX)
+        {
+          average = (thresh_ge + thresh_lt) / 2;
+          if (thresh_ge <= 3)
+            average = 0;
+        }
+    }
+
+  print_define_end (param.name, average);
+}
+
+
+void
+tune_jacobi_base (void)
+{
+  static struct param_t  param;
+  speed_function_t f[4] =
+    {
+     speed_mpn_jacobi_base_1,
+     speed_mpn_jacobi_base_2,
+     speed_mpn_jacobi_base_3,
+     speed_mpn_jacobi_base_4,
+    };
+
+  s.size = GMP_LIMB_BITS * 3 / 4;
+
+  one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", &param);
+}
+
+
+void
+tune_get_str (void)
+{
+  /* Tune for decimal, it being most common.  Some rough testing suggests
+     other bases are different, but not by very much.  */
+  s.r = 10;
+  {
+    static struct param_t  param;
+    GET_STR_PRECOMPUTE_THRESHOLD = 0;
+    param.name = "GET_STR_DC_THRESHOLD";
+    param.function = speed_mpn_get_str;
+    param.min_size = 4;
+    param.max_size = GET_STR_THRESHOLD_LIMIT;
+    one (&get_str_dc_threshold, &param);
+  }
+  {
+    static struct param_t  param;
+    param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
+    param.function = speed_mpn_get_str;
+    param.min_size = GET_STR_DC_THRESHOLD;
+    param.max_size = GET_STR_THRESHOLD_LIMIT;
+    one (&get_str_precompute_threshold, &param);
+  }
+}
+
+
+double
+speed_mpn_pre_set_str (struct speed_params *s)
+{
+  unsigned char *str;
+  mp_ptr     wp;
+  mp_size_t  wn;
+  unsigned   i;
+  int        base;
+  double     t;
+  mp_ptr powtab_mem, tp;
+  powers_t powtab[GMP_LIMB_BITS];
+  mp_size_t un;
+  int chars_per_limb;
+  TMP_DECL;
+
+  SPEED_RESTRICT_COND (s->size >= 1);
+
+  base = s->r == 0 ? 10 : s->r;
+  SPEED_RESTRICT_COND (base >= 2 && base <= 256);
+
+  TMP_MARK;
+
+  str = (unsigned char *) TMP_ALLOC (s->size);
+  for (i = 0; i < s->size; i++)
+    str[i] = s->xp[i] % base;
+
+  LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
+  SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
+
+  /* use this during development to check wn is big enough */
+  /*
+  ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
+  */
+
+  speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES);
+  speed_operand_dst (s, wp, wn);
+  speed_cache_fill (s);
+
+  chars_per_limb = mp_bases[base].chars_per_limb;
+  un = s->size / chars_per_limb + 1;
+  powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un));
+  size_t n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base);
+  powers_t *pt = powtab + n_pows;
+  tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    {
+      mpn_pre_set_str (wp, str, s->size, pt, tp);
+    }
+  while (--i != 0);
+  t = speed_endtime ();
+
+  TMP_FREE;
+  return t;
+}
+
+void
+tune_set_str (void)
+{
+  s.r = 10;  /* decimal */
+  {
+    static struct param_t  param;
+    SET_STR_PRECOMPUTE_THRESHOLD = 0;
+    param.step_factor = 0.01;
+    param.name = "SET_STR_DC_THRESHOLD";
+    param.function = speed_mpn_pre_set_str;
+    param.min_size = 100;
+    param.max_size = 50000;
+    one (&set_str_dc_threshold, &param);
+  }
+  {
+    static struct param_t  param;
+    param.step_factor = 0.02;
+    param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
+    param.function = speed_mpn_set_str;
+    param.min_size = SET_STR_DC_THRESHOLD;
+    param.max_size = 100000;
+    one (&set_str_precompute_threshold, &param);
+  }
+}
+
+
+void
+tune_fft_mul (void)
+{
+  static struct fft_param_t  param;
+
+  if (option_fft_max_size == 0)
+    return;
+
+  param.table_name          = "MUL_FFT_TABLE3";
+  param.threshold_name      = "MUL_FFT_THRESHOLD";
+  param.p_threshold         = &mul_fft_threshold;
+  param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
+  param.p_modf_threshold    = &mul_fft_modf_threshold;
+  param.first_size          = MUL_TOOM33_THRESHOLD / 2;
+  param.max_size            = option_fft_max_size;
+  param.function            = speed_mpn_fft_mul;
+  param.mul_modf_function   = speed_mpn_mul_fft;
+  param.mul_function        = speed_mpn_mul_n;
+  param.sqr = 0;
+  fft (&param);
+}
+
+
+void
+tune_fft_sqr (void)
+{
+  static struct fft_param_t  param;
+
+  if (option_fft_max_size == 0)
+    return;
+
+  param.table_name          = "SQR_FFT_TABLE3";
+  param.threshold_name      = "SQR_FFT_THRESHOLD";
+  param.p_threshold         = &sqr_fft_threshold;
+  param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
+  param.p_modf_threshold    = &sqr_fft_modf_threshold;
+  param.first_size          = SQR_TOOM3_THRESHOLD / 2;
+  param.max_size            = option_fft_max_size;
+  param.function            = speed_mpn_fft_sqr;
+  param.mul_modf_function   = speed_mpn_mul_fft_sqr;
+  param.mul_function        = speed_mpn_sqr;
+  param.sqr = 1;
+  fft (&param);
+}
+
+void
+tune_fac_ui (void)
+{
+  static struct param_t  param;
+
+  param.function = speed_mpz_fac_ui_tune;
+
+  param.name = "FAC_DSC_THRESHOLD";
+  param.min_size = 70;
+  param.max_size = FAC_DSC_THRESHOLD_LIMIT;
+  one (&fac_dsc_threshold, &param);
+
+  param.name = "FAC_ODD_THRESHOLD";
+  param.min_size = 22;
+  param.stop_factor = 1.7;
+  param.min_is_always = 1;
+  one (&fac_odd_threshold, &param);
+}
+
+void
+all (void)
+{
+  time_t  start_time, end_time;
+  TMP_DECL;
+
+  TMP_MARK;
+  SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
+  SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
+
+  mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
+  mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
+
+  fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
+
+  speed_time_init ();
+  fprintf (stderr, "Using: %s\n", speed_time_string);
+
+  fprintf (stderr, "speed_precision %d", speed_precision);
+  if (speed_unittime == 1.0)
+    fprintf (stderr, ", speed_unittime 1 cycle");
+  else
+    fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
+  if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
+    fprintf (stderr, ", CPU freq unknown\n");
+  else
+    fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
+
+  fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
+           DEFAULT_MAX_SIZE, (long) option_fft_max_size);
+  fprintf (stderr, "\n");
+
+  time (&start_time);
+  {
+    struct tm  *tp;
+    tp = localtime (&start_time);
+    printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
+            tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
+
+#ifdef __GNUC__
+    /* gcc sub-minor version doesn't seem to come through as a define */
+    printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
+#define PRINTED_COMPILER
+#endif
+#if defined (__SUNPRO_C)
+    printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
+#define PRINTED_COMPILER
+#endif
+#if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
+    /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
+    printf ("MIPSpro C %d.%d.%d */\n",
+	    _COMPILER_VERSION / 100,
+	    _COMPILER_VERSION / 10 % 10,
+	    _COMPILER_VERSION % 10);
+#define PRINTED_COMPILER
+#endif
+#if defined (__DECC) && defined (__DECC_VER)
+    printf ("DEC C %d */\n", __DECC_VER);
+#define PRINTED_COMPILER
+#endif
+#if ! defined (PRINTED_COMPILER)
+    printf ("system compiler */\n");
+#endif
+  }
+  printf ("\n");
+
+  tune_divrem_1 ();
+  tune_mod_1 ();
+  tune_preinv_divrem_1 ();
+  tune_div_qr_1 ();
+#if 0
+  tune_divrem_2 ();
+#endif
+  tune_div_qr_2 ();
+  tune_divexact_1 ();
+  tune_modexact_1_odd ();
+  printf("\n");
+
+  relspeed_div_1_vs_mul_1 ();
+  printf("\n");
+
+  tune_mul_n ();
+  printf("\n");
+
+  tune_mul ();
+  printf("\n");
+
+  tune_sqr ();
+  printf("\n");
+
+  tune_mulmid ();
+  printf("\n");
+
+  tune_mulmod_bnm1 ();
+  tune_sqrmod_bnm1 ();
+  printf("\n");
+
+  tune_fft_mul ();
+  printf("\n");
+
+  tune_fft_sqr ();
+  printf ("\n");
+
+  tune_mullo ();
+  tune_sqrlo ();
+  printf("\n");
+
+  tune_dc_div ();
+  tune_dc_bdiv ();
+
+  printf("\n");
+  tune_invertappr ();
+  tune_invert ();
+  printf("\n");
+
+  tune_binvert ();
+  tune_redc ();
+  printf("\n");
+
+  tune_mu_div ();
+  tune_mu_bdiv ();
+  printf("\n");
+
+  tune_powm_sec ();
+  printf("\n");
+
+  tune_get_str ();
+  tune_set_str ();
+  printf("\n");
+
+  tune_fac_ui ();
+  printf("\n");
+
+  tune_matrix22_mul ();
+  tune_hgcd2 ();
+  tune_hgcd ();
+  tune_hgcd_appr ();
+  tune_hgcd_reduce();
+  tune_gcd_dc ();
+  tune_gcdext_dc ();
+  tune_jacobi_base ();
+  printf("\n");
+
+  time (&end_time);
+  printf ("/* Tuneup completed successfully, took %ld seconds */\n",
+          (long) (end_time - start_time));
+
+  TMP_FREE;
+}
+
+
+int
+main (int argc, char *argv[])
+{
+  int  opt;
+
+  /* Unbuffered so if output is redirected to a file it isn't lost if the
+     program is killed part way through.  */
+  setbuf (stdout, NULL);
+  setbuf (stderr, NULL);
+
+  while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
+    {
+      switch (opt) {
+      case 'f':
+        if (optarg[0] == 't')
+          option_fft_trace = 2;
+        else
+          option_fft_max_size = atol (optarg);
+        break;
+      case 'o':
+        speed_option_set (optarg);
+        break;
+      case 'p':
+        speed_precision = atoi (optarg);
+        break;
+      case 't':
+        option_trace++;
+        break;
+      case '?':
+        exit(1);
+      }
+    }
+
+  all ();
+  exit (0);
+}

diff --git a/third_party/gmp/tune/x86_64.asm b/third_party/gmp/tune/x86_64.asm
new file mode 100644
index 0000000..b7ec44c
--- /dev/null
+++ b/third_party/gmp/tune/x86_64.asm

@@ -0,0 +1,55 @@
+dnl  x86 pentium time stamp counter access routine.
+
+dnl  Copyright 1999, 2000, 2003-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C void speed_cyclecounter (unsigned p[2]);
+C
+C Get the pentium rdtsc cycle counter, storing the least significant word in
+C p[0] and the most significant in p[1].
+C
+C cpuid is used to serialize execution.  On big measurements this won't be
+C significant but it may help make small single measurements more accurate.
+
+PROLOGUE(speed_cyclecounter)
+
+	C rdi	p
+
+	movq	%rbx, %r10
+	xorl	%eax, %eax
+	cpuid
+	rdtsc
+	movl	%eax, (%rdi)
+	movl	%edx, 4(%rdi)
+	movq	%r10, %rbx
+	ret
+EPILOGUE()
commit	bb1338cd84d865f1eb7a653969204a06bba8261c	[log] [tgz]
author	Austin Schuh <austin.linux@gmail.com>	Sat Jun 15 19:31:16 2024 -0700
committer	Austin Schuh <austin.linux@gmail.com>	Wed Jun 19 19:49:35 2024 -0700
tree	e291dbf975ebfaebab464c64131d63191b8b0e39
parent	e4a8c6c24f636a763b512a3f87dee2225762d817 [diff]